//===- SLPVectorizer.cpp - A bottom up SLP Vectorizer ---------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass implements the Bottom Up SLP vectorizer. It detects consecutive
// stores that can be put together into vector-stores. Next, it attempts to
// construct vectorizable tree using the use-def chains. If a profitable tree
// was found, the SLP vectorizer performs vectorization on the tree.
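//
// For example (illustrative only), two adjacent scalar stores such as
//   store i32 %a, ptr %p
//   %q = getelementptr inbounds i32, ptr %p, i64 1
//   store i32 %b, ptr %q
// may be rewritten into a single wide store
//   store <2 x i32> %ab, ptr %p
// when the cost model deems it profitable.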
//
// The pass is inspired by the work described in the paper:
//  "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks.
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Vectorize/SLPVectorizer.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/PriorityQueue.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/ADT/SetOperations.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallBitVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/iterator.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/DemandedBits.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/IVDescriptors.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/MemoryLocation.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#ifdef EXPENSIVE_CHECKS
#include "llvm/IR/Verifier.h"
#endif
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/DOTGraphTraits.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/DebugCounter.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/GraphWriter.h"
#include "llvm/Support/InstructionCost.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/InjectTLIMappings.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <iterator>
#include <memory>
#include <optional>
#include <set>
#include <string>
#include <tuple>
#include <utility>

using namespace llvm;
using namespace llvm::PatternMatch;
using namespace slpvectorizer;

#define SV_NAME "slp-vectorizer"
#define DEBUG_TYPE "SLP"

STATISTIC(NumVectorInstructions, "Number of vector instructions generated");

DEBUG_COUNTER(VectorizedGraphs, "slp-vectorized",
              "Controls which SLP graphs should be vectorized.");

static cl::opt<bool>
    RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden,
                        cl::desc("Run the SLP vectorization passes"));

static cl::opt<bool>
    SLPReVec("slp-revec", cl::init(false), cl::Hidden,
             cl::desc("Enable vectorization for wider vector utilization"));

static cl::opt<int>
    SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden,
                     cl::desc("Only vectorize if you gain more than this "
                              "number "));

static cl::opt<bool> SLPSkipEarlyProfitabilityCheck(
    "slp-skip-early-profitability-check", cl::init(false), cl::Hidden,
    cl::desc("When true, SLP vectorizer bypasses profitability checks based on "
             "heuristics and makes vectorization decision via cost modeling."));

static cl::opt<bool>
ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden,
                   cl::desc("Attempt to vectorize horizontal reductions"));

static cl::opt<bool> ShouldStartVectorizeHorAtStore(
    "slp-vectorize-hor-store", cl::init(false), cl::Hidden,
    cl::desc(
        "Attempt to vectorize horizontal reductions feeding into a store"));

static cl::opt<int>
MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden,
    cl::desc("Attempt to vectorize for this register size in bits"));

static cl::opt<unsigned>
MaxVFOption("slp-max-vf", cl::init(0), cl::Hidden,
    cl::desc("Maximum SLP vectorization factor (0=unlimited)"));

/// Limits the size of scheduling regions in a block.
/// It avoids long compile times for _very_ large blocks where vector
/// instructions are spread over a wide range.
/// This limit is way higher than needed by real-world functions.
static cl::opt<int>
ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden,
    cl::desc("Limit the size of the SLP scheduling region per block"));

static cl::opt<int> MinVectorRegSizeOption(
    "slp-min-reg-size", cl::init(128), cl::Hidden,
    cl::desc("Attempt to vectorize for this register size in bits"));

static cl::opt<unsigned> RecursionMaxDepth(
    "slp-recursion-max-depth", cl::init(12), cl::Hidden,
    cl::desc("Limit the recursion depth when building a vectorizable tree"));

static cl::opt<unsigned> MinTreeSize(
    "slp-min-tree-size", cl::init(3), cl::Hidden,
    cl::desc("Only vectorize small trees if they are fully vectorizable"));

// The maximum depth that the look-ahead score heuristic will explore.
// The higher this value, the higher the compilation time overhead.
static cl::opt<int> LookAheadMaxDepth(
    "slp-max-look-ahead-depth", cl::init(2), cl::Hidden,
    cl::desc("The maximum look-ahead depth for operand reordering scores"));

// The maximum depth that the look-ahead score heuristic will explore
// when probing among candidates for vectorization tree roots.
// The higher this value, the higher the compilation time overhead; but unlike
// the similar limit for operand reordering, this is used less frequently, so
// the impact of a higher value is less noticeable.
static cl::opt<int> RootLookAheadMaxDepth(
    "slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden,
    cl::desc("The maximum look-ahead depth for searching best rooting option"));

static cl::opt<unsigned> MinProfitableStridedLoads(
    "slp-min-strided-loads", cl::init(2), cl::Hidden,
    cl::desc("The minimum number of loads required to consider the group "
             "strided, if the stride is > 1 or is a runtime value"));

static cl::opt<unsigned> MaxProfitableLoadStride(
    "slp-max-stride", cl::init(8), cl::Hidden,
    cl::desc("The maximum stride considered to be profitable."));

static cl::opt<bool>
    ViewSLPTree("view-slp-tree", cl::Hidden,
                cl::desc("Display the SLP trees with Graphviz"));

static cl::opt<bool> VectorizeNonPowerOf2(
    "slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden,
    cl::desc("Try to vectorize with non-power-of-2 number of elements."));

// Limit the number of alias checks. The limit is chosen so that
// it has no negative effect on the LLVM benchmarks.
static const unsigned AliasedCheckLimit = 10;

// Limit on the number of uses for potentially transformed instructions/values,
// used in checks to avoid compile-time explosion.
static constexpr int UsesLimit = 8;

// Another limit for the alias checks: The maximum distance between load/store
// instructions where alias checks are done.
// This limit is useful for very large basic blocks.
static const unsigned MaxMemDepDistance = 160;

/// If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling
/// regions to be handled.
static const int MinScheduleRegionSize = 16;

/// Maximum allowed number of operands in the PHI nodes.
static const unsigned MaxPHINumOperands = 128;

/// Predicate for the element types that the SLP vectorizer supports.
///
/// The most important things to filter here are types which are invalid in
/// LLVM vectors. We also filter target-specific types which have absolutely no
/// meaningful vectorization path, such as x86_fp80 and ppc_f128. This just
/// avoids spending time checking the cost model and realizing that they will
/// inevitably be scalarized.
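///
/// A minimal sketch of the filter (illustrative; the actual body is elided in
/// this listing):
/// \code
/// return VectorType::isValidElementType(Ty) && !Ty->isX86_FP80Ty() &&
///        !Ty->isPPC_FP128Ty();
/// \endcode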
static bool isValidElementType(Type *Ty) {}

/// Returns the type of the given value/instruction \p V. If it is a store,
/// returns the type of its value operand; for a Cmp, the type of the compare
/// operands; and for an insertelement, the type of the inserted operand.
/// Otherwise, just the type of the value is returned.
template <typename T> static Type *getValueType(T *V) {}

/// \returns the number of elements for Ty.
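/// A minimal sketch (illustrative; the actual body is elided in this listing):
/// \code
/// if (auto *VecTy = dyn_cast<FixedVectorType>(Ty))
///   return VecTy->getNumElements();
/// return 1;
/// \endcode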
static unsigned getNumElements(Type *Ty) {}

/// \returns the vector type of ScalarTy based on vectorization factor.
static FixedVectorType *getWidenedType(Type *ScalarTy, unsigned VF) {}

/// Returns the number of elements of the given type \p Ty, not less than \p
/// Sz, which forms a type that \p TTI splits into whole vector types during
/// legalization.
static unsigned getFullVectorNumberOfElements(const TargetTransformInfo &TTI,
                                              Type *Ty, unsigned Sz) {}

/// Returns the number of elements of the given type \p Ty, not greater than \p
/// Sz, which forms a type that \p TTI splits into whole vector types during
/// legalization.
static unsigned
getFloorFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty,
                                   unsigned Sz) {}

static void transformScalarShuffleIndiciesToVector(unsigned VecTyNumElements,
                                                   SmallVectorImpl<int> &Mask) {}

/// \returns the number of groups of shufflevectors.
/// A group has the following features:
/// 1. All values in a group are shufflevectors.
/// 2. The mask of each shufflevector is an isExtractSubvectorMask.
/// 3. Together, the masks of the shufflevectors use all elements of the
///    source.
/// E.g., this is 1 group (%0):
/// %1 = shufflevector <16 x i8> %0, <16 x i8> poison,
///    <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
/// %2 = shufflevector <16 x i8> %0, <16 x i8> poison,
///    <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
/// and this is 2 groups (%3 and %4):
/// %5 = shufflevector <8 x i16> %3, <8 x i16> poison,
///    <4 x i32> <i32 0, i32 1, i32 2, i32 3>
/// %6 = shufflevector <8 x i16> %3, <8 x i16> poison,
///    <4 x i32> <i32 4, i32 5, i32 6, i32 7>
/// %7 = shufflevector <8 x i16> %4, <8 x i16> poison,
///    <4 x i32> <i32 0, i32 1, i32 2, i32 3>
/// %8 = shufflevector <8 x i16> %4, <8 x i16> poison,
///    <4 x i32> <i32 4, i32 5, i32 6, i32 7>
/// and this is 0 groups:
/// %12 = shufflevector <8 x i16> %10, <8 x i16> poison,
///     <4 x i32> <i32 0, i32 1, i32 2, i32 3>
/// %13 = shufflevector <8 x i16> %11, <8 x i16> poison,
///     <4 x i32> <i32 0, i32 1, i32 2, i32 3>
static unsigned getShufflevectorNumGroups(ArrayRef<Value *> VL) {}

/// \returns a shufflevector mask which is used to vectorize shufflevectors
/// e.g.,
/// %5 = shufflevector <8 x i16> %3, <8 x i16> poison,
///    <4 x i32> <i32 0, i32 1, i32 2, i32 3>
/// %6 = shufflevector <8 x i16> %3, <8 x i16> poison,
///    <4 x i32> <i32 4, i32 5, i32 6, i32 7>
/// %7 = shufflevector <8 x i16> %4, <8 x i16> poison,
///    <4 x i32> <i32 0, i32 1, i32 2, i32 3>
/// %8 = shufflevector <8 x i16> %4, <8 x i16> poison,
///    <4 x i32> <i32 4, i32 5, i32 6, i32 7>
/// the result is
/// <0, 1, 2, 3, 12, 13, 14, 15, 16, 17, 18, 19, 28, 29, 30, 31>
static SmallVector<int> calculateShufflevectorMask(ArrayRef<Value *> VL) {}

/// \returns True if the value is a constant (but not globals/constant
/// expressions).
static bool isConstant(Value *V) {}

/// Checks if \p V is one of vector-like instructions, i.e. undef,
/// insertelement/extractelement with constant indices for fixed vector type or
/// extractvalue instruction.
static bool isVectorLikeInstWithConstOps(Value *V) {}

/// Returns the power-of-2 number of elements in a single register (part),
/// given the total number of elements \p Size and the number of registers
/// (parts) \p NumParts.
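/// A minimal sketch, assuming the usual divideCeil/bit_ceil helpers
/// (illustrative; the actual body is elided in this listing):
/// \code
/// return std::min<unsigned>(Size, bit_ceil(divideCeil(Size, NumParts)));
/// \endcode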
static unsigned getPartNumElems(unsigned Size, unsigned NumParts) {}

/// Returns the correct number of remaining elements, given the total number \p
/// Size, the (power-of-2) number of elements in a single register \p
/// PartNumElems, and the current register (part) \p Part.
static unsigned getNumElems(unsigned Size, unsigned PartNumElems,
                            unsigned Part) {}

#if !defined(NDEBUG)
/// Print a short descriptor of the instruction bundle suitable for debug output.
static std::string shortBundleName(ArrayRef<Value *> VL, int Idx = -1) {
  std::string Result;
  raw_string_ostream OS(Result);
  if (Idx >= 0)
    OS << "Idx: " << Idx << ", ";
  OS << "n=" << VL.size() << " [" << *VL.front() << ", ..]";
  return Result;
}
#endif

/// \returns true if all of the instructions in \p VL are in the same block or
/// false otherwise.
static bool allSameBlock(ArrayRef<Value *> VL) {}

/// \returns True if all of the values in \p VL are constants (but not
/// globals/constant expressions).
static bool allConstant(ArrayRef<Value *> VL) {}

/// \returns True if all of the values in \p VL are identical or some of them
/// are UndefValue.
static bool isSplat(ArrayRef<Value *> VL) {}

/// \returns True if \p I is commutative, handles CmpInst and BinaryOperator.
static bool isCommutative(Instruction *I) {}

template <typename T>
static std::optional<unsigned> getInsertExtractIndex(const Value *Inst,
                                                     unsigned Offset) {}

/// \returns inserting or extracting index of InsertElement, ExtractElement or
/// InsertValue instruction, using Offset as base offset for index.
/// \returns std::nullopt if the index is not an immediate.
static std::optional<unsigned> getElementIndex(const Value *Inst,
                                               unsigned Offset = 0) {}

namespace {
/// Specifies the way the mask should be analyzed for undefs/poisonous elements
/// in the shuffle mask.
enum class UseMask {};
} // namespace

/// Prepares a use bitset for the given mask either for the first argument or
/// for the second.
static SmallBitVector buildUseMask(int VF, ArrayRef<int> Mask,
                                   UseMask MaskArg) {}

/// Checks if the given value is actually an undefined constant vector.
/// Also, if the \p UseMask is not empty, tries to check if the non-masked
/// elements actually mask the insertelement buildvector, if any.
template <bool IsPoisonOnly = false>
static SmallBitVector isUndefVector(const Value *V,
                                    const SmallBitVector &UseMask = {}) {}

/// Checks if the vector of instructions can be represented as a shuffle, like:
/// %x0 = extractelement <4 x i8> %x, i32 0
/// %x3 = extractelement <4 x i8> %x, i32 3
/// %y1 = extractelement <4 x i8> %y, i32 1
/// %y2 = extractelement <4 x i8> %y, i32 2
/// %x0x0 = mul i8 %x0, %x0
/// %x3x3 = mul i8 %x3, %x3
/// %y1y1 = mul i8 %y1, %y1
/// %y2y2 = mul i8 %y2, %y2
/// %ins1 = insertelement <4 x i8> poison, i8 %x0x0, i32 0
/// %ins2 = insertelement <4 x i8> %ins1, i8 %x3x3, i32 1
/// %ins3 = insertelement <4 x i8> %ins2, i8 %y1y1, i32 2
/// %ins4 = insertelement <4 x i8> %ins3, i8 %y2y2, i32 3
/// ret <4 x i8> %ins4
/// can be transformed into:
/// %1 = shufflevector <4 x i8> %x, <4 x i8> %y, <4 x i32> <i32 0, i32 3, i32 5,
///                                                         i32 6>
/// %2 = mul <4 x i8> %1, %1
/// ret <4 x i8> %2
/// \p Mask will contain the shuffle mask equivalent to the extracted elements.
/// TODO: Can we split off and reuse the shuffle mask detection from
/// ShuffleVectorInst/getShuffleCost?
static std::optional<TargetTransformInfo::ShuffleKind>
isFixedVectorShuffle(ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) {}

/// \returns True if Extract{Value,Element} instruction extracts element Idx.
static std::optional<unsigned> getExtractIndex(Instruction *E) {}

namespace {

/// Main data required for vectorization of instructions.
struct InstructionsState {};

} // end anonymous namespace

/// \returns true if \p Opcode is allowed as part of the main/alternate
/// instruction for SLP vectorization.
///
/// Example of unsupported opcode is SDIV that can potentially cause UB if the
/// "shuffled out" lane would result in division by zero.
static bool isValidForAlternation(unsigned Opcode) {}

static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
                                       const TargetLibraryInfo &TLI);

/// Checks if the provided operands of 2 cmp instructions are compatible, i.e.
/// compatible instructions or constants, or just some other regular values.
static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0,
                                Value *Op1, const TargetLibraryInfo &TLI) {}

/// \returns true if a compare instruction \p CI has a similar "look" and the
/// same predicate as \p BaseCI, "as is" or with its operands and predicate
/// swapped, false otherwise.
static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI,
                               const TargetLibraryInfo &TLI) {}

/// \returns analysis of the instructions in \p VL described in
/// InstructionsState: the opcode with which, we suppose, the whole list
/// could be vectorized, even if its structure is diverse.
static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
                                       const TargetLibraryInfo &TLI) {}

/// \returns true if all of the values in \p VL have the same type or false
/// otherwise.
static bool allSameType(ArrayRef<Value *> VL) {}

/// \returns True if an in-tree use also needs an extract. This refers to a
/// possible scalar operand in a vectorized instruction.
static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
                                        TargetLibraryInfo *TLI) {}

/// \returns the AA location that is being accessed by the instruction.
static MemoryLocation getLocation(Instruction *I) {}

/// \returns True if the instruction is not a volatile or atomic load/store.
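/// A minimal sketch (illustrative; the actual body is elided in this listing):
/// \code
/// if (auto *LI = dyn_cast<LoadInst>(I))
///   return LI->isSimple();
/// if (auto *SI = dyn_cast<StoreInst>(I))
///   return SI->isSimple();
/// return true;
/// \endcode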
static bool isSimple(Instruction *I) {}

/// Shuffles \p Mask in accordance with the given \p SubMask.
/// \param ExtendingManyInputs Supports reshuffling of the mask with not only
/// one but two input vectors.
static void addMask(SmallVectorImpl<int> &Mask, ArrayRef<int> SubMask,
                    bool ExtendingManyInputs = false) {}

/// \p Order may have elements assigned the special value (its size), which is
/// out of bounds. Such indices only appear in places that correspond to undef
/// values (see canReuseExtract for details) and are used to prevent undef
/// values from affecting operand ordering.
/// The first loop below simply finds all unused indices and then the next loop
/// nest assigns these indices to the positions of undef values.
/// In the example below, Order has two undef positions, which are assigned the
/// values 3 and 7 respectively:
/// before:  6 9 5 4 9 2 1 0
/// after:   6 3 5 4 7 2 1 0
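/// A minimal sketch of the fixup (illustrative; the actual body is elided in
/// this listing):
/// \code
/// const unsigned Sz = Order.size();
/// SmallBitVector Used(Sz);
/// for (unsigned Idx : Order)
///   if (Idx < Sz)
///     Used.set(Idx);
/// unsigned NextUnused = 0;
/// for (unsigned &Idx : Order)
///   if (Idx >= Sz) {
///     while (Used.test(NextUnused))
///       ++NextUnused;
///     Idx = NextUnused++; // Hand out the next unused index.
///   }
/// \endcode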
static void fixupOrderingIndices(MutableArrayRef<unsigned> Order) {}

/// \returns a bitset for selecting opcodes. false for Opcode0 and true for
/// Opcode1.
static SmallBitVector getAltInstrMask(ArrayRef<Value *> VL, unsigned Opcode0,
                                      unsigned Opcode1) {}

namespace llvm {

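/// Computes the inverse permutation mask \p Mask for the given \p Indices,
/// i.e. Mask[Indices[I]] == I for every I. A minimal sketch (illustrative;
/// the actual body is elided in this listing):
/// \code
/// Mask.assign(Indices.size(), PoisonMaskElem);
/// for (unsigned I = 0, E = Indices.size(); I < E; ++I)
///   Mask[Indices[I]] = I;
/// \endcode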
static void inversePermutation(ArrayRef<unsigned> Indices,
                               SmallVectorImpl<int> &Mask) {}

/// Reorders the list of scalars in accordance with the given \p Mask.
static void reorderScalars(SmallVectorImpl<Value *> &Scalars,
                           ArrayRef<int> Mask) {}

/// Checks if the provided value does not require scheduling. It does not
/// require scheduling if this is not an instruction or it is an instruction
/// that does not read/write memory and all operands are either not instructions
/// or phi nodes or instructions from different blocks.
static bool areAllOperandsNonInsts(Value *V) {}

/// Checks if the provided value does not require scheduling. It does not
/// require scheduling if this is not an instruction or it is an instruction
/// that does not read/write memory and all users are phi nodes or instructions
/// from different blocks.
static bool isUsedOutsideBlock(Value *V) {}

/// Checks if the specified value does not require scheduling. It does not
/// require scheduling if all operands and all users do not need to be scheduled
/// in the current basic block.
static bool doesNotNeedToBeScheduled(Value *V) {}

/// Checks if the specified array of instructions does not require scheduling.
/// This is the case if, for each instruction, either its operands do not
/// require scheduling or its users do not require scheduling, since they are
/// phis or live in other basic blocks.
static bool doesNotNeedToSchedule(ArrayRef<Value *> VL) {}

/// Returns true if the widened type of \p Sz elements of type \p Ty represents
/// a full vector type, i.e. adding an extra element results in extra parts
/// upon type legalization.
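/// A rough sketch of the check, assuming TTI::getNumberOfParts reflects type
/// legalization (illustrative; details of the elided body may differ):
/// \code
/// if (has_single_bit(Sz))
///   return true;
/// unsigned NumParts = TTI.getNumberOfParts(getWidenedType(Ty, Sz));
/// return NumParts > 0 && NumParts < Sz && Sz % NumParts == 0 &&
///        has_single_bit(Sz / NumParts);
/// \endcode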
static bool hasFullVectorsOrPowerOf2(const TargetTransformInfo &TTI, Type *Ty,
                                     unsigned Sz) {}

namespace slpvectorizer {

/// Bottom Up SLP Vectorizer.
class BoUpSLP {};

} // end namespace slpvectorizer

template <> struct GraphTraits<BoUpSLP *> {};

template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits {};

} // end namespace llvm

BoUpSLP::~BoUpSLP() {}

/// Reorders the given \p Reuses mask according to the given \p Mask. \p Reuses
/// contains the original mask for the scalars reused in the node. The
/// procedure transforms this mask in accordance with the given \p Mask.
static void reorderReuses(SmallVectorImpl<int> &Reuses, ArrayRef<int> Mask) {}

/// Reorders the given \p Order according to the given \p Mask. \p Order is the
/// original order of the scalars. The procedure transforms the provided order
/// in accordance with the given \p Mask. If the resulting \p Order is just an
/// identity order, \p Order is cleared.
static void reorderOrder(SmallVectorImpl<unsigned> &Order, ArrayRef<int> Mask,
                         bool BottomOrder = false) {}

std::optional<BoUpSLP::OrdersType>
BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) {}

static bool arePointersCompatible(Value *Ptr1, Value *Ptr2,
                                  const TargetLibraryInfo &TLI,
                                  bool CompareOpcodes = true) {}

/// Calculates minimal alignment as a common alignment.
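/// A minimal sketch, assuming \p T is LoadInst or StoreInst (illustrative;
/// the actual body is elided in this listing):
/// \code
/// Align CommonAlignment = cast<T>(VL.front())->getAlign();
/// for (Value *V : drop_begin(VL))
///   CommonAlignment = std::min(CommonAlignment, cast<T>(V)->getAlign());
/// return CommonAlignment;
/// \endcode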
template <typename T>
static Align computeCommonAlignment(ArrayRef<Value *> VL) {}

/// Check if \p Order represents reverse order.
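/// A minimal sketch, assuming out-of-bounds entries (== Order.size()) stand
/// for undefs and are tolerated (illustrative; the actual body is elided):
/// \code
/// unsigned Sz = Order.size();
/// return all_of(enumerate(Order), [&](const auto &Pair) {
///   return Pair.value() == Sz || Sz - Pair.index() - 1 == Pair.value();
/// });
/// \endcode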
static bool isReverseOrder(ArrayRef<unsigned> Order) {}

/// Checks if the provided list of pointers \p PointerOps represents strided
/// pointers for type \p ElemTy. If they do not, std::nullopt is returned.
/// Otherwise, if \p Inst is not specified, a just-initialized optional value
/// is returned to show that the pointers represent strided pointers. If \p
/// Inst is specified, the runtime stride is materialized before the given \p
/// Inst.
/// \returns std::nullopt if the pointers do not have a runtime stride;
/// otherwise nullptr or the actual stride value.
static std::optional<Value *>
calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy,
                  const DataLayout &DL, ScalarEvolution &SE,
                  SmallVectorImpl<unsigned> &SortedIndices,
                  Instruction *Inst = nullptr) {}

static std::pair<InstructionCost, InstructionCost>
getGEPCosts(const TargetTransformInfo &TTI, ArrayRef<Value *> Ptrs,
            Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind,
            Type *ScalarTy, VectorType *VecTy);

/// Returns the cost of the shuffle instructions with the given \p Kind, vector
/// type \p Tp and optional \p Mask. Adds SLP-specific cost estimation for the
/// insert subvector pattern.
static InstructionCost
getShuffleCost(const TargetTransformInfo &TTI, TTI::ShuffleKind Kind,
               VectorType *Tp, ArrayRef<int> Mask = {}) {}

BoUpSLP::LoadsState
BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
                           SmallVectorImpl<unsigned> &Order,
                           SmallVectorImpl<Value *> &PointerOps,
                           unsigned *BestVF, bool TryRecursiveCheck) const {}

static bool clusterSortPtrAccesses(ArrayRef<Value *> VL, Type *ElemTy,
                                   const DataLayout &DL, ScalarEvolution &SE,
                                   SmallVectorImpl<unsigned> &SortedIndices) {}

std::optional<BoUpSLP::OrdersType>
BoUpSLP::findPartiallyOrderedLoads(const BoUpSLP::TreeEntry &TE) {}

/// Check if two insertelement instructions are from the same buildvector.
static bool areTwoInsertFromSameBuildVector(
    InsertElementInst *VU, InsertElementInst *V,
    function_ref<Value *(InsertElementInst *)> GetBaseOperand) {}

std::optional<BoUpSLP::OrdersType>
BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) {}

/// Checks if the given mask is a "clustered" mask with the same clusters of
/// size \p Sz, which are not identity submasks.
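/// E.g., for \p Sz == 4, the mask <1, 0, 3, 2, 1, 0, 3, 2> is a repeated
/// non-identity clustered mask (the same cluster <1, 0, 3, 2> repeated), while
/// <0, 1, 2, 3, 0, 1, 2, 3> is not, because its clusters are identity
/// submasks.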
static bool isRepeatedNonIdentityClusteredMask(ArrayRef<int> Mask,
                                               unsigned Sz) {}

void BoUpSLP::reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const {}

static void combineOrders(MutableArrayRef<unsigned> Order,
                          ArrayRef<unsigned> SecondaryOrder) {}

void BoUpSLP::reorderTopToBottom() {}

bool BoUpSLP::canReorderOperands(
    TreeEntry *UserTE, SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
    ArrayRef<TreeEntry *> ReorderableGathers,
    SmallVectorImpl<TreeEntry *> &GatherOps) {}

void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {}

Instruction *BoUpSLP::getRootEntryInstruction(const TreeEntry &Entry) const {}

void BoUpSLP::buildExternalUses(
    const ExtraValueToDebugLocsMap &ExternallyUsedValues) {}

DenseMap<Value *, SmallVector<StoreInst *>>
BoUpSLP::collectUserStores(const BoUpSLP::TreeEntry *TE) const {}

bool BoUpSLP::canFormVector(ArrayRef<StoreInst *> StoresVec,
                            OrdersType &ReorderIndices) const {}

#ifndef NDEBUG
LLVM_DUMP_METHOD static void dumpOrder(const BoUpSLP::OrdersType &Order) {
  for (unsigned Idx : Order)
    dbgs() << Idx << ", ";
  dbgs() << "\n";
}
#endif

SmallVector<BoUpSLP::OrdersType, 1>
BoUpSLP::findExternalStoreUsersReorderIndices(TreeEntry *TE) const {}

void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
                        const SmallDenseSet<Value *> &UserIgnoreLst) {}

void BoUpSLP::buildTree(ArrayRef<Value *> Roots) {}

/// Tries to find a subvector of loads and builds a new vector of only loads if
/// that can be profitable.
static void gatherPossiblyVectorizableLoads(
    const BoUpSLP &R, ArrayRef<Value *> VL, const DataLayout &DL,
    ScalarEvolution &SE, const TargetTransformInfo &TTI,
    SmallVectorImpl<SmallVector<std::pair<LoadInst *, int>>> &GatheredLoads,
    bool AddNew = true) {}

void BoUpSLP::tryToVectorizeGatheredLoads(
    ArrayRef<SmallVector<std::pair<LoadInst *, int>>> GatheredLoads) {}

/// \return true if the specified list of values has only one instruction that
/// requires scheduling, false otherwise.
#ifndef NDEBUG
static bool needToScheduleSingleInstruction(ArrayRef<Value *> VL) {
  Value *NeedsScheduling = nullptr;
  for (Value *V : VL) {
    if (doesNotNeedToBeScheduled(V))
      continue;
    if (!NeedsScheduling) {
      NeedsScheduling = V;
      continue;
    }
    return false;
  }
  return NeedsScheduling;
}
#endif

/// Generates a key/subkey pair for the given value, to provide effective
/// sorting of the values and better detection of vectorizable value sequences.
/// The keys/subkeys can be used for better sorting of the values themselves
/// (keys) and within value subgroups (subkeys).
static std::pair<size_t, size_t> generateKeySubkey(
    Value *V, const TargetLibraryInfo *TLI,
    function_ref<hash_code(size_t, LoadInst *)> LoadsSubkeyGenerator,
    bool AllowAlternate) {}

/// Checks if the specified instruction \p I is an alternate operation for
/// the given \p MainOp and \p AltOp instructions.
static bool isAlternateInstruction(const Instruction *I,
                                   const Instruction *MainOp,
                                   const Instruction *AltOp,
                                   const TargetLibraryInfo &TLI);

bool BoUpSLP::areAltOperandsProfitable(const InstructionsState &S,
                                       ArrayRef<Value *> VL) const {}

BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
    InstructionsState &S, ArrayRef<Value *> VL, bool IsScatterVectorizeUserTE,
    OrdersType &CurrentOrder, SmallVectorImpl<Value *> &PointerOps) {}

namespace {
/// Allows correctly handling the operands of phi nodes, based on the \p Main
/// PHINode order of incoming basic blocks/values.
class PHIHandler {};
} // namespace

void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
                            const EdgeInfo &UserTreeIdx) {}

unsigned BoUpSLP::canMapToVector(Type *T) const {}

bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, Value *OpValue,
                              SmallVectorImpl<unsigned> &CurrentOrder,
                              bool ResizeAllowed) const {}

bool BoUpSLP::areAllUsersVectorized(
    Instruction *I, const SmallDenseSet<Value *> *VectorizedVals) const {}

static std::pair<InstructionCost, InstructionCost>
getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy,
                   TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
                   ArrayRef<Type *> ArgTys) {}

void BoUpSLP::TreeEntry::buildAltOpShuffleMask(
    const function_ref<bool(Instruction *)> IsAltOp, SmallVectorImpl<int> &Mask,
    SmallVectorImpl<Value *> *OpScalars,
    SmallVectorImpl<Value *> *AltScalars) const {}

static bool isAlternateInstruction(const Instruction *I,
                                   const Instruction *MainOp,
                                   const Instruction *AltOp,
                                   const TargetLibraryInfo &TLI) {}

TTI::OperandValueInfo BoUpSLP::getOperandInfo(ArrayRef<Value *> Ops) {}

namespace {
/// The base class for shuffle instruction emission and shuffle cost estimation.
class BaseShuffleAnalysis {};
} // namespace

/// Calculate the scalar and the vector costs from vectorizing set of GEPs.
static std::pair<InstructionCost, InstructionCost>
getGEPCosts(const TargetTransformInfo &TTI, ArrayRef<Value *> Ptrs,
            Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind,
            Type *ScalarTy, VectorType *VecTy) {}

void BoUpSLP::transformNodes() {}

/// Merges shuffle masks and emits final shuffle instruction, if required. It
/// supports shuffling of 2 input vectors. It implements lazy shuffles emission,
/// when the actual shuffle instruction is generated only if this is actually
/// required. Otherwise, the shuffle instruction emission is delayed till the
/// end of the process, to reduce the number of emitted instructions and further
/// analysis/transformations.
class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {};

const BoUpSLP::TreeEntry *BoUpSLP::getOperandEntry(const TreeEntry *E,
                                                   unsigned Idx) const {}

TTI::CastContextHint BoUpSLP::getCastContextHint(const TreeEntry &TE) const {}

/// Builds the argument types vector for the given call instruction with the
/// given \p ID for the specified vector factor.
static SmallVector<Type *> buildIntrinsicArgTypes(const CallInst *CI,
                                                  const Intrinsic::ID ID,
                                                  const unsigned VF,
                                                  unsigned MinBW) {}

InstructionCost
BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
                      SmallPtrSetImpl<Value *> &CheckedExtracts) {}

bool BoUpSLP::isFullyVectorizableTinyTree(bool ForReduction) const {}

static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts,
                                       TargetTransformInfo *TTI,
                                       bool MustMatchOrInst) {}

bool BoUpSLP::isLoadCombineReductionCandidate(RecurKind RdxKind) const {}

bool BoUpSLP::isLoadCombineCandidate(ArrayRef<Value *> Stores) const {}

bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const {}

InstructionCost BoUpSLP::getSpillCost() const {}

/// Checks if the \p IE1 instruction is followed by the \p IE2 instruction in
/// the buildvector sequence.
static bool isFirstInsertElement(const InsertElementInst *IE1,
                                 const InsertElementInst *IE2) {}

namespace {
/// Returns incoming Value *, if the requested type is Value * too, or a default
/// value, otherwise.
struct ValueSelect {};
} // namespace

/// Does the analysis of the provided shuffle masks and performs the requested
/// actions on the vectors with the given shuffle masks. It tries to do it in
/// several steps.
/// 1. If the Base vector is not an undef vector, resize the very first mask to
/// have a common VF and perform the action for 2 input vectors (including the
/// non-undef Base). Other shuffle masks are combined with the result of the
/// first stage and processed as a shuffle of 2 elements.
/// 2. If the Base is an undef vector and there is only 1 shuffle mask, perform
/// the action only for 1 vector with the given mask, if it is not the identity
/// mask.
/// 3. If > 2 masks are used, perform the remaining shuffle actions for 2
/// vectors, combining the masks properly between the steps.
template <typename T>
static T *performExtractsShuffleAction(
    MutableArrayRef<std::pair<T *, SmallVector<int>>> ShuffleMask, Value *Base,
    function_ref<unsigned(T *)> GetVF,
    function_ref<std::pair<T *, bool>(T *, ArrayRef<int>, bool)> ResizeAction,
    function_ref<T *(ArrayRef<int>, ArrayRef<T *>)> Action) {}

namespace {
/// Data type for handling buildvector sequences with the reused scalars from
/// other tree entries.
template <typename T> struct ShuffledInsertData {};
} // namespace

InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {}

/// Tries to find extractelement instructions with constant indices from a
/// fixed vector type and gathers such instructions into a bunch, which will
/// highly likely be detected as a shuffle of 1 or 2 input vectors. If this
/// attempt was successful, the matched scalars are replaced by poison values
/// in \p VL for future analysis.
std::optional<TTI::ShuffleKind>
BoUpSLP::tryToGatherSingleRegisterExtractElements(
    MutableArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) const {}

/// Tries to find extractelement instructions with constant indices from a
/// fixed vector type and gathers such instructions into a bunch, which will
/// highly likely be detected as a shuffle of 1 or 2 input vectors. If this
/// attempt was successful, the matched scalars are replaced by poison values
/// in \p VL for future analysis.
SmallVector<std::optional<TTI::ShuffleKind>>
BoUpSLP::tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
                                    SmallVectorImpl<int> &Mask,
                                    unsigned NumParts) const {}

std::optional<TargetTransformInfo::ShuffleKind>
BoUpSLP::isGatherShuffledSingleRegisterEntry(
    const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
    SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part, bool ForOrder) {}

SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
BoUpSLP::isGatherShuffledEntry(
    const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
    SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries, unsigned NumParts,
    bool ForOrder) {}

InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
                                       Type *ScalarTy) const {}

// Perform operand reordering on the instructions in VL and return the reordered
// operands in Left and Right.
void BoUpSLP::reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
                                             SmallVectorImpl<Value *> &Left,
                                             SmallVectorImpl<Value *> &Right,
                                             const BoUpSLP &R) {}

Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {}

void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) {}

Value *BoUpSLP::gather(ArrayRef<Value *> VL, Value *Root, Type *ScalarTy) {}

/// Merges shuffle masks and emits final shuffle instruction, if required. It
/// supports shuffling of 2 input vectors. It implements lazy shuffles emission,
/// when the actual shuffle instruction is generated only if this is actually
/// required. Otherwise, the shuffle instruction emission is delayed till the
/// end of the process, to reduce the number of emitted instructions and further
/// analysis/transformations.
/// The class will also look through the previously emitted shuffle
/// instructions and properly mark indices in the mask as undef.
/// For example, given the code
/// \code
/// %s1 = shufflevector <2 x ty> %0, poison, <1, 0>
/// %s2 = shufflevector <2 x ty> %1, poison, <1, 0>
/// \endcode
/// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 3, 2>, it
/// will look through %s1 and %s2 and emit
/// \code
/// %res = shufflevector <2 x ty> %0, %1, <0, 1, 2, 3>
/// \endcode
/// instead.
/// If the 2 operands are of different sizes, the smaller one will be resized
/// and the mask recalculated properly.
/// For example, given the code
/// \code
/// %s1 = shufflevector <2 x ty> %0, poison, <1, 0, 1, 0>
/// %s2 = shufflevector <2 x ty> %1, poison, <1, 0, 1, 0>
/// \endcode
/// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 5, 4>, it
/// will look through %s1 and %s2 and emit
/// \code
/// %res = shufflevector <2 x ty> %0, %1, <0, 1, 2, 3>
/// \endcode
/// instead.
class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {};

BoUpSLP::TreeEntry *BoUpSLP::getMatchedVectorizedOperand(const TreeEntry *E,
                                                         unsigned NodeIdx) {}

Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx,
                                 bool PostponedPHIs) {}

template <typename BVTy, typename ResTy, typename... Args>
ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
                                  Args &...Params) {}

Value *BoUpSLP::createBuildVector(const TreeEntry *E, Type *ScalarTy,
                                  bool PostponedPHIs) {}

Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {}

Value *BoUpSLP::vectorizeTree() {}

Value *
BoUpSLP::vectorizeTree(const ExtraValueToDebugLocsMap &ExternallyUsedValues,
                       Instruction *ReductionRoot) {}

void BoUpSLP::optimizeGatherSequence() {}

BoUpSLP::ScheduleData *
BoUpSLP::BlockScheduling::buildBundle(ArrayRef<Value *> VL) {}

std::optional<BoUpSLP::ScheduleData *>
BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
                                            const InstructionsState &S) {}

void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef<Value *> VL,
                                                Value *OpValue) {}

BoUpSLP::ScheduleData *BoUpSLP::BlockScheduling::allocateScheduleDataChunks() {}

bool BoUpSLP::BlockScheduling::extendSchedulingRegion(
    Value *V, const InstructionsState &S) {}

void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
                                                Instruction *ToI,
                                                ScheduleData *PrevLoadStore,
                                                ScheduleData *NextLoadStore) {}

void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
                                                     bool InsertInReadyList,
                                                     BoUpSLP *SLP) {}

void BoUpSLP::BlockScheduling::resetSchedule() {}

void BoUpSLP::scheduleBlock(BlockScheduling *BS) {}

unsigned BoUpSLP::getVectorElementSize(Value *V) {}

bool BoUpSLP::collectValuesToDemote(
    const TreeEntry &E, bool IsProfitableToDemoteRoot, unsigned &BitWidth,
    SmallVectorImpl<unsigned> &ToDemote, DenseSet<const TreeEntry *> &Visited,
    unsigned &MaxDepthLevel, bool &IsProfitableToDemote,
    bool IsTruncRoot) const {}

static RecurKind getRdxKind(Value *V);

void BoUpSLP::computeMinimumValueSizes() {}

PreservedAnalyses SLPVectorizerPass::run(Function &F,
                                         FunctionAnalysisManager &AM) {}

bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
                                TargetTransformInfo *TTI_,
                                TargetLibraryInfo *TLI_, AAResults *AA_,
                                LoopInfo *LI_, DominatorTree *DT_,
                                AssumptionCache *AC_, DemandedBits *DB_,
                                OptimizationRemarkEmitter *ORE_) {}

std::optional<bool>
SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
                                       unsigned Idx, unsigned MinVF,
                                       unsigned &Size) {}

static bool checkTreeSizes(ArrayRef<std::pair<unsigned, unsigned>> Sizes,
                           bool First) {}

bool SLPVectorizerPass::vectorizeStores(
    ArrayRef<StoreInst *> Stores, BoUpSLP &R,
    DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>>
        &Visited) {}

void SLPVectorizerPass::collectSeedInstructions(BasicBlock *BB) {}

bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
                                           bool MaxVFOnly) {}

bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) {}

class HorizontalReduction {};

static RecurKind getRdxKind(Value *V) {}

static std::optional<unsigned> getAggregateSize(Instruction *InsertInst) {}

static void findBuildAggregate_rec(Instruction *LastInsertInst,
                                   TargetTransformInfo *TTI,
                                   SmallVectorImpl<Value *> &BuildVectorOpds,
                                   SmallVectorImpl<Value *> &InsertElts,
                                   unsigned OperandOffset) {}

static bool findBuildAggregate(Instruction *LastInsertInst,
                               TargetTransformInfo *TTI,
                               SmallVectorImpl<Value *> &BuildVectorOpds,
                               SmallVectorImpl<Value *> &InsertElts) {}

static Instruction *getReductionInstr(const DominatorTree *DT, PHINode *P,
                                      BasicBlock *ParentBB, LoopInfo *LI) {}

static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1) {}

static Instruction *tryGetSecondaryReductionRoot(PHINode *Phi,
                                                 Instruction *Root) {}

static Instruction *getNonPhiOperand(Instruction *I, PHINode *Phi) {}

static bool isReductionCandidate(Instruction *I) {}

bool SLPVectorizerPass::vectorizeHorReduction(
    PHINode *P, Instruction *Root, BasicBlock *BB, BoUpSLP &R,
    TargetTransformInfo *TTI,
    SmallVectorImpl<WeakTrackingVH> &PostponedInsts) {}

bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Instruction *Root,
                                                 BasicBlock *BB, BoUpSLP &R,
                                                 TargetTransformInfo *TTI) {}

bool SLPVectorizerPass::tryToVectorize(ArrayRef<WeakTrackingVH> Insts,
                                       BoUpSLP &R) {}

bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI,
                                                 BasicBlock *BB, BoUpSLP &R,
                                                 bool MaxVFOnly) {}

bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI,
                                                   BasicBlock *BB, BoUpSLP &R,
                                                   bool MaxVFOnly) {}

template <typename T>
static bool tryToVectorizeSequence(
    SmallVectorImpl<T *> &Incoming, function_ref<bool(T *, T *)> Comparator,
    function_ref<bool(T *, T *)> AreCompatible,
    function_ref<bool(ArrayRef<T *>, bool)> TryToVectorizeHelper,
    bool MaxVFOnly, BoUpSLP &R) {}

template <bool IsCompatibility>
static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI,
                       const DominatorTree &DT) {}

template <typename ItT>
bool SLPVectorizerPass::vectorizeCmpInsts(iterator_range<ItT> CmpInsts,
                                          BasicBlock *BB, BoUpSLP &R) {}

bool SLPVectorizerPass::vectorizeInserts(InstSetVector &Instructions,
                                         BasicBlock *BB, BoUpSLP &R) {}

bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {}

bool SLPVectorizerPass::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) {}

bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {}