llvm/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp

//===-- AArch64TargetTransformInfo.cpp - AArch64 specific TTI -------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "AArch64TargetTransformInfo.h"
#include "AArch64ExpandImm.h"
#include "AArch64PerfectShuffle.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "llvm/Analysis/IVDescriptors.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/CodeGen/CostTable.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsAArch64.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Support/Debug.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"
#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
#include <algorithm>
#include <optional>
usingnamespacellvm;
usingnamespacellvm::PatternMatch;

#define DEBUG_TYPE

static cl::opt<bool> EnableFalkorHWPFUnrollFix("enable-falkor-hwpf-unroll-fix",
                                               cl::init(true), cl::Hidden);

static cl::opt<unsigned> SVEGatherOverhead("sve-gather-overhead", cl::init(10),
                                           cl::Hidden);

static cl::opt<unsigned> SVEScatterOverhead("sve-scatter-overhead",
                                            cl::init(10), cl::Hidden);

static cl::opt<unsigned> SVETailFoldInsnThreshold("sve-tail-folding-insn-threshold",
                                                  cl::init(15), cl::Hidden);

static cl::opt<unsigned>
    NeonNonConstStrideOverhead("neon-nonconst-stride-overhead", cl::init(10),
                               cl::Hidden);

static cl::opt<unsigned> CallPenaltyChangeSM(
    "call-penalty-sm-change", cl::init(5), cl::Hidden,
    cl::desc(
        "Penalty of calling a function that requires a change to PSTATE.SM"));

static cl::opt<unsigned> InlineCallPenaltyChangeSM(
    "inline-call-penalty-sm-change", cl::init(10), cl::Hidden,
    cl::desc("Penalty of inlining a call that requires a change to PSTATE.SM"));

static cl::opt<bool> EnableOrLikeSelectOpt("enable-aarch64-or-like-select",
                                           cl::init(true), cl::Hidden);

static cl::opt<bool> EnableLSRCostOpt("enable-aarch64-lsr-cost-opt",
                                      cl::init(true), cl::Hidden);

// A complete guess as to a reasonable cost.
static cl::opt<unsigned>
    BaseHistCntCost("aarch64-base-histcnt-cost", cl::init(8), cl::Hidden,
                    cl::desc("The cost of a histcnt instruction"));

namespace {
class TailFoldingOption {};
} // namespace

TailFoldingOption TailFoldingOptionLoc;

cl::opt<TailFoldingOption, true, cl::parser<std::string>> SVETailFolding(
    "sve-tail-folding",
    cl::desc(
        "Control the use of vectorisation using tail-folding for SVE where the"
        " option is specified in the form (Initial)[+(Flag1|Flag2|...)]:"
        "\ndisabled      (Initial) No loop types will vectorize using "
        "tail-folding"
        "\ndefault       (Initial) Uses the default tail-folding settings for "
        "the target CPU"
        "\nall           (Initial) All legal loop types will vectorize using "
        "tail-folding"
        "\nsimple        (Initial) Use tail-folding for simple loops (not "
        "reductions or recurrences)"
        "\nreductions    Use tail-folding for loops containing reductions"
        "\nnoreductions  Inverse of above"
        "\nrecurrences   Use tail-folding for loops containing fixed order "
        "recurrences"
        "\nnorecurrences Inverse of above"
        "\nreverse       Use tail-folding for loops requiring reversed "
        "predicates"
        "\nnoreverse     Inverse of above"),
    cl::location(TailFoldingOptionLoc));

// Experimental option that will only be fully functional when the
// code-generator is changed to use SVE instead of NEON for all fixed-width
// operations.
static cl::opt<bool> EnableFixedwidthAutovecInStreamingMode(
    "enable-fixedwidth-autovec-in-streaming-mode", cl::init(false), cl::Hidden);

// Experimental option that will only be fully functional when the cost-model
// and code-generator have been changed to avoid using scalable vector
// instructions that are not legal in streaming SVE mode.
static cl::opt<bool> EnableScalableAutovecInStreamingMode(
    "enable-scalable-autovec-in-streaming-mode", cl::init(false), cl::Hidden);

static bool isSMEABIRoutineCall(const CallInst &CI) {}

/// Returns true if the function has explicit operations that can only be
/// lowered using incompatible instructions for the selected mode. This also
/// returns true if the function F may use or modify ZA state.
static bool hasPossibleIncompatibleOps(const Function *F) {}

bool AArch64TTIImpl::areInlineCompatible(const Function *Caller,
                                         const Function *Callee) const {}

bool AArch64TTIImpl::areTypesABICompatible(
    const Function *Caller, const Function *Callee,
    const ArrayRef<Type *> &Types) const {}

unsigned
AArch64TTIImpl::getInlineCallPenalty(const Function *F, const CallBase &Call,
                                     unsigned DefaultCallPenalty) const {}

bool AArch64TTIImpl::shouldMaximizeVectorBandwidth(
    TargetTransformInfo::RegisterKind K) const {}

/// Calculate the cost of materializing a 64-bit value. This helper
/// method might only calculate a fraction of a larger immediate. Therefore it
/// is valid to return a cost of ZERO.
InstructionCost AArch64TTIImpl::getIntImmCost(int64_t Val) {}

/// Calculate the cost of materializing the given constant.
InstructionCost AArch64TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
                                              TTI::TargetCostKind CostKind) {}

InstructionCost AArch64TTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
                                                  const APInt &Imm, Type *Ty,
                                                  TTI::TargetCostKind CostKind,
                                                  Instruction *Inst) {}

InstructionCost
AArch64TTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
                                    const APInt &Imm, Type *Ty,
                                    TTI::TargetCostKind CostKind) {}

TargetTransformInfo::PopcntSupportKind
AArch64TTIImpl::getPopcntSupport(unsigned TyWidth) {}

static bool isUnpackedVectorVT(EVT VecVT) {}

static InstructionCost getHistogramCost(const IntrinsicCostAttributes &ICA) {}

InstructionCost
AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
                                      TTI::TargetCostKind CostKind) {}

/// The function will remove redundant reinterprets casting in the presence
/// of the control flow
static std::optional<Instruction *> processPhiNode(InstCombiner &IC,
                                                   IntrinsicInst &II) {}

// (from_svbool (binop (to_svbool pred) (svbool_t _) (svbool_t _))))
// => (binop (pred) (from_svbool _) (from_svbool _))
//
// The above transformation eliminates a `to_svbool` in the predicate
// operand of bitwise operation `binop` by narrowing the vector width of
// the operation. For example, it would convert a `<vscale x 16 x i1>
// and` into a `<vscale x 4 x i1> and`. This is profitable because
// to_svbool must zero the new lanes during widening, whereas
// from_svbool is free.
static std::optional<Instruction *>
tryCombineFromSVBoolBinOp(InstCombiner &IC, IntrinsicInst &II) {}

static std::optional<Instruction *>
instCombineConvertFromSVBool(InstCombiner &IC, IntrinsicInst &II) {}

static bool isAllActivePredicate(Value *Pred) {}

// Simplify unary operation where predicate has all inactive lanes by replacing
// instruction with its operand
static std::optional<Instruction *>
instCombineSVENoActiveReplace(InstCombiner &IC, IntrinsicInst &II,
                              bool hasInactiveVector) {}

// Simplify unary operation where predicate has all inactive lanes or
// replace unused first operand with undef when all lanes are active
static std::optional<Instruction *>
instCombineSVEAllOrNoActiveUnary(InstCombiner &IC, IntrinsicInst &II) {}

// Erase unary operation where predicate has all inactive lanes
static std::optional<Instruction *>
instCombineSVENoActiveUnaryErase(InstCombiner &IC, IntrinsicInst &II,
                                 int PredPos) {}

// Simplify operation where predicate has all inactive lanes by replacing
// instruction with zeroed object
static std::optional<Instruction *>
instCombineSVENoActiveZero(InstCombiner &IC, IntrinsicInst &II) {}

static std::optional<Instruction *> instCombineSVESel(InstCombiner &IC,
                                                      IntrinsicInst &II) {}

static std::optional<Instruction *> instCombineSVEDup(InstCombiner &IC,
                                                      IntrinsicInst &II) {}

static std::optional<Instruction *> instCombineSVEDupX(InstCombiner &IC,
                                                       IntrinsicInst &II) {}

static std::optional<Instruction *> instCombineSVECmpNE(InstCombiner &IC,
                                                        IntrinsicInst &II) {}

static std::optional<Instruction *> instCombineSVELast(InstCombiner &IC,
                                                       IntrinsicInst &II) {}

static std::optional<Instruction *> instCombineSVECondLast(InstCombiner &IC,
                                                           IntrinsicInst &II) {}

static std::optional<Instruction *> instCombineRDFFR(InstCombiner &IC,
                                                     IntrinsicInst &II) {}

static std::optional<Instruction *>
instCombineSVECntElts(InstCombiner &IC, IntrinsicInst &II, unsigned NumElts) {}

static std::optional<Instruction *> instCombineSVEPTest(InstCombiner &IC,
                                                        IntrinsicInst &II) {}

template <Intrinsic::ID MulOpc, typename Intrinsic::ID FuseOpc>
static std::optional<Instruction *>
instCombineSVEVectorFuseMulAddSub(InstCombiner &IC, IntrinsicInst &II,
                                  bool MergeIntoAddendOp) {}

static std::optional<Instruction *>
instCombineSVELD1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL) {}

static std::optional<Instruction *>
instCombineSVEST1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL) {}

static Instruction::BinaryOps intrinsicIDToBinOpCode(unsigned Intrinsic) {}

static std::optional<Instruction *>
instCombineSVEVectorBinOp(InstCombiner &IC, IntrinsicInst &II) {}

// Canonicalise operations that take an all active predicate (e.g. sve.add ->
// sve.add_u).
static std::optional<Instruction *> instCombineSVEAllActive(IntrinsicInst &II,
                                                            Intrinsic::ID IID) {}

// Simplify operations where predicate has all inactive lanes or try to replace
// with _u form when all lanes are active
static std::optional<Instruction *>
instCombineSVEAllOrNoActive(InstCombiner &IC, IntrinsicInst &II,
                            Intrinsic::ID IID) {}

static std::optional<Instruction *> instCombineSVEVectorAdd(InstCombiner &IC,
                                                            IntrinsicInst &II) {}

static std::optional<Instruction *>
instCombineSVEVectorFAdd(InstCombiner &IC, IntrinsicInst &II) {}

static std::optional<Instruction *>
instCombineSVEVectorFAddU(InstCombiner &IC, IntrinsicInst &II) {}

static std::optional<Instruction *>
instCombineSVEVectorFSub(InstCombiner &IC, IntrinsicInst &II) {}

static std::optional<Instruction *>
instCombineSVEVectorFSubU(InstCombiner &IC, IntrinsicInst &II) {}

static std::optional<Instruction *> instCombineSVEVectorSub(InstCombiner &IC,
                                                            IntrinsicInst &II) {}

static std::optional<Instruction *> instCombineSVEVectorMul(InstCombiner &IC,
                                                            IntrinsicInst &II,
                                                            Intrinsic::ID IID) {}

static std::optional<Instruction *> instCombineSVEUnpack(InstCombiner &IC,
                                                         IntrinsicInst &II) {}
static std::optional<Instruction *> instCombineSVETBL(InstCombiner &IC,
                                                      IntrinsicInst &II) {}

static std::optional<Instruction *> instCombineSVEUzp1(InstCombiner &IC,
                                                       IntrinsicInst &II) {}

static std::optional<Instruction *> instCombineSVEZip(InstCombiner &IC,
                                                      IntrinsicInst &II) {}

static std::optional<Instruction *>
instCombineLD1GatherIndex(InstCombiner &IC, IntrinsicInst &II) {}

static std::optional<Instruction *>
instCombineST1ScatterIndex(InstCombiner &IC, IntrinsicInst &II) {}

static std::optional<Instruction *> instCombineSVESDIV(InstCombiner &IC,
                                                       IntrinsicInst &II) {}

bool SimplifyValuePattern(SmallVector<Value *> &Vec, bool AllowPoison) {}

// Try to simplify dupqlane patterns like dupqlane(f32 A, f32 B, f32 A, f32 B)
// to dupqlane(f64(C)) where C is A concatenated with B
static std::optional<Instruction *> instCombineSVEDupqLane(InstCombiner &IC,
                                                           IntrinsicInst &II) {}

static std::optional<Instruction *> instCombineMaxMinNM(InstCombiner &IC,
                                                        IntrinsicInst &II) {}

static std::optional<Instruction *> instCombineSVESrshl(InstCombiner &IC,
                                                        IntrinsicInst &II) {}

static std::optional<Instruction *> instCombineSVEInsr(InstCombiner &IC,
                                                       IntrinsicInst &II) {}

std::optional<Instruction *>
AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC,
                                     IntrinsicInst &II) const {}

std::optional<Value *> AArch64TTIImpl::simplifyDemandedVectorEltsIntrinsic(
    InstCombiner &IC, IntrinsicInst &II, APInt OrigDemandedElts,
    APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3,
    std::function<void(Instruction *, unsigned, APInt, APInt &)>
        SimplifyAndSetOp) const {}

bool AArch64TTIImpl::enableScalableVectorization() const {}

TypeSize
AArch64TTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {}

bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode,
                                           ArrayRef<const Value *> Args,
                                           Type *SrcOverrideTy) {}

// s/urhadd instructions implement the following pattern, making the
// extends free:
//   %x = add ((zext i8 -> i16), 1)
//   %y = (zext i8 -> i16)
//   trunc i16 (lshr (add %x, %y), 1) -> i8
//
bool AArch64TTIImpl::isExtPartOfAvgExpr(const Instruction *ExtUser, Type *Dst,
                                        Type *Src) {}

InstructionCost AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
                                                 Type *Src,
                                                 TTI::CastContextHint CCH,
                                                 TTI::TargetCostKind CostKind,
                                                 const Instruction *I) {}

InstructionCost AArch64TTIImpl::getExtractWithExtendCost(unsigned Opcode,
                                                         Type *Dst,
                                                         VectorType *VecTy,
                                                         unsigned Index) {}

InstructionCost AArch64TTIImpl::getCFInstrCost(unsigned Opcode,
                                               TTI::TargetCostKind CostKind,
                                               const Instruction *I) {}

InstructionCost AArch64TTIImpl::getVectorInstrCostHelper(const Instruction *I,
                                                         Type *Val,
                                                         unsigned Index,
                                                         bool HasRealUse) {}

InstructionCost AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
                                                   TTI::TargetCostKind CostKind,
                                                   unsigned Index, Value *Op0,
                                                   Value *Op1) {}

InstructionCost AArch64TTIImpl::getVectorInstrCost(const Instruction &I,
                                                   Type *Val,
                                                   TTI::TargetCostKind CostKind,
                                                   unsigned Index) {}

InstructionCost AArch64TTIImpl::getScalarizationOverhead(
    VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract,
    TTI::TargetCostKind CostKind) {}

InstructionCost AArch64TTIImpl::getArithmeticInstrCost(
    unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
    TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
    ArrayRef<const Value *> Args,
    const Instruction *CxtI) {}

InstructionCost AArch64TTIImpl::getAddressComputationCost(Type *Ty,
                                                          ScalarEvolution *SE,
                                                          const SCEV *Ptr) {}

InstructionCost AArch64TTIImpl::getCmpSelInstrCost(
    unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred,
    TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info,
    TTI::OperandValueInfo Op2Info, const Instruction *I) {}

AArch64TTIImpl::TTI::MemCmpExpansionOptions
AArch64TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {}

bool AArch64TTIImpl::prefersVectorizedAddressing() const {}

InstructionCost
AArch64TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src,
                                      Align Alignment, unsigned AddressSpace,
                                      TTI::TargetCostKind CostKind) {}

// This function returns gather/scatter overhead either from
// user-provided value or specialized values per-target from \p ST.
static unsigned getSVEGatherScatterOverhead(unsigned Opcode,
                                            const AArch64Subtarget *ST) {}

InstructionCost AArch64TTIImpl::getGatherScatterOpCost(
    unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
    Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) {}

bool AArch64TTIImpl::useNeonVector(const Type *Ty) const {}

InstructionCost AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Ty,
                                                MaybeAlign Alignment,
                                                unsigned AddressSpace,
                                                TTI::TargetCostKind CostKind,
                                                TTI::OperandValueInfo OpInfo,
                                                const Instruction *I) {}

InstructionCost AArch64TTIImpl::getInterleavedMemoryOpCost(
    unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
    Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
    bool UseMaskForCond, bool UseMaskForGaps) {}

InstructionCost
AArch64TTIImpl::getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) {}

unsigned AArch64TTIImpl::getMaxInterleaveFactor(ElementCount VF) {}

// For Falkor, we want to avoid having too many strided loads in a loop since
// that can exhaust the HW prefetcher resources.  We adjust the unroller
// MaxCount preference below to attempt to ensure unrolling doesn't create too
// many strided loads.
static void
getFalkorUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                              TargetTransformInfo::UnrollingPreferences &UP) {}

void AArch64TTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                             TTI::UnrollingPreferences &UP,
                                             OptimizationRemarkEmitter *ORE) {}

void AArch64TTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                                           TTI::PeelingPreferences &PP) {}

Value *AArch64TTIImpl::getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst,
                                                         Type *ExpectedType) {}

bool AArch64TTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
                                        MemIntrinsicInfo &Info) {}

/// See if \p I should be considered for address type promotion. We check if \p
/// I is a sext with right type and used in memory accesses. If it used in a
/// "complex" getelementptr, we allow it to be promoted without finding other
/// sext instructions that sign extended the same initial value. A getelementptr
/// is considered as "complex" if it has more than 2 operands.
bool AArch64TTIImpl::shouldConsiderAddressTypePromotion(
    const Instruction &I, bool &AllowPromotionWithoutCommonHeader) {}

bool AArch64TTIImpl::isLegalToVectorizeReduction(
    const RecurrenceDescriptor &RdxDesc, ElementCount VF) const {}

InstructionCost
AArch64TTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
                                       FastMathFlags FMF,
                                       TTI::TargetCostKind CostKind) {}

InstructionCost AArch64TTIImpl::getArithmeticReductionCostSVE(
    unsigned Opcode, VectorType *ValTy, TTI::TargetCostKind CostKind) {}

InstructionCost
AArch64TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
                                           std::optional<FastMathFlags> FMF,
                                           TTI::TargetCostKind CostKind) {}

InstructionCost AArch64TTIImpl::getSpliceCost(VectorType *Tp, int Index) {}

InstructionCost AArch64TTIImpl::getShuffleCost(
    TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef<int> Mask,
    TTI::TargetCostKind CostKind, int Index, VectorType *SubTp,
    ArrayRef<const Value *> Args, const Instruction *CxtI) {}

static bool containsDecreasingPointers(Loop *TheLoop,
                                       PredicatedScalarEvolution *PSE) {}

bool AArch64TTIImpl::preferPredicateOverEpilogue(TailFoldingInfo *TFI) {}

InstructionCost
AArch64TTIImpl::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
                                     StackOffset BaseOffset, bool HasBaseReg,
                                     int64_t Scale, unsigned AddrSpace) const {}

bool AArch64TTIImpl::shouldTreatInstructionLikeSelect(const Instruction *I) {}

bool AArch64TTIImpl::isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
                                   const TargetTransformInfo::LSRCost &C2) {}

static bool isSplatShuffle(Value *V) {}

/// Check if both Op1 and Op2 are shufflevector extracts of either the lower
/// or upper half of the vector elements.
static bool areExtractShuffleVectors(Value *Op1, Value *Op2,
                                     bool AllowSplat = false) {}

/// Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth
/// of the vector elements.
static bool areExtractExts(Value *Ext1, Value *Ext2) {}

/// Check if Op could be used with vmull_high_p64 intrinsic.
static bool isOperandOfVmullHighP64(Value *Op) {}

/// Check if Op1 and Op2 could be used with vmull_high_p64 intrinsic.
static bool areOperandsOfVmullHighP64(Value *Op1, Value *Op2) {}

static bool shouldSinkVectorOfPtrs(Value *Ptrs, SmallVectorImpl<Use *> &Ops) {}

/// We want to sink following cases:
/// (add|sub|gep) A, ((mul|shl) vscale, imm); (add|sub|gep) A, vscale;
/// (add|sub|gep) A, ((mul|shl) zext(vscale), imm);
static bool shouldSinkVScale(Value *Op, SmallVectorImpl<Use *> &Ops) {}

/// Check if sinking \p I's operands to I's basic block is profitable, because
/// the operands can be folded into a target instruction, e.g.
/// shufflevectors extracts and/or sext/zext can be folded into (u,s)subl(2).
bool AArch64TTIImpl::isProfitableToSinkOperands(
    Instruction *I, SmallVectorImpl<Use *> &Ops) const {}