ARMTargetTransformInfo.cpp | Explore in Territory

//===- ARMTargetTransformInfo.cpp - ARM specific TTI ----------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "ARMTargetTransformInfo.h"
#include "ARMSubtarget.h"
#include "MCTargetDesc/ARMAddressingModes.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/CodeGen/CostTable.h"
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/CodeGenTypes/MachineValueType.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsARM.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/TargetParser/SubtargetFeature.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <optional>
#include <utility>

usingnamespacellvm;

#define DEBUG_TYPE …

static cl::opt<bool> EnableMaskedLoadStores(
  "enable-arm-maskedldst", cl::Hidden, cl::init(true),
  cl::desc("Enable the generation of masked loads and stores"));

static cl::opt<bool> DisableLowOverheadLoops(
  "disable-arm-loloops", cl::Hidden, cl::init(false),
  cl::desc("Disable the generation of low-overhead loops"));

static cl::opt<bool>
    AllowWLSLoops("allow-arm-wlsloops", cl::Hidden, cl::init(true),
                  cl::desc("Enable the generation of WLS loops"));

extern cl::opt<TailPredication::Mode> EnableTailPredication;

extern cl::opt<bool> EnableMaskedGatherScatters;

extern cl::opt<unsigned> MVEMaxSupportedInterleaveFactor;

/// Convert a vector load intrinsic into a simple llvm load instruction.
/// This is beneficial when the underlying object being addressed comes
/// from a constant, since we get constant-folding for free.
static Value *simplifyNeonVld1(const IntrinsicInst &II, unsigned MemAlign,
                               InstCombiner::BuilderTy &Builder) { … }

bool ARMTTIImpl::areInlineCompatible(const Function *Caller,
                                     const Function *Callee) const { … }

TTI::AddressingModeKind
ARMTTIImpl::getPreferredAddressingMode(const Loop *L,
                                       ScalarEvolution *SE) const { … }

std::optional<Instruction *>
ARMTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { … }

std::optional<Value *> ARMTTIImpl::simplifyDemandedVectorEltsIntrinsic(
    InstCombiner &IC, IntrinsicInst &II, APInt OrigDemandedElts,
    APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3,
    std::function<void(Instruction *, unsigned, APInt, APInt &)>
        SimplifyAndSetOp) const { … }

InstructionCost ARMTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
                                          TTI::TargetCostKind CostKind) { … }

// Constants smaller than 256 fit in the immediate field of
// Thumb1 instructions so we return a zero cost and 1 otherwise.
InstructionCost ARMTTIImpl::getIntImmCodeSizeCost(unsigned Opcode, unsigned Idx,
                                                  const APInt &Imm, Type *Ty) { … }

// Checks whether Inst is part of a min(max()) or max(min()) pattern
// that will match to an SSAT instruction. Returns the instruction being
// saturated, or null if no saturation pattern was found.
static Value *isSSATMinMaxPattern(Instruction *Inst, const APInt &Imm) { … }

// Look for a FP Saturation pattern, where the instruction can be simplified to
// a fptosi.sat. max(min(fptosi)). The constant in this case is always free.
static bool isFPSatMinMaxPattern(Instruction *Inst, const APInt &Imm) { … }

InstructionCost ARMTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
                                              const APInt &Imm, Type *Ty,
                                              TTI::TargetCostKind CostKind,
                                              Instruction *Inst) { … }

InstructionCost ARMTTIImpl::getCFInstrCost(unsigned Opcode,
                                           TTI::TargetCostKind CostKind,
                                           const Instruction *I) { … }

InstructionCost ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
                                             Type *Src,
                                             TTI::CastContextHint CCH,
                                             TTI::TargetCostKind CostKind,
                                             const Instruction *I) { … }

InstructionCost ARMTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
                                               TTI::TargetCostKind CostKind,
                                               unsigned Index, Value *Op0,
                                               Value *Op1) { … }

InstructionCost ARMTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
                                               Type *CondTy,
                                               CmpInst::Predicate VecPred,
                                               TTI::TargetCostKind CostKind,
                                               const Instruction *I) { … }

InstructionCost ARMTTIImpl::getAddressComputationCost(Type *Ty,
                                                      ScalarEvolution *SE,
                                                      const SCEV *Ptr) { … }

bool ARMTTIImpl::isProfitableLSRChainElement(Instruction *I) { … }

bool ARMTTIImpl::isLegalMaskedLoad(Type *DataTy, Align Alignment) { … }

bool ARMTTIImpl::isLegalMaskedGather(Type *Ty, Align Alignment) { … }

/// Given a memcpy/memset/memmove instruction, return the number of memory
/// operations performed, via querying findOptimalMemOpLowering. Returns -1 if a
/// call is used.
int ARMTTIImpl::getNumMemOps(const IntrinsicInst *I) const { … }

InstructionCost ARMTTIImpl::getMemcpyCost(const Instruction *I) { … }

InstructionCost ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
                                           VectorType *Tp, ArrayRef<int> Mask,
                                           TTI::TargetCostKind CostKind,
                                           int Index, VectorType *SubTp,
                                           ArrayRef<const Value *> Args,
                                           const Instruction *CxtI) { … }

InstructionCost ARMTTIImpl::getArithmeticInstrCost(
    unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
    TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
    ArrayRef<const Value *> Args,
    const Instruction *CxtI) { … }

InstructionCost ARMTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
                                            MaybeAlign Alignment,
                                            unsigned AddressSpace,
                                            TTI::TargetCostKind CostKind,
                                            TTI::OperandValueInfo OpInfo,
                                            const Instruction *I) { … }

InstructionCost
ARMTTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment,
                                  unsigned AddressSpace,
                                  TTI::TargetCostKind CostKind) { … }

InstructionCost ARMTTIImpl::getInterleavedMemoryOpCost(
    unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
    Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
    bool UseMaskForCond, bool UseMaskForGaps) { … }

InstructionCost ARMTTIImpl::getGatherScatterOpCost(
    unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
    Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) { … }

InstructionCost
ARMTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
                                       std::optional<FastMathFlags> FMF,
                                       TTI::TargetCostKind CostKind) { … }

InstructionCost ARMTTIImpl::getExtendedReductionCost(
    unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *ValTy,
    FastMathFlags FMF, TTI::TargetCostKind CostKind) { … }

InstructionCost
ARMTTIImpl::getMulAccReductionCost(bool IsUnsigned, Type *ResTy,
                                   VectorType *ValTy,
                                   TTI::TargetCostKind CostKind) { … }

InstructionCost
ARMTTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
                                   FastMathFlags FMF,
                                   TTI::TargetCostKind CostKind) { … }

InstructionCost
ARMTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
                                  TTI::TargetCostKind CostKind) { … }

bool ARMTTIImpl::isLoweredToCall(const Function *F) { … }

bool ARMTTIImpl::maybeLoweredToCall(Instruction &I) { … }

bool ARMTTIImpl::isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
                                          AssumptionCache &AC,
                                          TargetLibraryInfo *LibInfo,
                                          HardwareLoopInfo &HWLoopInfo) { … }

static bool canTailPredicateInstruction(Instruction &I, int &ICmpCount) { … }

// To set up a tail-predicated loop, we need to know the total number of
// elements processed by that loop. Thus, we need to determine the element
// size and:
// 1) it should be uniform for all operations in the vector loop, so we
//    e.g. don't want any widening/narrowing operations.
// 2) it should be smaller than i64s because we don't have vector operations
//    that work on i64s.
// 3) we don't want elements to be reversed or shuffled, to make sure the
//    tail-predication masks/predicates the right lanes.
//
static bool canTailPredicateLoop(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
                                 const DataLayout &DL,
                                 const LoopAccessInfo *LAI) { … }

bool ARMTTIImpl::preferPredicateOverEpilogue(TailFoldingInfo *TFI) { … }

TailFoldingStyle
ARMTTIImpl::getPreferredTailFoldingStyle(bool IVUpdateMayOverflow) const { … }
void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                         TTI::UnrollingPreferences &UP,
                                         OptimizationRemarkEmitter *ORE) { … }

void ARMTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                                       TTI::PeelingPreferences &PP) { … }

bool ARMTTIImpl::preferInLoopReduction(unsigned Opcode, Type *Ty,
                                       TTI::ReductionFlags Flags) const { … }

bool ARMTTIImpl::preferPredicatedReductionSelect(
    unsigned Opcode, Type *Ty, TTI::ReductionFlags Flags) const { … }

InstructionCost ARMTTIImpl::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
                                                 StackOffset BaseOffset,
                                                 bool HasBaseReg, int64_t Scale,
                                                 unsigned AddrSpace) const { … }

bool ARMTTIImpl::hasArmWideBranch(bool Thumb) const { … }
llvm/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp