llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp

//===- AMDGPUTargetTransformInfo.cpp - AMDGPU specific TTI pass -----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// \file
// This file implements a TargetTransformInfo analysis pass specific to the
// AMDGPU target machine. It uses the target's detailed information to provide
// more precise answers to certain TTI queries, while letting the target
// independent and default TTI implementations handle the rest.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUTargetTransformInfo.h"
#include "AMDGPUTargetMachine.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIModeRegisterDefaults.h"
#include "llvm/Analysis/InlineCost.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/Analysis.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Support/KnownBits.h"
#include <optional>

using namespace llvm;

#define DEBUG_TYPE "AMDGPUtti"

static cl::opt<unsigned> UnrollThresholdPrivate(
  "amdgpu-unroll-threshold-private",
  cl::desc("Unroll threshold for AMDGPU if private memory used in a loop"),
  cl::init(2700), cl::Hidden);

static cl::opt<unsigned> UnrollThresholdLocal(
  "amdgpu-unroll-threshold-local",
  cl::desc("Unroll threshold for AMDGPU if local memory used in a loop"),
  cl::init(1000), cl::Hidden);

static cl::opt<unsigned> UnrollThresholdIf(
  "amdgpu-unroll-threshold-if",
  cl::desc("Unroll threshold increment for AMDGPU for each if statement inside loop"),
  cl::init(200), cl::Hidden);

static cl::opt<bool> UnrollRuntimeLocal(
  "amdgpu-unroll-runtime-local",
  cl::desc("Allow runtime unroll for AMDGPU if local memory used in a loop"),
  cl::init(true), cl::Hidden);

static cl::opt<unsigned> UnrollMaxBlockToAnalyze(
    "amdgpu-unroll-max-block-to-analyze",
    cl::desc("Inner loop block size threshold to analyze in unroll for AMDGPU"),
    cl::init(32), cl::Hidden);
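
// Illustrative sketch (an assumption about how the options above interact,
// not the elided upstream logic): loops touching private (scratch) or local
// (LDS) memory get a much larger unroll budget, and each if-statement inside
// the loop bumps it further.
[[maybe_unused]] static void
exampleApplyUnrollOptions(TargetTransformInfo::UnrollingPreferences &UP,
                          bool LoopUsesPrivateMemory, bool LoopUsesLocalMemory,
                          unsigned NumIfsInLoop) {
  // Fully unrolling loops over scratch often lets allocas be promoted to
  // registers, so the threshold is raised aggressively.
  if (LoopUsesPrivateMemory && UP.Threshold < UnrollThresholdPrivate)
    UP.Threshold = UnrollThresholdPrivate;

  // LDS users get a smaller boost and may also allow runtime unrolling.
  if (LoopUsesLocalMemory) {
    if (UP.Threshold < UnrollThresholdLocal)
      UP.Threshold = UnrollThresholdLocal;
    if (UnrollRuntimeLocal)
      UP.Runtime = true;
  }

  // Control flow inside the loop makes unrolling more attractive.
  UP.Threshold += NumIfsInLoop * UnrollThresholdIf;
}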

static cl::opt<unsigned> ArgAllocaCost("amdgpu-inline-arg-alloca-cost",
                                       cl::Hidden, cl::init(4000),
                                       cl::desc("Cost of alloca argument"));

// If the amount of scratch memory to eliminate exceeds our ability to allocate
// it into registers we gain nothing by aggressively inlining functions for that
// heuristic.
static cl::opt<unsigned>
    ArgAllocaCutoff("amdgpu-inline-arg-alloca-cutoff", cl::Hidden,
                    cl::init(256),
                    cl::desc("Maximum alloca size to use for inline cost"));

// Inliner constraint to achieve reasonable compilation time.
static cl::opt<size_t> InlineMaxBB(
    "amdgpu-inline-max-bb", cl::Hidden, cl::init(1100),
    cl::desc("Maximum number of BBs allowed in a function after inlining"
             " (compile time constraint)"));

// Conservatively check whether \p Cond depends, up to a small recursion depth,
// on a PHI node defined inside loop \p L; such conditions make the loop a more
// attractive unrolling candidate (see UnrollThresholdIf above).
static bool dependsOnLocalPhi(const Loop *L, const Value *Cond,
                              unsigned Depth = 0) {}

AMDGPUTTIImpl::AMDGPUTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
    :{}

void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                            TTI::UnrollingPreferences &UP,
                                            OptimizationRemarkEmitter *ORE) {}

void AMDGPUTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                                          TTI::PeelingPreferences &PP) {}

int64_t AMDGPUTTIImpl::getMaxMemIntrinsicInlineSizeThreshold() const {}

const FeatureBitset GCNTTIImpl::InlineFeatureIgnoreList =;

GCNTTIImpl::GCNTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
    :{}

bool GCNTTIImpl::hasBranchDivergence(const Function *F) const {}

unsigned GCNTTIImpl::getNumberOfRegisters(unsigned RCID) const {}

TypeSize
GCNTTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {}

unsigned GCNTTIImpl::getMinVectorRegisterBitWidth() const {}

unsigned GCNTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {}

unsigned GCNTTIImpl::getLoadVectorFactor(unsigned VF, unsigned LoadSize,
                                         unsigned ChainSizeInBytes,
                                         VectorType *VecTy) const {}

unsigned GCNTTIImpl::getStoreVectorFactor(unsigned VF, unsigned StoreSize,
                                          unsigned ChainSizeInBytes,
                                          VectorType *VecTy) const {}

unsigned GCNTTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const {}

bool GCNTTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,
                                            Align Alignment,
                                            unsigned AddrSpace) const {}

bool GCNTTIImpl::isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
                                             Align Alignment,
                                             unsigned AddrSpace) const {}

bool GCNTTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
                                              Align Alignment,
                                              unsigned AddrSpace) const {}

int64_t GCNTTIImpl::getMaxMemIntrinsicInlineSizeThreshold() const {}

// FIXME: Really we would like to issue multiple 128-bit loads and stores per
// iteration. Should we report a larger size and let it legalize?
//
// FIXME: Should we use narrower types for local/region, or account for when
// unaligned access is legal?
//
// FIXME: This could use fine tuning and microbenchmarks.
Type *GCNTTIImpl::getMemcpyLoopLoweringType(
    LLVMContext &Context, Value *Length, unsigned SrcAddrSpace,
    unsigned DestAddrSpace, Align SrcAlign, Align DestAlign,
    std::optional<uint32_t> AtomicElementSize) const {}
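
// Illustrative sketch (assumptions only; the chosen width and alignment
// checks here are hypothetical): the flavor of decision the hook above makes,
// preferring a wide dword4-sized chunk when both sides are dword aligned and
// falling back to bytes otherwise.
[[maybe_unused]] static Type *
exampleMemcpyChunkType(LLVMContext &Context, Align SrcAlign, Align DestAlign) {
  // A 16-byte <4 x i32> chunk maps naturally onto dwordx4 global accesses.
  if (SrcAlign.value() >= 4 && DestAlign.value() >= 4)
    return FixedVectorType::get(Type::getInt32Ty(Context), 4);
  return Type::getInt8Ty(Context);
}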

void GCNTTIImpl::getMemcpyLoopResidualLoweringType(
    SmallVectorImpl<Type *> &OpsOut, LLVMContext &Context,
    unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace,
    Align SrcAlign, Align DestAlign,
    std::optional<uint32_t> AtomicCpySize) const {}

unsigned GCNTTIImpl::getMaxInterleaveFactor(ElementCount VF) {}

bool GCNTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
                                    MemIntrinsicInfo &Info) const {}

InstructionCost GCNTTIImpl::getArithmeticInstrCost(
    unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
    TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
    ArrayRef<const Value *> Args,
    const Instruction *CxtI) {}

// Return true if there's a potential benefit from using v2f16/v2i16
// instructions for an intrinsic, even if it requires nontrivial legalization.
static bool intrinsicHasPackedVectorBenefit(Intrinsic::ID ID) {}
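
// Illustrative sketch (a hypothetical subset, not the elided upstream list):
// the FMA family is the canonical case where f16/i16 operands can profitably
// map onto packed v2f16/v2i16 instructions on targets with packed math.
[[maybe_unused]] static bool examplePackedVectorCandidate(Intrinsic::ID ID) {
  switch (ID) {
  case Intrinsic::fma:     // e.g. v_pk_fma_f16
  case Intrinsic::fmuladd: // usually lowered to the same packed FMA
    return true;
  default:
    return false;
  }
}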

InstructionCost
GCNTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
                                  TTI::TargetCostKind CostKind) {}

InstructionCost GCNTTIImpl::getCFInstrCost(unsigned Opcode,
                                           TTI::TargetCostKind CostKind,
                                           const Instruction *I) {}

InstructionCost
GCNTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
                                       std::optional<FastMathFlags> FMF,
                                       TTI::TargetCostKind CostKind) {}

InstructionCost
GCNTTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
                                   FastMathFlags FMF,
                                   TTI::TargetCostKind CostKind) {}

InstructionCost GCNTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
                                               TTI::TargetCostKind CostKind,
                                               unsigned Index, Value *Op0,
                                               Value *Op1) {}

/// Analyze if the results of inline asm are divergent. If \p Indices is empty,
/// this is analyzing the collective result of all output registers. Otherwise,
/// this is only querying a specific result index if this returns multiple
/// registers in a struct.
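///
/// Usage illustration (hypothetical caller, not code from this file): for an
/// asm call returning a two-element struct, Indices = {1} asks only about the
/// second output,
///
///   SmallVector<unsigned, 1> Indices = {1};
///   bool Divergent = TTIImpl.isInlineAsmSourceOfDivergence(AsmCall, Indices);
///
/// while an empty Indices list queries the collective result.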
bool GCNTTIImpl::isInlineAsmSourceOfDivergence(
  const CallInst *CI, ArrayRef<unsigned> Indices) const {}

bool GCNTTIImpl::isReadRegisterSourceOfDivergence(
    const IntrinsicInst *ReadReg) const {}

/// \returns true if the result of the value could potentially be
/// different across workitems in a wavefront.
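///
/// For intuition (examples, not an exhaustive list): values derived from the
/// per-lane workitem id are divergent, while workgroup-scope values are
/// uniform across a wavefront, e.g. in IR:
///
///   %tid = call i32 @llvm.amdgcn.workitem.id.x()   ; divergent
///   %wg  = call i32 @llvm.amdgcn.workgroup.id.x()  ; uniform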
bool GCNTTIImpl::isSourceOfDivergence(const Value *V) const {}

bool GCNTTIImpl::isAlwaysUniform(const Value *V) const {}

bool GCNTTIImpl::collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
                                            Intrinsic::ID IID) const {}

Value *GCNTTIImpl::rewriteIntrinsicWithAddressSpace(IntrinsicInst *II,
                                                    Value *OldV,
                                                    Value *NewV) const {}

InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
                                           VectorType *VT, ArrayRef<int> Mask,
                                           TTI::TargetCostKind CostKind,
                                           int Index, VectorType *SubTp,
                                           ArrayRef<const Value *> Args,
                                           const Instruction *CxtI) {}

bool GCNTTIImpl::areInlineCompatible(const Function *Caller,
                                     const Function *Callee) const {}

static unsigned adjustInliningThresholdUsingCallee(const CallBase *CB,
                                                   const SITargetLowering *TLI,
                                                   const GCNTTIImpl *TTIImpl) {}

static unsigned getCallArgsTotalAllocaSize(const CallBase *CB,
                                           const DataLayout &DL) {}

unsigned GCNTTIImpl::adjustInliningThreshold(const CallBase *CB) const {}

unsigned GCNTTIImpl::getCallerAllocaCost(const CallBase *CB,
                                         const AllocaInst *AI) const {}

void GCNTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                         TTI::UnrollingPreferences &UP,
                                         OptimizationRemarkEmitter *ORE) {}

void GCNTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                                       TTI::PeelingPreferences &PP) {}

int GCNTTIImpl::get64BitInstrCost(TTI::TargetCostKind CostKind) const {}

std::pair<InstructionCost, MVT>
GCNTTIImpl::getTypeLegalizationCost(Type *Ty) const {}

unsigned GCNTTIImpl::getPrefetchDistance() const {}

bool GCNTTIImpl::shouldPrefetchAddressSpace(unsigned AS) const {}