llvm/lib/Target/AMDGPU/SIISelLowering.cpp

//===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Custom DAG lowering for SI
//
//===----------------------------------------------------------------------===//

#include "SIISelLowering.h"
#include "AMDGPU.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPUTargetMachine.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/FloatingPointMode.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/UniformityAnalysis.h"
#include "llvm/BinaryFormat/ELF.h"
#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/ByteProvider.h"
#include "llvm/CodeGen/FunctionLoweringInfo.h"
#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/ModRef.h"
#include "llvm/Transforms/Utils/LowerAtomic.h"
#include <optional>

using namespace llvm;

#define DEBUG_TYPE "si-lower"

STATISTIC(NumTailCalls, "Number of tail calls");

static cl::opt<bool> DisableLoopAlignment(
  "amdgpu-disable-loop-alignment",
  cl::desc("Do not align and prefetch loops"),
  cl::init(false));

static cl::opt<bool> UseDivergentRegisterIndexing(
  "amdgpu-use-divergent-register-indexing",
  cl::Hidden,
  cl::desc("Use indirect register addressing for divergent indexes"),
  cl::init(false));

static bool denormalModeIsFlushAllF32(const MachineFunction &MF) {}

static bool denormalModeIsFlushAllF64F16(const MachineFunction &MF) {}

static unsigned findFirstFreeSGPR(CCState &CCInfo) {}

SITargetLowering::SITargetLowering(const TargetMachine &TM,
                                   const GCNSubtarget &STI)
    :{}

const GCNSubtarget *SITargetLowering::getSubtarget() const {}

ArrayRef<MCPhysReg> SITargetLowering::getRoundingControlRegisters() const {}

//===----------------------------------------------------------------------===//
// TargetLowering queries
//===----------------------------------------------------------------------===//

// v_mad_mix* support a conversion from f16 to f32.
//
// There is only one special case, when denormals are enabled, that we don't
// currently handle where this is OK to use.
bool SITargetLowering::isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode,
                                       EVT DestVT, EVT SrcVT) const {}

bool SITargetLowering::isFPExtFoldable(const MachineInstr &MI, unsigned Opcode,
                                       LLT DestTy, LLT SrcTy) const {}

bool SITargetLowering::isShuffleMaskLegal(ArrayRef<int>, EVT) const {}

MVT SITargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
                                                    CallingConv::ID CC,
                                                    EVT VT) const {}

unsigned SITargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
                                                         CallingConv::ID CC,
                                                         EVT VT) const {}

unsigned SITargetLowering::getVectorTypeBreakdownForCallingConv(
  LLVMContext &Context, CallingConv::ID CC,
  EVT VT, EVT &IntermediateVT,
  unsigned &NumIntermediates, MVT &RegisterVT) const {}

static EVT memVTFromLoadIntrData(const SITargetLowering &TLI,
                                 const DataLayout &DL, Type *Ty,
                                 unsigned MaxNumLanes) {}

// Peek through TFE struct returns to only use the data size.
static EVT memVTFromLoadIntrReturn(const SITargetLowering &TLI,
                                   const DataLayout &DL, Type *Ty,
                                   unsigned MaxNumLanes) {}

/// Map address space 7 to MVT::v5i32 because that's its in-memory
/// representation. This return value is vector-typed because there is no
/// MVT::i160 and it is not clear if one can be added. While this could
/// cause issues during codegen, these address space 7 pointers will be
/// rewritten away by then. Therefore, we can return MVT::v5i32 in order
/// to allow pre-codegen passes that query TargetTransformInfo, often for cost
/// modeling, to work.
MVT SITargetLowering::getPointerTy(const DataLayout &DL, unsigned AS) const {}
/// Similarly, the in-memory representation of a p7 is {p8, i32}, aka
/// v8i32 when padding is added.
/// The in-memory representation of a p9 is {p8, i32, i32}, which is
/// also v8i32 with padding.
MVT SITargetLowering::getPointerMemTy(const DataLayout &DL, unsigned AS) const {}
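
// Illustrative sketch (not part of the in-tree code): how a pre-codegen pass
// would observe the p7 mapping described above. AMDGPUAS::BUFFER_FAT_POINTER
// is assumed to be the enumerator for address space 7.
static void queryBufferFatPointerTypesSketch(const SITargetLowering &TLI,
                                             const DataLayout &DL) {
  MVT RegVT = TLI.getPointerTy(DL, AMDGPUAS::BUFFER_FAT_POINTER);    // v5i32
  MVT MemVT = TLI.getPointerMemTy(DL, AMDGPUAS::BUFFER_FAT_POINTER); // v8i32
  (void)RegVT;
  (void)MemVT;
}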

bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
                                          const CallInst &CI,
                                          MachineFunction &MF,
                                          unsigned IntrID) const {}

void SITargetLowering::CollectTargetIntrinsicOperands(
    const CallInst &I, SmallVectorImpl<SDValue> &Ops, SelectionDAG &DAG) const {}

bool SITargetLowering::getAddrModeArguments(IntrinsicInst *II,
                                            SmallVectorImpl<Value*> &Ops,
                                            Type *&AccessTy) const {}

bool SITargetLowering::isLegalFlatAddressingMode(const AddrMode &AM,
                                                 unsigned AddrSpace) const {}

bool SITargetLowering::isLegalGlobalAddressingMode(const AddrMode &AM) const {}

bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {}

bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
                                             const AddrMode &AM, Type *Ty,
                                             unsigned AS, Instruction *I) const {}

bool SITargetLowering::canMergeStoresTo(unsigned AS, EVT MemVT,
                                        const MachineFunction &MF) const {}

bool SITargetLowering::allowsMisalignedMemoryAccessesImpl(
    unsigned Size, unsigned AddrSpace, Align Alignment,
    MachineMemOperand::Flags Flags, unsigned *IsFast) const {}

bool SITargetLowering::allowsMisalignedMemoryAccesses(
    EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
    unsigned *IsFast) const {}

EVT SITargetLowering::getOptimalMemOpType(
    const MemOp &Op, const AttributeList &FuncAttributes) const {}

bool SITargetLowering::isMemOpHasNoClobberedMemOperand(const SDNode *N) const {}

bool SITargetLowering::isNonGlobalAddrSpace(unsigned AS) {}

bool SITargetLowering::isFreeAddrSpaceCast(unsigned SrcAS,
                                           unsigned DestAS) const {}

TargetLoweringBase::LegalizeTypeAction
SITargetLowering::getPreferredVectorAction(MVT VT) const {}

bool SITargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
                                                         Type *Ty) const {}

bool SITargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
                                               unsigned Index) const {}

bool SITargetLowering::isTypeDesirableForOp(unsigned Op, EVT VT) const {}

SDValue SITargetLowering::lowerKernArgParameterPtr(SelectionDAG &DAG,
                                                   const SDLoc &SL,
                                                   SDValue Chain,
                                                   uint64_t Offset) const {}

SDValue SITargetLowering::getImplicitArgPtr(SelectionDAG &DAG,
                                            const SDLoc &SL) const {}

SDValue SITargetLowering::getLDSKernelId(SelectionDAG &DAG,
                                         const SDLoc &SL) const {}

SDValue SITargetLowering::convertArgType(SelectionDAG &DAG, EVT VT, EVT MemVT,
                                         const SDLoc &SL, SDValue Val,
                                         bool Signed,
                                         const ISD::InputArg *Arg) const {}

SDValue SITargetLowering::lowerKernargMemParameter(
    SelectionDAG &DAG, EVT VT, EVT MemVT, const SDLoc &SL, SDValue Chain,
    uint64_t Offset, Align Alignment, bool Signed,
    const ISD::InputArg *Arg) const {}

SDValue SITargetLowering::lowerStackParameter(SelectionDAG &DAG, CCValAssign &VA,
                                              const SDLoc &SL, SDValue Chain,
                                              const ISD::InputArg &Arg) const {}

SDValue SITargetLowering::getPreloadedValue(SelectionDAG &DAG,
  const SIMachineFunctionInfo &MFI,
  EVT VT,
  AMDGPUFunctionArgInfo::PreloadedValue PVID) const {}

static void processPSInputArgs(SmallVectorImpl<ISD::InputArg> &Splits,
                               CallingConv::ID CallConv,
                               ArrayRef<ISD::InputArg> Ins, BitVector &Skipped,
                               FunctionType *FType,
                               SIMachineFunctionInfo *Info) {}

// Allocate special inputs passed in VGPRs.
void SITargetLowering::allocateSpecialEntryInputVGPRs(CCState &CCInfo,
                                                      MachineFunction &MF,
                                                      const SIRegisterInfo &TRI,
                                                      SIMachineFunctionInfo &Info) const {}

// Try to allocate a VGPR at the end of the argument list, or if no argument
// VGPRs are left allocating a stack slot.
// If \p Mask is given it indicates bitfield position in the register.
// If \p Arg is given use it with the new \p Mask instead of allocating a new one.
static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask = ~0u,
                                         ArgDescriptor Arg = ArgDescriptor()) {}

static ArgDescriptor allocateSGPR32InputImpl(CCState &CCInfo,
                                             const TargetRegisterClass *RC,
                                             unsigned NumArgRegs) {}

// If this has a fixed position, we still should allocate the register in the
// CCInfo state. Technically we could get away with this for values passed
// outside of the normal argument range.
static void allocateFixedSGPRInputImpl(CCState &CCInfo,
                                       const TargetRegisterClass *RC,
                                       MCRegister Reg) {}

static void allocateSGPR32Input(CCState &CCInfo, ArgDescriptor &Arg) {}

static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg) {}

/// Allocate implicit function VGPR arguments at the end of allocated user
/// arguments.
void SITargetLowering::allocateSpecialInputVGPRs(
  CCState &CCInfo, MachineFunction &MF,
  const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const {}

/// Allocate implicit function VGPR arguments in fixed registers.
void SITargetLowering::allocateSpecialInputVGPRsFixed(
  CCState &CCInfo, MachineFunction &MF,
  const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const {}

void SITargetLowering::allocateSpecialInputSGPRs(
  CCState &CCInfo,
  MachineFunction &MF,
  const SIRegisterInfo &TRI,
  SIMachineFunctionInfo &Info) const {}

// Allocate special inputs passed in user SGPRs.
void SITargetLowering::allocateHSAUserSGPRs(CCState &CCInfo,
                                            MachineFunction &MF,
                                            const SIRegisterInfo &TRI,
                                            SIMachineFunctionInfo &Info) const {}

// Allocate pre-loaded kernel arguments. Arguments to be preloaded must be
// sequential starting from the first argument.
void SITargetLowering::allocatePreloadKernArgSGPRs(
    CCState &CCInfo, SmallVectorImpl<CCValAssign> &ArgLocs,
    const SmallVectorImpl<ISD::InputArg> &Ins, MachineFunction &MF,
    const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const {}

void SITargetLowering::allocateLDSKernelId(CCState &CCInfo, MachineFunction &MF,
                                           const SIRegisterInfo &TRI,
                                           SIMachineFunctionInfo &Info) const {}

// Allocate special input registers that are initialized per-wave.
void SITargetLowering::allocateSystemSGPRs(CCState &CCInfo,
                                           MachineFunction &MF,
                                           SIMachineFunctionInfo &Info,
                                           CallingConv::ID CallConv,
                                           bool IsShader) const {}

static void reservePrivateMemoryRegs(const TargetMachine &TM,
                                     MachineFunction &MF,
                                     const SIRegisterInfo &TRI,
                                     SIMachineFunctionInfo &Info) {}

bool SITargetLowering::supportSplitCSR(MachineFunction *MF) const {}

void SITargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {}

void SITargetLowering::insertCopiesSplitCSR(
  MachineBasicBlock *Entry,
  const SmallVectorImpl<MachineBasicBlock *> &Exits) const {}

SDValue SITargetLowering::LowerFormalArguments(
    SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {}

// TODO: If return values can't fit in registers, we should return as many as
// possible in registers before passing on stack.
bool SITargetLowering::CanLowerReturn(
  CallingConv::ID CallConv,
  MachineFunction &MF, bool IsVarArg,
  const SmallVectorImpl<ISD::OutputArg> &Outs,
  LLVMContext &Context) const {}

SDValue
SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
                              bool isVarArg,
                              const SmallVectorImpl<ISD::OutputArg> &Outs,
                              const SmallVectorImpl<SDValue> &OutVals,
                              const SDLoc &DL, SelectionDAG &DAG) const {}

SDValue SITargetLowering::LowerCallResult(
    SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool IsVarArg,
    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool IsThisReturn,
    SDValue ThisVal) const {}

// Add code to pass special inputs required depending on used features separate
// from the explicit user arguments present in the IR.
void SITargetLowering::passSpecialInputs(
    CallLoweringInfo &CLI,
    CCState &CCInfo,
    const SIMachineFunctionInfo &Info,
    SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass,
    SmallVectorImpl<SDValue> &MemOpChains,
    SDValue Chain) const {}

static bool canGuaranteeTCO(CallingConv::ID CC) {}

/// Return true if we might ever do TCO for calls with this calling convention.
static bool mayTailCallThisCC(CallingConv::ID CC) {}

bool SITargetLowering::isEligibleForTailCallOptimization(
    SDValue Callee, CallingConv::ID CalleeCC, bool IsVarArg,
    const SmallVectorImpl<ISD::OutputArg> &Outs,
    const SmallVectorImpl<SDValue> &OutVals,
    const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {}

bool SITargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {}

// The wave scratch offset register is used as the global base pointer.
SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
                                    SmallVectorImpl<SDValue> &InVals) const {}

// This is identical to the default implementation in ExpandDYNAMIC_STACKALLOC,
// except for applying the wave size scale to the increment amount.
SDValue SITargetLowering::lowerDYNAMIC_STACKALLOCImpl(
    SDValue Op, SelectionDAG &DAG) const {}
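
// Minimal scalar sketch of the wave-size scaling mentioned above (the WaveSize
// parameter is an assumption of this sketch): the increment is scaled because
// the stack pointer tracks per-wave scratch rather than a single lane's
// allocation.
static uint64_t scaledAllocaIncrementSketch(uint64_t PerLaneBytes,
                                            unsigned WaveSize) {
  return PerLaneBytes * WaveSize;
}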

SDValue SITargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
                                                  SelectionDAG &DAG) const {}

SDValue SITargetLowering::LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const {}

SDValue SITargetLowering::lowerGET_ROUNDING(SDValue Op,
                                            SelectionDAG &DAG) const {}

SDValue SITargetLowering::lowerSET_ROUNDING(SDValue Op,
                                            SelectionDAG &DAG) const {}

SDValue SITargetLowering::lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const {}

// Work around DAG legality rules only based on the result type.
SDValue SITargetLowering::lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {}

SDValue SITargetLowering::lowerGET_FPENV(SDValue Op, SelectionDAG &DAG) const {}

SDValue SITargetLowering::lowerSET_FPENV(SDValue Op, SelectionDAG &DAG) const {}

Register SITargetLowering::getRegisterByName(const char* RegName, LLT VT,
                                             const MachineFunction &MF) const {}

// If kill is not the last instruction, split the block so kill is always a
// proper terminator.
MachineBasicBlock *
SITargetLowering::splitKillBlock(MachineInstr &MI,
                                 MachineBasicBlock *BB) const {}

// Split block \p MBB at \p MI, so as to insert a loop. If \p InstInLoop is true,
// \p MI will be the only instruction in the loop body block. Otherwise, it will
// be the first instruction in the remainder block.
//
/// \returns { LoopBody, Remainder }
static std::pair<MachineBasicBlock *, MachineBasicBlock *>
splitBlockForLoop(MachineInstr &MI, MachineBasicBlock &MBB, bool InstInLoop) {}

/// Insert \p MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
void SITargetLowering::bundleInstWithWaitcnt(MachineInstr &MI) const {}

MachineBasicBlock *
SITargetLowering::emitGWSMemViolTestLoop(MachineInstr &MI,
                                         MachineBasicBlock *BB) const {}

// Do a v_movrels_b32 or v_movreld_b32 for each unique value of \p IdxReg in the
// wavefront. If the value is uniform and just happens to be in a VGPR, this
// will only do one iteration. In the worst case, this will loop 64 times.
//
// TODO: Just use v_readlane_b32 if we know the VGPR has a uniform value.
static MachineBasicBlock::iterator
emitLoadM0FromVGPRLoop(const SIInstrInfo *TII, MachineRegisterInfo &MRI,
                       MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB,
                       const DebugLoc &DL, const MachineOperand &Idx,
                       unsigned InitReg, unsigned ResultReg, unsigned PhiReg,
                       unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode,
                       Register &SGPRIdxReg) {}
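
// Host-side sketch of the waterfall idea described above: repeatedly take the
// index value of the first still-active lane, process every lane holding that
// same value, and loop until no lanes remain. With a uniform index this runs
// exactly one iteration; with 64 distinct values it runs 64 times. The lane
// count and the ProcessLane callback are illustrative assumptions.
static void waterfallLoopSketch(const int Idx[64], bool Active[64],
                                void (*ProcessLane)(unsigned Lane)) {
  for (;;) {
    int Chosen = 0;
    bool AnyActive = false;
    for (unsigned L = 0; L != 64; ++L) {
      if (Active[L]) {
        Chosen = Idx[L];
        AnyActive = true;
        break;
      }
    }
    if (!AnyActive)
      break;
    for (unsigned L = 0; L != 64; ++L) {
      if (Active[L] && Idx[L] == Chosen) {
        ProcessLane(L);
        Active[L] = false;
      }
    }
  }
}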

// This has slightly sub-optimal regalloc when the source vector is killed by
// the read. The register allocator does not understand that the kill is
// per-workitem, so the vector is kept alive for the whole loop, and we end up
// not re-using a subregister from it, using 1 more VGPR than necessary. This
// VGPR was saved when this was expanded after register allocation.
static MachineBasicBlock::iterator
loadM0FromVGPR(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineInstr &MI,
               unsigned InitResultReg, unsigned PhiReg, int Offset,
               bool UseGPRIdxMode, Register &SGPRIdxReg) {}

// Returns subreg index, offset
static std::pair<unsigned, int>
computeIndirectRegAndOffset(const SIRegisterInfo &TRI,
                            const TargetRegisterClass *SuperRC,
                            unsigned VecReg,
                            int Offset) {}

static void setM0ToIndexFromSGPR(const SIInstrInfo *TII,
                                 MachineRegisterInfo &MRI, MachineInstr &MI,
                                 int Offset) {}

static Register getIndirectSGPRIdx(const SIInstrInfo *TII,
                                   MachineRegisterInfo &MRI, MachineInstr &MI,
                                   int Offset) {}

static MachineBasicBlock *emitIndirectSrc(MachineInstr &MI,
                                          MachineBasicBlock &MBB,
                                          const GCNSubtarget &ST) {}

static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
                                          MachineBasicBlock &MBB,
                                          const GCNSubtarget &ST) {}

static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
                                          MachineBasicBlock &BB,
                                          const GCNSubtarget &ST,
                                          unsigned Opc) {}

MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
  MachineInstr &MI, MachineBasicBlock *BB) const {}

bool SITargetLowering::enableAggressiveFMAFusion(EVT VT) const {}

bool SITargetLowering::enableAggressiveFMAFusion(LLT Ty) const {}

EVT SITargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &Ctx,
                                         EVT VT) const {}

MVT SITargetLowering::getScalarShiftAmountTy(const DataLayout &, EVT VT) const {}

LLT SITargetLowering::getPreferredShiftAmountTy(LLT Ty) const {}

// Answering this is somewhat tricky and depends on the specific device, since
// devices have different rates for fma and for f64 operations in general.
//
// v_fma_f64 and v_mul_f64 always take the same number of cycles as each other
// regardless of which device (although the number of cycles differs between
// devices), so it is always profitable for f64.
//
// v_fma_f32 takes 4 or 16 cycles depending on the device, so it is profitable
// only on full rate devices. Normally, we should prefer selecting v_mad_f32
// which we can always do even without fused FP ops since it returns the same
// result as the separate operations and since it is always full
// rate. Therefore, we lie and report that it is not faster for f32. v_mad_f32
// however does not support denormals, so we do report fma as faster if we have
// a fast fma device and require denormals.
//
bool SITargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
                                                  EVT VT) const {}
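
// Condensed sketch of the policy described above (hypothetical flags, not the
// in-tree implementation): f64 always profits, while f32 profits only on
// fast-FMA devices when denormals are required (so v_mad_f32 is unusable).
static bool fmaFasterThanMulAddSketch(unsigned BitWidth, bool HasFastFMAF32,
                                      bool F32DenormalsRequired) {
  if (BitWidth == 64)
    return true; // v_fma_f64 costs the same as v_mul_f64/v_add_f64.
  if (BitWidth == 32)
    return HasFastFMAF32 && F32DenormalsRequired;
  return false; // Other widths are outside the scope of this sketch.
}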

bool SITargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
                                                  LLT Ty) const {}

bool SITargetLowering::isFMADLegal(const MachineInstr &MI, LLT Ty) const {}

bool SITargetLowering::isFMADLegal(const SelectionDAG &DAG,
                                   const SDNode *N) const {}

//===----------------------------------------------------------------------===//
// Custom DAG Lowering Operations
//===----------------------------------------------------------------------===//

// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
// wider vector type is legal.
SDValue SITargetLowering::splitUnaryVectorOp(SDValue Op,
                                             SelectionDAG &DAG) const {}

// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
// wider vector type is legal.
SDValue SITargetLowering::splitBinaryVectorOp(SDValue Op,
                                              SelectionDAG &DAG) const {}

SDValue SITargetLowering::splitTernaryVectorOp(SDValue Op,
                                              SelectionDAG &DAG) const {}


SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {}

// Used for D16: Casts the result of an instruction into the right vector,
// packs values if loads return unpacked values.
static SDValue adjustLoadValueTypeImpl(SDValue Result, EVT LoadVT,
                                       const SDLoc &DL,
                                       SelectionDAG &DAG, bool Unpacked) {}

SDValue SITargetLowering::adjustLoadValueType(unsigned Opcode,
                                              MemSDNode *M,
                                              SelectionDAG &DAG,
                                              ArrayRef<SDValue> Ops,
                                              bool IsIntrinsic) const {}

SDValue SITargetLowering::lowerIntrinsicLoad(MemSDNode *M, bool IsFormat,
                                             SelectionDAG &DAG,
                                             ArrayRef<SDValue> Ops) const {}

static SDValue lowerICMPIntrinsic(const SITargetLowering &TLI,
                                  SDNode *N, SelectionDAG &DAG) {}

static SDValue lowerFCMPIntrinsic(const SITargetLowering &TLI,
                                  SDNode *N, SelectionDAG &DAG) {}

static SDValue lowerBALLOTIntrinsic(const SITargetLowering &TLI, SDNode *N,
                                    SelectionDAG &DAG) {}

static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
                           SelectionDAG &DAG) {}

void SITargetLowering::ReplaceNodeResults(SDNode *N,
                                          SmallVectorImpl<SDValue> &Results,
                                          SelectionDAG &DAG) const {}

/// Helper function for LowerBRCOND
static SDNode *findUser(SDValue Value, unsigned Opcode) {}

unsigned SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {}

bool SITargetLowering::shouldEmitFixup(const GlobalValue *GV) const {}

bool SITargetLowering::shouldEmitGOTReloc(const GlobalValue *GV) const {}

bool SITargetLowering::shouldEmitPCReloc(const GlobalValue *GV) const {}

bool SITargetLowering::shouldUseLDSConstAddress(const GlobalValue *GV) const {}

/// This transforms the control flow intrinsics to get the branch destination as
/// last parameter, and also switches the branch target with BR if the need arises.
SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
                                      SelectionDAG &DAG) const {}

SDValue SITargetLowering::LowerRETURNADDR(SDValue Op,
                                          SelectionDAG &DAG) const {}

SDValue SITargetLowering::getFPExtOrFPRound(SelectionDAG &DAG,
                                            SDValue Op,
                                            const SDLoc &DL,
                                            EVT VT) const {}

SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {}

SDValue SITargetLowering::lowerFMINNUM_FMAXNUM(SDValue Op,
                                               SelectionDAG &DAG) const {}

SDValue SITargetLowering::lowerFLDEXP(SDValue Op, SelectionDAG &DAG) const {}

// Custom lowering for vector multiplications and s_mul_u64.
SDValue SITargetLowering::lowerMUL(SDValue Op, SelectionDAG &DAG) const {}

SDValue SITargetLowering::lowerXMULO(SDValue Op, SelectionDAG &DAG) const {}

SDValue SITargetLowering::lowerXMUL_LOHI(SDValue Op, SelectionDAG &DAG) const {}

SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const {}

SDValue SITargetLowering::lowerTrapEndpgm(
    SDValue Op, SelectionDAG &DAG) const {}

SDValue SITargetLowering::loadImplicitKernelArgument(SelectionDAG &DAG, MVT VT,
    const SDLoc &DL, Align Alignment, ImplicitParameter Param) const {}

SDValue SITargetLowering::lowerTrapHsaQueuePtr(
    SDValue Op, SelectionDAG &DAG) const {}

SDValue SITargetLowering::lowerTrapHsa(
    SDValue Op, SelectionDAG &DAG) const {}

SDValue SITargetLowering::lowerDEBUGTRAP(SDValue Op, SelectionDAG &DAG) const {}

SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
                                             SelectionDAG &DAG) const {}

/// Return true if the value is a known valid address, such that a null check is
/// not necessary.
static bool isKnownNonNull(SDValue Val, SelectionDAG &DAG,
                           const AMDGPUTargetMachine &TM, unsigned AddrSpace) {}

SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
                                             SelectionDAG &DAG) const {}

// This lowers an INSERT_SUBVECTOR by extracting the individual elements from
// the small vector and inserting them into the big vector. That is better than
// the default expansion of doing it via a stack slot. Even though the use of
// the stack slot would be optimized away afterwards, the stack slot itself
// remains.
SDValue SITargetLowering::lowerINSERT_SUBVECTOR(SDValue Op,
                                                SelectionDAG &DAG) const {}

SDValue SITargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
                                                 SelectionDAG &DAG) const {}

SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
                                                  SelectionDAG &DAG) const {}

static bool elementPairIsContiguous(ArrayRef<int> Mask, int Elt) {}

SDValue SITargetLowering::lowerVECTOR_SHUFFLE(SDValue Op,
                                              SelectionDAG &DAG) const {}

SDValue SITargetLowering::lowerSCALAR_TO_VECTOR(SDValue Op,
                                                SelectionDAG &DAG) const {}

SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op,
                                            SelectionDAG &DAG) const {}

bool
SITargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {}

static SDValue
buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV,
                        const SDLoc &DL, int64_t Offset, EVT PtrVT,
                        unsigned GAFlags = SIInstrInfo::MO_NONE) {}

SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
                                             SDValue Op,
                                             SelectionDAG &DAG) const {}

SDValue SITargetLowering::copyToM0(SelectionDAG &DAG, SDValue Chain,
                                   const SDLoc &DL, SDValue V) const {}

SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG,
                                                 SDValue Op,
                                                 MVT VT,
                                                 unsigned Offset) const {}

static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL,
                                        EVT VT) {}

static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL,
                                         EVT VT) {}

static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL,
                                    ArrayRef<SDValue> Elts) {}

static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT,
                              SDValue Src, int ExtraElts) {}

// Re-construct the required return value for an image load intrinsic.
// This is more complicated due to the optional use of TexFailCtrl, which means
// the required return type is an aggregate.
static SDValue constructRetValue(SelectionDAG &DAG, MachineSDNode *Result,
                                 ArrayRef<EVT> ResultTypes, bool IsTexFail,
                                 bool Unpacked, bool IsD16, int DMaskPop,
                                 int NumVDataDwords, bool IsAtomicPacked16Bit,
                                 const SDLoc &DL) {}

static bool parseTexFail(SDValue TexFailCtrl, SelectionDAG &DAG, SDValue *TFE,
                         SDValue *LWE, bool &IsTexFail) {}

static void packImage16bitOpsToDwords(SelectionDAG &DAG, SDValue Op,
                                      MVT PackVectorVT,
                                      SmallVectorImpl<SDValue> &PackedAddrs,
                                      unsigned DimIdx, unsigned EndIdx,
                                      unsigned NumGradients) {}

SDValue SITargetLowering::lowerImage(SDValue Op,
                                     const AMDGPU::ImageDimIntrinsicInfo *Intr,
                                     SelectionDAG &DAG, bool WithChain) const {}

SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc,
                                       SDValue Offset, SDValue CachePolicy,
                                       SelectionDAG &DAG) const {}

SDValue SITargetLowering::lowerWaveID(SelectionDAG &DAG, SDValue Op) const {}

SDValue SITargetLowering::lowerWorkitemID(SelectionDAG &DAG, SDValue Op,
                                          unsigned Dim,
                                          const ArgDescriptor &Arg) const {}

SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
                                                  SelectionDAG &DAG) const {}

// On targets that do not support a constant in the soffset field, turn zero
// into SGPR_NULL to avoid generating an extra s_mov with zero.
static SDValue selectSOffset(SDValue SOffset, SelectionDAG &DAG,
                             const GCNSubtarget *Subtarget) {}

SDValue SITargetLowering::lowerRawBufferAtomicIntrin(SDValue Op,
                                                     SelectionDAG &DAG,
                                                     unsigned NewOpcode) const {}

SDValue
SITargetLowering::lowerStructBufferAtomicIntrin(SDValue Op, SelectionDAG &DAG,
                                                unsigned NewOpcode) const {}

SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
                                                 SelectionDAG &DAG) const {}

// Call DAG.getMemIntrinsicNode for a load, but first widen a dwordx3 type to
// dwordx4 if on SI and handle TFE loads.
SDValue SITargetLowering::getMemIntrinsicNode(unsigned Opcode, const SDLoc &DL,
                                              SDVTList VTList,
                                              ArrayRef<SDValue> Ops, EVT MemVT,
                                              MachineMemOperand *MMO,
                                              SelectionDAG &DAG) const {}

SDValue SITargetLowering::handleD16VData(SDValue VData, SelectionDAG &DAG,
                                         bool ImageStore) const {}

SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
                                              SelectionDAG &DAG) const {}

// The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
// offset (the offset that is included in bounds checking and swizzling, to be
// split between the instruction's voffset and immoffset fields) and soffset
// (the offset that is excluded from bounds checking and swizzling, to go in
// the instruction's soffset field).  This function takes the first kind of
// offset and figures out how to split it between voffset and immoffset.
std::pair<SDValue, SDValue> SITargetLowering::splitBufferOffsets(
    SDValue Offset, SelectionDAG &DAG) const {}
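
// Minimal sketch of the voffset/immoffset split described above, assuming a
// hypothetical MaxImm bound on the instruction's immediate offset field: as
// much of the offset as fits goes into immoffset, the rest into voffset.
static std::pair<uint32_t, uint32_t> splitBufferOffsetSketch(uint32_t Offset,
                                                             uint32_t MaxImm) {
  uint32_t ImmOffset = Offset <= MaxImm ? Offset : MaxImm;
  uint32_t VOffset = Offset - ImmOffset;
  return {VOffset, ImmOffset};
}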

// Analyze a combined offset from an amdgcn_s_buffer_load intrinsic and store
// the three offsets (voffset, soffset and instoffset) into the SDValue[3] array
// pointed to by Offsets.
void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
                                        SelectionDAG &DAG, SDValue *Offsets,
                                        Align Alignment) const {}

SDValue SITargetLowering::bufferRsrcPtrToVector(SDValue MaybePointer,
                                                SelectionDAG &DAG) const {}

// Wrap a global or flat pointer into a buffer intrinsic using the flags
// specified in the intrinsic.
SDValue SITargetLowering::lowerPointerAsRsrcIntrin(SDNode *Op,
                                                   SelectionDAG &DAG) const {}

// Handle 8 bit and 16 bit buffer loads
SDValue SITargetLowering::handleByteShortBufferLoads(SelectionDAG &DAG,
                                                     EVT LoadVT, SDLoc DL,
                                                     ArrayRef<SDValue> Ops,
                                                     MachineMemOperand *MMO,
                                                     bool IsTFE) const {}

// Handle 8 bit and 16 bit buffer stores
SDValue SITargetLowering::handleByteShortBufferStores(SelectionDAG &DAG,
                                                      EVT VDataType, SDLoc DL,
                                                      SDValue Ops[],
                                                      MemSDNode *M) const {}

static SDValue getLoadExtOrTrunc(SelectionDAG &DAG,
                                 ISD::LoadExtType ExtType, SDValue Op,
                                 const SDLoc &SL, EVT VT) {}

// Try to turn 8 and 16-bit scalar loads into SMEM eligible 32-bit loads.
// TODO: Skip this on GFX12 which does have scalar sub-dword loads.
SDValue SITargetLowering::widenLoad(LoadSDNode *Ld, DAGCombinerInfo &DCI) const {}

static bool addressMayBeAccessedAsPrivate(const MachineMemOperand *MMO,
                                          const SIMachineFunctionInfo &Info) {}

SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {}

SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {}

// Catch division cases where we can use shortcuts with rcp and rsq
// instructions.
SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
                                              SelectionDAG &DAG) const {}
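
// Scalar sketch of the rcp shortcut referred to above (fast, not IEEE
// accurate): a / b is rewritten as a * rcp(b), with 1.0f / b standing in for
// v_rcp_f32 here.
static float fastUnsafeFDivSketch(float A, float B) {
  float Rcp = 1.0f / B; // models v_rcp_f32
  return A * Rcp;
}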

SDValue SITargetLowering::lowerFastUnsafeFDIV64(SDValue Op,
                                                SelectionDAG &DAG) const {}

static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
                          EVT VT, SDValue A, SDValue B, SDValue GlueChain,
                          SDNodeFlags Flags) {}

static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
                           EVT VT, SDValue A, SDValue B, SDValue C,
                           SDValue GlueChain, SDNodeFlags Flags) {}

SDValue SITargetLowering::LowerFDIV16(SDValue Op, SelectionDAG &DAG) const {}

// Faster 2.5 ULP division that does not support denormals.
SDValue SITargetLowering::lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const {}

// Returns immediate value for setting the F32 denorm mode when using the
// S_DENORM_MODE instruction.
static SDValue getSPDenormModeValue(uint32_t SPDenormMode, SelectionDAG &DAG,
                                    const SIMachineFunctionInfo *Info,
                                    const GCNSubtarget *ST) {}

SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {}

SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const {}

SDValue SITargetLowering::LowerFDIV(SDValue Op, SelectionDAG &DAG) const {}

SDValue SITargetLowering::LowerFFREXP(SDValue Op, SelectionDAG &DAG) const {}

SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {}

// Avoid the full correct expansion for f32 sqrt when promoting from f16.
SDValue SITargetLowering::lowerFSQRTF16(SDValue Op, SelectionDAG &DAG) const {}

SDValue SITargetLowering::lowerFSQRTF32(SDValue Op, SelectionDAG &DAG) const {}

SDValue SITargetLowering::lowerFSQRTF64(SDValue Op, SelectionDAG &DAG) const {}

SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {}

SDValue SITargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) const {}

//===----------------------------------------------------------------------===//
// Custom DAG optimizations
//===----------------------------------------------------------------------===//

SDValue SITargetLowering::performUCharToFloatCombine(SDNode *N,
                                                     DAGCombinerInfo &DCI) const {}

SDValue SITargetLowering::performFCopySignCombine(SDNode *N,
                                                  DAGCombinerInfo &DCI) const {}

// (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2)
// (shl (or x, c1), c2) -> add (shl x, c2), (shl c1, c2) iff x and c1 share no
// bits

// This is a variant of
// (mul (add x, c1), c2) -> add (mul x, c2), (mul c1, c2),
//
// The normal DAG combiner will do this, but only if the add has one use since
// that would increase the number of instructions.
//
// This prevents us from seeing a constant offset that can be folded into a
// memory instruction's addressing mode. If we know the resulting add offset of
// a pointer can be folded into an addressing offset, we can replace the pointer
// operand with the add of new constant offset. This eliminates one of the uses,
// and may allow the remaining use to also be simplified.
//
SDValue SITargetLowering::performSHLPtrCombine(SDNode *N,
                                               unsigned AddrSpace,
                                               EVT MemVT,
                                               DAGCombinerInfo &DCI) const {}
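
// The combine above rests on distributing the shift over the add; a scalar
// statement of the identity for illustration (wrapping arithmetic, C2 < 64):
//   (x + c1) << c2  ==  (x << c2) + (c1 << c2)
static uint64_t shlOfAddSketch(uint64_t X, uint64_t C1, unsigned C2) {
  return (X << C2) + (C1 << C2); // same value as (X + C1) << C2
}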

/// MemSDNode::getBasePtr() does not work for intrinsics, which need to be
/// offset by the chain and intrinsic ID. Theoretically we would also need to
/// check the specific intrinsic, but they all place the pointer operand first.
static unsigned getBasePtrIndex(const MemSDNode *N) {}

SDValue SITargetLowering::performMemSDNodeCombine(MemSDNode *N,
                                                  DAGCombinerInfo &DCI) const {}

static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val) {}

// Break up 64-bit bit operation of a constant into two 32-bit and/or/xor. This
// will typically happen anyway for a VALU 64-bit and. This exposes other 32-bit
// integer combine opportunities since most 64-bit operations are decomposed
// this way.  TODO: We won't want this for SALU especially if it is an inline
// immediate.
SDValue SITargetLowering::splitBinaryBitConstantOp(
  DAGCombinerInfo &DCI,
  const SDLoc &SL,
  unsigned Opc, SDValue LHS,
  const ConstantSDNode *CRHS) const {}
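
// Scalar sketch of the 64-bit split described above, using 'and' as the
// example op: the low and high halves combine independently and are then
// reassembled.
static uint64_t splitAnd64Sketch(uint64_t X, uint64_t C) {
  uint32_t Lo = uint32_t(X) & uint32_t(C);
  uint32_t Hi = uint32_t(X >> 32) & uint32_t(C >> 32);
  return (uint64_t(Hi) << 32) | Lo;
}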

bool llvm::isBoolSGPR(SDValue V) {}

// If a constant has all zeroes or all ones within each byte return it.
// Otherwise return 0.
static uint32_t getConstantPermuteMask(uint32_t C) {}
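
// Sketch of the per-byte check described above (not the in-tree body): the
// constant qualifies only if every byte is entirely zeros or entirely ones.
static uint32_t constantPermuteMaskSketch(uint32_t C) {
  for (unsigned I = 0; I != 4; ++I) {
    uint32_t Byte = (C >> (8 * I)) & 0xff;
    if (Byte != 0 && Byte != 0xff)
      return 0; // Mixed bits within a byte: reject.
  }
  return C;
}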

// Check if a node selects whole bytes from its operand 0 starting at a byte
// boundary while masking the rest. Returns the select mask as in v_perm_b32,
// or -1 if it did not succeed.
// Note byte select encoding:
// value 0-3 selects corresponding source byte;
// value 0xc selects zero;
// value 0xff selects 0xff.
static uint32_t getPermuteMask(SDValue V) {}

SDValue SITargetLowering::performAndCombine(SDNode *N,
                                            DAGCombinerInfo &DCI) const {}

// A key component of v_perm is a mapping between the byte positions of the src
// operands and the byte positions of the dest. To provide such, we need: 1. the
// node that provides byte x of the dest of the OR, and 2. the byte of that node
// used to provide byte x. calculateByteProvider finds which node provides a
// certain byte of the dest of the OR, and calculateSrcByte takes that node and
// finds an ultimate src and byte position. For example, the supported
// LoadCombine pattern for vector loads is as follows:
//                                t1
//                                or
//                      /                  \
//                      t2                 t3
//                     zext                shl
//                      |                   |     \
//                     t4                  t5     16
//                     or                 anyext
//                 /        \               |
//                t6        t7             t8
//               srl        shl             or
//            /    |      /     \         /     \
//           t9   t10    t11   t12      t13    t14
//         trunc*  8    trunc*  8      and     and
//           |            |          /    |     |    \
//          t15          t16        t17  t18   t19   t20
//                                trunc*  255   srl   -256
//                                   |         /   \
//                                  t15       t15  16
//
// *In this example, the truncs are from i32->i16
//
// calculateByteProvider would find t6, t7, t13, and t14 for bytes 0-3
// respectively. calculateSrcByte would find (given node) -> ultimate src &
// byteposition: t6 -> t15 & 1, t7 -> t16 & 0, t13 -> t15 & 0, t14 -> t15 & 3.
// After finding the mapping, we can combine the tree into vperm t15, t16,
// 0x05000407

// Find the source and byte position from a node.
// \p DestByte is the byte position of the dest of the or that the src
// ultimately provides. \p SrcIndex is the byte of the src that maps to this
// dest of the or byte. \p Depth tracks how many recursive iterations we have
// performed.
static const std::optional<ByteProvider<SDValue>>
calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex = 0,
                 unsigned Depth = 0) {}

// For a byte position in the result of an Or, traverse the tree and find the
// node (and the byte of the node) which ultimately provides this {Or,
// BytePosition}. \p Op is the operand we are currently examining. \p Index is
// the byte position of the Op that corresponds with the originally requested
// byte of the Or. \p Depth tracks how many recursive iterations we have
// performed. \p StartingIndex is the originally requested byte of the Or.
static const std::optional<ByteProvider<SDValue>>
calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth,
                      unsigned StartingIndex = 0) {}

// Returns true if the Operand is a scalar and is 16 bits
static bool isExtendedFrom16Bits(SDValue &Operand) {}

// Returns true if the mask matches consecutive bytes, and the first byte
// begins at a power of 2 byte offset from 0th byte
static bool addresses16Bits(int Mask) {}

// Do not lower into v_perm if the operands are actually 16 bit
// and the selected bits (based on PermMask) correspond with two
// easily addressable 16 bit operands.
static bool hasNon16BitAccesses(uint64_t PermMask, SDValue &Op,
                                SDValue &OtherOp) {}

static SDValue getDWordFromOffset(SelectionDAG &DAG, SDLoc SL, SDValue Src,
                                  unsigned DWordOffset) {}

static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {}

SDValue SITargetLowering::performOrCombine(SDNode *N,
                                           DAGCombinerInfo &DCI) const {}

SDValue SITargetLowering::performXorCombine(SDNode *N,
                                            DAGCombinerInfo &DCI) const {}

SDValue SITargetLowering::performZeroExtendCombine(SDNode *N,
                                                   DAGCombinerInfo &DCI) const {}

SDValue
SITargetLowering::performSignExtendInRegCombine(SDNode *N,
                                                DAGCombinerInfo &DCI) const {}

SDValue SITargetLowering::performClassCombine(SDNode *N,
                                              DAGCombinerInfo &DCI) const {}

SDValue SITargetLowering::performRcpCombine(SDNode *N,
                                            DAGCombinerInfo &DCI) const {}

bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
                                       unsigned MaxDepth) const {}

bool SITargetLowering::isCanonicalized(Register Reg, const MachineFunction &MF,
                                       unsigned MaxDepth) const {}

// Constant fold canonicalize.
SDValue SITargetLowering::getCanonicalConstantFP(
  SelectionDAG &DAG, const SDLoc &SL, EVT VT, const APFloat &C) const {}

static bool vectorEltWillFoldAway(SDValue Op) {}

SDValue SITargetLowering::performFCanonicalizeCombine(
  SDNode *N,
  DAGCombinerInfo &DCI) const {}

static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {}

SDValue SITargetLowering::performIntMed3ImmCombine(SelectionDAG &DAG,
                                                   const SDLoc &SL, SDValue Src,
                                                   SDValue MinVal,
                                                   SDValue MaxVal,
                                                   bool Signed) const {}

static ConstantFPSDNode *getSplatConstantFP(SDValue Op) {}

SDValue SITargetLowering::performFPMed3ImmCombine(SelectionDAG &DAG,
                                                  const SDLoc &SL,
                                                  SDValue Op0,
                                                  SDValue Op1) const {}

/// \return true if the subtarget supports minimum3 and maximum3 with the given
/// base min/max opcode \p Opc for type \p VT.
static bool supportsMin3Max3(const GCNSubtarget &Subtarget, unsigned Opc,
                             EVT VT) {}

SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
                                               DAGCombinerInfo &DCI) const {}

static bool isClampZeroToOne(SDValue A, SDValue B) {}

// FIXME: Should only worry about snans for version with chain.
SDValue SITargetLowering::performFMed3Combine(SDNode *N,
                                              DAGCombinerInfo &DCI) const {}

SDValue SITargetLowering::performCvtPkRTZCombine(SDNode *N,
                                                 DAGCombinerInfo &DCI) const {}

// Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be
// expanded into a set of cmp/select instructions.
bool SITargetLowering::shouldExpandVectorDynExt(unsigned EltSize,
                                                unsigned NumElem,
                                                bool IsDivergentIdx,
                                                const GCNSubtarget *Subtarget) {}
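
// Illustrative scalar model of the cmp/select expansion mentioned above for a
// four-element vector: each lane is compared against the (possibly divergent)
// index and selected in turn, avoiding an indirect/scratch access.
static float extractEltCmpSelectSketch(const float V[4], int Idx) {
  float R = V[0];
  R = Idx == 1 ? V[1] : R;
  R = Idx == 2 ? V[2] : R;
  R = Idx == 3 ? V[3] : R;
  return R;
}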

bool SITargetLowering::shouldExpandVectorDynExt(SDNode *N) const {}

SDValue SITargetLowering::performExtractVectorEltCombine(
  SDNode *N, DAGCombinerInfo &DCI) const {}

SDValue
SITargetLowering::performInsertVectorEltCombine(SDNode *N,
                                                DAGCombinerInfo &DCI) const {}

/// Return the source of an fp_extend from f16 to f32, or a converted FP
/// constant.
static SDValue strictFPExtFromF16(SelectionDAG &DAG, SDValue Src) {}

SDValue SITargetLowering::performFPRoundCombine(SDNode *N,
                                                DAGCombinerInfo &DCI) const {}

unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
                                          const SDNode *N0,
                                          const SDNode *N1) const {}

// For a reassociatable opcode perform:
// op x, (op y, z) -> op (op x, z), y, if x and z are uniform
SDValue SITargetLowering::reassociateScalarOps(SDNode *N,
                                               SelectionDAG &DAG) const {}
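
// Scalar sketch of the reassociation above using integer add: grouping the
// uniform operands X and Z lets the inner op be computed once on the scalar
// unit instead of per lane.
static uint32_t reassocAddSketch(uint32_t X /*uniform*/,
                                 uint32_t Y /*divergent*/,
                                 uint32_t Z /*uniform*/) {
  return (X + Z) + Y; // same value as X + (Y + Z)
}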

static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL,
                           EVT VT,
                           SDValue N0, SDValue N1, SDValue N2,
                           bool Signed) {}

// Fold (add (mul x, y), z) --> (mad_[iu]64_[iu]32 x, y, z) plus high
// multiplies, if any.
//
// Full 64-bit multiplies that feed into an addition are lowered here instead
// of using the generic expansion. The generic expansion ends up with
// a tree of ADD nodes that prevents us from using the "add" part of the
// MAD instruction. The expansion produced here results in a chain of ADDs
// instead of a tree.
SDValue SITargetLowering::tryFoldToMad64_32(SDNode *N,
                                            DAGCombinerInfo &DCI) const {}
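
// Scalar model of what a single mad_u64_u32 computes, ignoring the carry-out:
// a 32x32->64 multiply plus a 64-bit addend. The combine above forms this from
// (add (mul x, y), z).
static uint64_t madU64U32Sketch(uint32_t X, uint32_t Y, uint64_t Z) {
  return uint64_t(X) * uint64_t(Y) + Z;
}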

// Collect the ultimate src of each of the mul node's operands, and confirm
// each operand is 8 bits.
static std::optional<ByteProvider<SDValue>>
handleMulOperand(const SDValue &MulOperand) {}

static unsigned addPermMasks(unsigned First, unsigned Second) {}

struct DotSrc {};

static void placeSources(ByteProvider<SDValue> &Src0,
                         ByteProvider<SDValue> &Src1,
                         SmallVectorImpl<DotSrc> &Src0s,
                         SmallVectorImpl<DotSrc> &Src1s, int Step) {}

static SDValue resolveSources(SelectionDAG &DAG, SDLoc SL,
                              SmallVectorImpl<DotSrc> &Srcs, bool IsSigned,
                              bool IsAny) {}

static void fixMasks(SmallVectorImpl<DotSrc> &Srcs, unsigned ChainLength) {}

static bool isMul(const SDValue Op) {}

static std::optional<bool>
checkDot4MulSignedness(const SDValue &N, ByteProvider<SDValue> &Src0,
                       ByteProvider<SDValue> &Src1, const SDValue &S0Op,
                       const SDValue &S1Op, const SelectionDAG &DAG) {}

SDValue SITargetLowering::performAddCombine(SDNode *N,
                                            DAGCombinerInfo &DCI) const {}

SDValue SITargetLowering::performSubCombine(SDNode *N,
                                            DAGCombinerInfo &DCI) const {}

SDValue SITargetLowering::performAddCarrySubCarryCombine(SDNode *N,
  DAGCombinerInfo &DCI) const {}

SDValue SITargetLowering::performFAddCombine(SDNode *N,
                                             DAGCombinerInfo &DCI) const {}

SDValue SITargetLowering::performFSubCombine(SDNode *N,
                                             DAGCombinerInfo &DCI) const {}

SDValue SITargetLowering::performFDivCombine(SDNode *N,
                                             DAGCombinerInfo &DCI) const {}

SDValue SITargetLowering::performFMACombine(SDNode *N,
                                            DAGCombinerInfo &DCI) const {}

SDValue SITargetLowering::performSetCCCombine(SDNode *N,
                                              DAGCombinerInfo &DCI) const {}

SDValue SITargetLowering::performCvtF32UByteNCombine(SDNode *N,
                                                     DAGCombinerInfo &DCI) const {}

SDValue SITargetLowering::performClampCombine(SDNode *N,
                                              DAGCombinerInfo &DCI) const {}


SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
                                            DAGCombinerInfo &DCI) const {}

static unsigned SubIdx2Lane(unsigned Idx) {}

SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
                                          SelectionDAG &DAG) const {}

static bool isFrameIndexOp(SDValue Op) {}

SDNode *SITargetLowering::legalizeTargetIndependentNode(SDNode *Node,
                                                        SelectionDAG &DAG) const {}

SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
                                          SelectionDAG &DAG) const {}

void SITargetLowering::AddMemOpInit(MachineInstr &MI) const {}

void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
                                                     SDNode *Node) const {}

static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL,
                              uint64_t Val) {}

MachineSDNode *SITargetLowering::wrapAddr64Rsrc(SelectionDAG &DAG,
                                                const SDLoc &DL,
                                                SDValue Ptr) const {}

MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG, const SDLoc &DL,
                                           SDValue Ptr, uint32_t RsrcDword1,
                                           uint64_t RsrcDword2And3) const {}

std::pair<unsigned, const TargetRegisterClass *>
SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI_,
                                               StringRef Constraint,
                                               MVT VT) const {}

static bool isImmConstraint(StringRef Constraint) {}

SITargetLowering::ConstraintType
SITargetLowering::getConstraintType(StringRef Constraint) const {}

static uint64_t clearUnusedBits(uint64_t Val, unsigned Size) {}

void SITargetLowering::LowerAsmOperandForConstraint(SDValue Op,
                                                    StringRef Constraint,
                                                    std::vector<SDValue> &Ops,
                                                    SelectionDAG &DAG) const {}

bool SITargetLowering::getAsmOperandConstVal(SDValue Op, uint64_t &Val) const {}

bool SITargetLowering::checkAsmConstraintVal(SDValue Op, StringRef Constraint,
                                             uint64_t Val) const {}

bool SITargetLowering::checkAsmConstraintValA(SDValue Op, uint64_t Val,
                                              unsigned MaxSize) const {}

static int getAlignedAGPRClassID(unsigned UnalignedClassID) {}

void SITargetLowering::finalizeLowering(MachineFunction &MF) const {}

void SITargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
                                                     KnownBits &Known,
                                                     const APInt &DemandedElts,
                                                     const SelectionDAG &DAG,
                                                     unsigned Depth) const {}

void SITargetLowering::computeKnownBitsForFrameIndex(
  const int FI, KnownBits &Known, const MachineFunction &MF) const {}

static void knownBitsForWorkitemID(const GCNSubtarget &ST, GISelKnownBits &KB,
                                   KnownBits &Known, unsigned Dim) {}

void SITargetLowering::computeKnownBitsForTargetInstr(
    GISelKnownBits &KB, Register R, KnownBits &Known, const APInt &DemandedElts,
    const MachineRegisterInfo &MRI, unsigned Depth) const {}

Align SITargetLowering::computeKnownAlignForTargetInstr(
  GISelKnownBits &KB, Register R, const MachineRegisterInfo &MRI,
  unsigned Depth) const {}

Align SITargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {}

LLVM_ATTRIBUTE_UNUSED
static bool isCopyFromRegOfInlineAsm(const SDNode *N) {}

bool SITargetLowering::isSDNodeSourceOfDivergence(const SDNode *N,
                                                  FunctionLoweringInfo *FLI,
                                                  UniformityInfo *UA) const {}

bool SITargetLowering::denormalsEnabledForType(const SelectionDAG &DAG,
                                               EVT VT) const {}

bool SITargetLowering::denormalsEnabledForType(
    LLT Ty, const MachineFunction &MF) const {}

bool SITargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
                                                    const SelectionDAG &DAG,
                                                    bool SNaN,
                                                    unsigned Depth) const {}

static bool atomicIgnoresDenormalModeOrFPModeIsFTZ(const AtomicRMWInst *RMW) {}

static OptimizationRemark emitAtomicRMWLegalRemark(const AtomicRMWInst *RMW) {}

static bool isV2F16OrV2BF16(Type *Ty) {}

static bool isV2F16(Type *Ty) {}

static bool isV2BF16(Type *Ty) {}

static bool isAtomicRMWLegalIntTy(Type *Ty) {}

static bool isAtomicRMWLegalXChgTy(const AtomicRMWInst *RMW) {}

static bool globalMemoryFPAtomicIsLegal(const GCNSubtarget &Subtarget,
                                        const AtomicRMWInst *RMW,
                                        bool HasSystemScope) {}

static TargetLowering::AtomicExpansionKind
atomicSupportedIfLegalIntType(const AtomicRMWInst *RMW) {}

TargetLowering::AtomicExpansionKind
SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {}

TargetLowering::AtomicExpansionKind
SITargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {}

TargetLowering::AtomicExpansionKind
SITargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {}

TargetLowering::AtomicExpansionKind
SITargetLowering::shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *CmpX) const {}

const TargetRegisterClass *
SITargetLowering::getRegClassFor(MVT VT, bool isDivergent) const {}

static bool hasCFUser(const Value *V, SmallPtrSet<const Value *, 16> &Visited,
                      unsigned WaveSize) {}

bool SITargetLowering::requiresUniformRegister(MachineFunction &MF,
                                               const Value *V) const {}

bool SITargetLowering::hasMemSDNodeUser(SDNode *N) const {}

bool SITargetLowering::isReassocProfitable(SelectionDAG &DAG, SDValue N0,
                                           SDValue N1) const {}

bool SITargetLowering::isReassocProfitable(MachineRegisterInfo &MRI,
                                           Register N0, Register N1) const {}

MachineMemOperand::Flags
SITargetLowering::getTargetMMOFlags(const Instruction &I) const {}

bool SITargetLowering::checkForPhysRegDependency(
    SDNode *Def, SDNode *User, unsigned Op, const TargetRegisterInfo *TRI,
    const TargetInstrInfo *TII, unsigned &PhysReg, int &Cost) const {}

void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {}

LoadInst *
SITargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {}