SILoadStoreOptimizer.cpp | Explore in Territory

//===- SILoadStoreOptimizer.cpp -------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass tries to fuse DS instructions with close by immediate offsets.
// This will fuse operations such as
//  ds_read_b32 v0, v2 offset:16
//  ds_read_b32 v1, v2 offset:32
// ==>
//   ds_read2_b32 v[0:1], v2, offset0:4 offset1:8
//
// The same is done for certain SMEM and VMEM opcodes, e.g.:
//  s_buffer_load_dword s4, s[0:3], 4
//  s_buffer_load_dword s5, s[0:3], 8
// ==>
//  s_buffer_load_dwordx2 s[4:5], s[0:3], 4
//
// This pass also tries to promote constant offset to the immediate by
// adjusting the base. It tries to use a base from the nearby instructions that
// allows it to have a 13bit constant offset and then promotes the 13bit offset
// to the immediate.
// E.g.
//  s_movk_i32 s0, 0x1800
//  v_add_co_u32_e32 v0, vcc, s0, v2
//  v_addc_co_u32_e32 v1, vcc, 0, v6, vcc
//
//  s_movk_i32 s0, 0x1000
//  v_add_co_u32_e32 v5, vcc, s0, v2
//  v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
//  global_load_dwordx2 v[5:6], v[5:6], off
//  global_load_dwordx2 v[0:1], v[0:1], off
// =>
//  s_movk_i32 s0, 0x1000
//  v_add_co_u32_e32 v5, vcc, s0, v2
//  v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
//  global_load_dwordx2 v[5:6], v[5:6], off
//  global_load_dwordx2 v[0:1], v[5:6], off offset:2048
//
// Future improvements:
//
// - This is currently missing stores of constants because loading
//   the constant into the data register is placed between the stores, although
//   this is arguably a scheduling problem.
//
// - Live interval recomputing seems inefficient. This currently only matches
//   one pair, and recomputes live intervals and moves on to the next pair. It
//   would be better to compute a list of all merges that need to occur.
//
// - With a list of instructions to process, we can also merge more. If a
//   cluster of loads have offsets that are too large to fit in the 8-bit
//   offsets, but are close enough to fit in the 8 bits, we can add to the base
//   pointer and use the new reduced offsets.
//
//===----------------------------------------------------------------------===//

#include "SILoadStoreOptimizer.h"
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/InitializePasses.h"

usingnamespacellvm;

#define DEBUG_TYPE …

namespace {
enum InstClassEnum { … };

struct AddressRegs { … };

// GFX10 image_sample instructions can have 12 vaddrs + srsrc + ssamp.
const unsigned MaxAddressRegs = …;

class SILoadStoreOptimizer { … };

class SILoadStoreOptimizerLegacy : public MachineFunctionPass { … };

static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) { … }

/// Maps instruction opcode to enum InstClassEnum.
static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) { … }

/// Determines instruction subclass from opcode. Only instructions
/// of the same subclass can be merged together. The merged instruction may have
/// a different subclass but must have the same class.
static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) { … }

// GLOBAL loads and stores are classified as FLAT initially. If both combined
// instructions are FLAT GLOBAL adjust the class to GLOBAL_LOAD or GLOBAL_STORE.
// If either or both instructions are non segment specific FLAT the resulting
// combined operation will be FLAT, potentially promoting one of the GLOBAL
// operations to FLAT.
// For other instructions return the original unmodified class.
InstClassEnum
SILoadStoreOptimizer::getCommonInstClass(const CombineInfo &CI,
                                         const CombineInfo &Paired) { … }

static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) { … }

void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI,
                                              const SILoadStoreOptimizer &LSO) { … }

} // end anonymous namespace.

INITIALIZE_PASS_BEGIN(SILoadStoreOptimizerLegacy, DEBUG_TYPE,
                      "SI Load Store Optimizer", false, false)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_END(SILoadStoreOptimizerLegacy, DEBUG_TYPE,
                    "SI Load Store Optimizer", false, false)

char SILoadStoreOptimizerLegacy::ID = …;

char &llvm::SILoadStoreOptimizerLegacyID = …;

FunctionPass *llvm::createSILoadStoreOptimizerLegacyPass() { … }

static void addDefsUsesToList(const MachineInstr &MI,
                              DenseSet<Register> &RegDefs,
                              DenseSet<Register> &RegUses) { … }

bool SILoadStoreOptimizer::canSwapInstructions(
    const DenseSet<Register> &ARegDefs, const DenseSet<Register> &ARegUses,
    const MachineInstr &A, const MachineInstr &B) const { … }

// Given that \p CI and \p Paired are adjacent memory operations produce a new
// MMO for the combined operation with a new access size.
MachineMemOperand *
SILoadStoreOptimizer::combineKnownAdjacentMMOs(const CombineInfo &CI,
                                               const CombineInfo &Paired) { … }

bool SILoadStoreOptimizer::dmasksCanBeCombined(const CombineInfo &CI,
                                               const SIInstrInfo &TII,
                                               const CombineInfo &Paired) { … }

static unsigned getBufferFormatWithCompCount(unsigned OldFormat,
                                       unsigned ComponentCount,
                                       const GCNSubtarget &STI) { … }

// Return the value in the inclusive range [Lo,Hi] that is aligned to the
// highest power of two. Note that the result is well defined for all inputs
// including corner cases like:
// - if Lo == Hi, return that value
// - if Lo == 0, return 0 (even though the "- 1" below underflows
// - if Lo > Hi, return 0 (as if the range wrapped around)
static uint32_t mostAlignedValueInRange(uint32_t Lo, uint32_t Hi) { … }

bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI,
                                                const GCNSubtarget &STI,
                                                CombineInfo &Paired,
                                                bool Modify) { … }

bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM,
                                     const CombineInfo &CI,
                                     const CombineInfo &Paired) { … }

const TargetRegisterClass *
SILoadStoreOptimizer::getDataRegClass(const MachineInstr &MI) const { … }

/// This function assumes that CI comes before Paired in a basic block. Return
/// an insertion point for the merged instruction or nullptr on failure.
SILoadStoreOptimizer::CombineInfo *
SILoadStoreOptimizer::checkAndPrepareMerge(CombineInfo &CI,
                                           CombineInfo &Paired) { … }

// Copy the merged load result from DestReg to the original dest regs of CI and
// Paired.
void SILoadStoreOptimizer::copyToDestRegs(
    CombineInfo &CI, CombineInfo &Paired,
    MachineBasicBlock::iterator InsertBefore, int OpName,
    Register DestReg) const { … }

// Return a register for the source of the merged store after copying the
// original source regs of CI and Paired into it.
Register
SILoadStoreOptimizer::copyFromSrcRegs(CombineInfo &CI, CombineInfo &Paired,
                                      MachineBasicBlock::iterator InsertBefore,
                                      int OpName) const { … }

unsigned SILoadStoreOptimizer::read2Opcode(unsigned EltSize) const { … }

unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const { … }

MachineBasicBlock::iterator
SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
                                     MachineBasicBlock::iterator InsertBefore) { … }

unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const { … }

unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const { … }

MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
    CombineInfo &CI, CombineInfo &Paired,
    MachineBasicBlock::iterator InsertBefore) { … }

MachineBasicBlock::iterator
SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
                                     MachineBasicBlock::iterator InsertBefore) { … }

MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSMemLoadImmPair(
    CombineInfo &CI, CombineInfo &Paired,
    MachineBasicBlock::iterator InsertBefore) { … }

MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair(
    CombineInfo &CI, CombineInfo &Paired,
    MachineBasicBlock::iterator InsertBefore) { … }

MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair(
    CombineInfo &CI, CombineInfo &Paired,
    MachineBasicBlock::iterator InsertBefore) { … }

MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferStorePair(
    CombineInfo &CI, CombineInfo &Paired,
    MachineBasicBlock::iterator InsertBefore) { … }

MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatLoadPair(
    CombineInfo &CI, CombineInfo &Paired,
    MachineBasicBlock::iterator InsertBefore) { … }

MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatStorePair(
    CombineInfo &CI, CombineInfo &Paired,
    MachineBasicBlock::iterator InsertBefore) { … }

static bool needsConstrainedOpcode(const GCNSubtarget &STM,
                                   ArrayRef<MachineMemOperand *> MMOs,
                                   unsigned Width) { … }

unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
                                            const CombineInfo &Paired) { … }

std::pair<unsigned, unsigned>
SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI,
                                    const CombineInfo &Paired) { … }

const TargetRegisterClass *
SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI,
                                             const CombineInfo &Paired) const { … }

MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair(
    CombineInfo &CI, CombineInfo &Paired,
    MachineBasicBlock::iterator InsertBefore) { … }

MachineOperand
SILoadStoreOptimizer::createRegOrImm(int32_t Val, MachineInstr &MI) const { … }

// Compute base address using Addr and return the final register.
Register SILoadStoreOptimizer::computeBase(MachineInstr &MI,
                                           const MemAddress &Addr) const { … }

// Update base and offset with the NewBase and NewOffset in MI.
void SILoadStoreOptimizer::updateBaseAndOffset(MachineInstr &MI,
                                               Register NewBase,
                                               int32_t NewOffset) const { … }

std::optional<int32_t>
SILoadStoreOptimizer::extractConstOffset(const MachineOperand &Op) const { … }

// Analyze Base and extracts:
//  - 32bit base registers, subregisters
//  - 64bit constant offset
// Expecting base computation as:
//   %OFFSET0:sgpr_32 = S_MOV_B32 8000
//   %LO:vgpr_32, %c:sreg_64_xexec =
//       V_ADD_CO_U32_e64 %BASE_LO:vgpr_32, %103:sgpr_32,
//   %HI:vgpr_32, = V_ADDC_U32_e64 %BASE_HI:vgpr_32, 0, killed %c:sreg_64_xexec
//   %Base:vreg_64 =
//       REG_SEQUENCE %LO:vgpr_32, %subreg.sub0, %HI:vgpr_32, %subreg.sub1
void SILoadStoreOptimizer::processBaseWithConstOffset(const MachineOperand &Base,
                                                      MemAddress &Addr) const { … }

bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
    MachineInstr &MI,
    MemInfoMap &Visited,
    SmallPtrSet<MachineInstr *, 4> &AnchorList) const { … }

void SILoadStoreOptimizer::addInstToMergeableList(const CombineInfo &CI,
                 std::list<std::list<CombineInfo> > &MergeableInsts) const { … }

std::pair<MachineBasicBlock::iterator, bool>
SILoadStoreOptimizer::collectMergeableInsts(
    MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End,
    MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList,
    std::list<std::list<CombineInfo>> &MergeableInsts) const { … }

// Scan through looking for adjacent LDS operations with constant offsets from
// the same base register. We rely on the scheduler to do the hard work of
// clustering nearby loads, and assume these are all adjacent.
bool SILoadStoreOptimizer::optimizeBlock(
                       std::list<std::list<CombineInfo> > &MergeableInsts) { … }

bool
SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr(
                                          std::list<CombineInfo> &MergeList,
                                          bool &OptimizeListAgain) { … }

bool SILoadStoreOptimizerLegacy::runOnMachineFunction(MachineFunction &MF) { … }

bool SILoadStoreOptimizer::run(MachineFunction &MF) { … }

PreservedAnalyses
SILoadStoreOptimizerPass::run(MachineFunction &MF,
                              MachineFunctionAnalysisManager &MFAM) { … }
llvm/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp