//===- SIInsertWaitcnts.cpp - Insert Wait Instructions --------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Insert wait instructions for memory reads and writes.
///
/// Memory reads and writes are issued asynchronously, so we need to insert
/// S_WAITCNT instructions when we want to access any of their results or
/// overwrite any register that's used asynchronously.
///
/// TODO: This pass currently keeps one timeline per hardware counter. A more
/// finely-grained approach that keeps one timeline per event type could
/// sometimes get away with generating weaker s_waitcnt instructions. For
/// example, when both SMEM and LDS are in flight and we need to wait for
/// the i-th-last LDS instruction, then an lgkmcnt(i) is actually sufficient,
/// but the pass will currently generate a conservative lgkmcnt(0) because
/// multiple event types are in flight.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/Sequence.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachinePostDominators.h"
#include "llvm/InitializePasses.h"
#include "llvm/Support/DebugCounter.h"
#include "llvm/TargetParser/TargetParser.h"

using namespace llvm;

#define DEBUG_TYPE …

DEBUG_COUNTER(ForceExpCounter, DEBUG_TYPE "-forceexp",
              "Force emit s_waitcnt expcnt(0) instrs");
DEBUG_COUNTER(ForceLgkmCounter, DEBUG_TYPE "-forcelgkm",
              "Force emit s_waitcnt lgkmcnt(0) instrs");
DEBUG_COUNTER(ForceVMCounter, DEBUG_TYPE "-forcevm",
              "Force emit s_waitcnt vmcnt(0) instrs");

static cl::opt<bool> ForceEmitZeroFlag(
    "amdgpu-waitcnt-forcezero",
    cl::desc("Force all waitcnt instrs to be emitted as "
             "s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"),
    cl::init(false), cl::Hidden);

namespace {
// Class of object that encapsulates latest instruction counter score
// associated with the operand. Used for determining whether
// s_waitcnt instruction needs to be emitted.
enum InstCounterType { … };
} // namespace

namespace llvm {
template <> struct enum_iteration_traits<InstCounterType> { … };
} // namespace llvm

namespace {
// Return an iterator over all counters between LOAD_CNT (the first counter)
// and \c MaxCounter (exclusive, default value yields an enumeration over
// all counters).
auto inst_counter_types(InstCounterType MaxCounter = NUM_INST_CNTS) { … }

using RegInterval = std::pair<int, int>;

struct HardwareLimits { … };

struct RegisterEncoding { … };

enum WaitEventType { … };

// The mapping is:
//  0                .. SQ_MAX_PGM_VGPRS-1               real VGPRs
//  SQ_MAX_PGM_VGPRS .. NUM_ALL_VGPRS-1                  extra VGPR-like slots
//  NUM_ALL_VGPRS    .. NUM_ALL_VGPRS+SQ_MAX_PGM_SGPRS-1 real SGPRs
// We reserve a fixed number of VGPR slots in the scoring tables for
// special tokens like SCMEM_LDS (needed for buffer load to LDS).
enum RegisterMapping { … };
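// As a rough illustration of the numbering above (the helper below is
// hypothetical and for exposition only, not part of the pass): all registers
// share one flat index space, with SGPRs placed after the VGPR-like slots, so
// a single score table per counter can cover both register files.
//
//   unsigned flatRegSlot(bool IsSGPR, unsigned RegNo) {
//     // VGPR v<N> -> slot N; SGPR s<N> -> slot NUM_ALL_VGPRS + N.
//     return IsSGPR ? NUM_ALL_VGPRS + RegNo : RegNo;
//   }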
// Enumerate different types of result-returning VMEM operations. Although
// s_waitcnt orders them all with a single vmcnt counter, in the absence of
// s_waitcnt only instructions of the same VmemType are guaranteed to write
// their results in order -- so there is no need to insert an s_waitcnt between
// two instructions of the same type that write the same vgpr.
enum VmemType { … };

// Maps values of InstCounterType to the instruction that waits on that
// counter. Only used if GCNSubtarget::hasExtendedWaitCounts()
// returns true.
static const unsigned instrsForExtendedCounterTypes[NUM_EXTENDED_INST_CNTS] = …;

static bool updateVMCntOnly(const MachineInstr &Inst) { … }

#ifndef NDEBUG
static bool isNormalMode(InstCounterType MaxCounter) {
  return MaxCounter == NUM_NORMAL_INST_CNTS;
}
#endif // NDEBUG

VmemType getVmemType(const MachineInstr &Inst) { … }

unsigned &getCounterRef(AMDGPU::Waitcnt &Wait, InstCounterType T) { … }

void addWait(AMDGPU::Waitcnt &Wait, InstCounterType T, unsigned Count) { … }

void setNoWait(AMDGPU::Waitcnt &Wait, InstCounterType T) { … }

unsigned getWait(AMDGPU::Waitcnt &Wait, InstCounterType T) { … }

// Mapping from event to counter according to the table masks.
InstCounterType eventCounter(const unsigned *masks, WaitEventType E) { … }

// This object maintains the current score brackets of each wait counter, and
// a per-register scoreboard for each wait counter.
//
// We also maintain the latest score for every event type that can change the
// waitcnt in order to know if there are multiple types of events within
// the brackets. When multiple types of event happen in the bracket,
// wait count may get decreased out of order, therefore we need to put in
// "s_waitcnt 0" before use.
class WaitcntBrackets { … };

// This abstracts the logic for generating and updating S_WAIT* instructions
// away from the analysis that determines where they are needed. This was
// done because the set of counters and instructions for waiting on them
// underwent a major shift with gfx12, sufficiently so that having this
// abstraction allows the main analysis logic to be simpler than it would
// otherwise have had to become.
class WaitcntGenerator { … };

class WaitcntGeneratorPreGFX12 : public WaitcntGenerator { … };

class WaitcntGeneratorGFX12Plus : public WaitcntGenerator { … };

class SIInsertWaitcnts : public MachineFunctionPass { … };

} // end anonymous namespace

RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI,
                                            const MachineRegisterInfo *MRI,
                                            const SIRegisterInfo *TRI,
                                            const MachineOperand &Op) const { … }

void WaitcntBrackets::setScoreByInterval(RegInterval Interval,
                                         InstCounterType CntTy,
                                         unsigned Score) { … }

void WaitcntBrackets::setScoreByOperand(const MachineInstr *MI,
                                        const SIRegisterInfo *TRI,
                                        const MachineRegisterInfo *MRI,
                                        const MachineOperand &Op,
                                        InstCounterType CntTy,
                                        unsigned Score) { … }

void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
                                    const SIRegisterInfo *TRI,
                                    const MachineRegisterInfo *MRI,
                                    WaitEventType E, MachineInstr &Inst) { … }

void WaitcntBrackets::print(raw_ostream &OS) { … }
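// A worked example of the score-bracket idea described on WaitcntBrackets
// (the numbers are illustrative and the sketch is a simplification of what
// determineWait computes): if three VMEM loads have been issued since the
// last wait, they receive scores UB-2, UB-1 and UB for the load counter. To
// consume the result of the oldest one it is enough to let the two younger
// loads stay outstanding, i.e.
//
//   // Pending ops of counter type T that may remain in flight while the
//   // operation that produced RegScore is guaranteed complete:
//   unsigned neededCount(unsigned UB, unsigned RegScore) {
//     return UB - RegScore; // here UB - (UB - 2) == 2, i.e. vmcnt(2)
//   }
//
// so the pass can emit s_waitcnt vmcnt(2) rather than a full vmcnt(0),
// unless the counter is decremented out of order (see counterOutOfOrder).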
/// Simplify the waitcnt, in the sense of removing redundant counts, and return
/// whether a waitcnt instruction is needed at all.
void WaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const { … }

void WaitcntBrackets::simplifyWaitcnt(InstCounterType T,
                                      unsigned &Count) const { … }

void WaitcntBrackets::determineWait(InstCounterType T, RegInterval Interval,
                                    AMDGPU::Waitcnt &Wait) const { … }

void WaitcntBrackets::applyWaitcnt(const AMDGPU::Waitcnt &Wait) { … }

void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) { … }

// Where there are multiple types of event in the bracket of a counter,
// the decrement may go out of order.
bool WaitcntBrackets::counterOutOfOrder(InstCounterType T) const { … }

INITIALIZE_PASS_BEGIN(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
                      false)
INITIALIZE_PASS_DEPENDENCY(MachineLoopInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTreeWrapperPass)
INITIALIZE_PASS_END(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
                    false)

char SIInsertWaitcnts::ID = …;

char &llvm::SIInsertWaitcntsID = …;

FunctionPass *llvm::createSIInsertWaitcntsPass() { … }

static bool updateOperandIfDifferent(MachineInstr &MI, uint16_t OpName,
                                     unsigned NewEnc) { … }

/// Determine if \p MI is a gfx12+ single-counter S_WAIT_*CNT instruction,
/// and if so, which counter it is waiting on.
static std::optional<InstCounterType> counterTypeForInstr(unsigned Opcode) { … }

bool WaitcntGenerator::promoteSoftWaitCnt(MachineInstr *Waitcnt) const { … }

/// Combine consecutive S_WAITCNT and S_WAITCNT_VSCNT instructions that
/// precede \p It and follow \p OldWaitcntInstr and apply any extra waits
/// from \p Wait that were added by previous passes. Currently this pass
/// conservatively assumes that these preexisting waits are required for
/// correctness.
bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt(
    WaitcntBrackets &ScoreBrackets, MachineInstr &OldWaitcntInstr,
    AMDGPU::Waitcnt &Wait, MachineBasicBlock::instr_iterator It) const { … }

/// Generate S_WAITCNT and/or S_WAITCNT_VSCNT instructions for any
/// required counters in \p Wait.
bool WaitcntGeneratorPreGFX12::createNewWaitcnt(
    MachineBasicBlock &Block, MachineBasicBlock::instr_iterator It,
    AMDGPU::Waitcnt Wait) { … }

AMDGPU::Waitcnt
WaitcntGeneratorPreGFX12::getAllZeroWaitcnt(bool IncludeVSCnt) const { … }

AMDGPU::Waitcnt
WaitcntGeneratorGFX12Plus::getAllZeroWaitcnt(bool IncludeVSCnt) const { … }

/// Combine consecutive S_WAIT_*CNT instructions that precede \p It and
/// follow \p OldWaitcntInstr and apply any extra waits from \p Wait that
/// were added by previous passes. Currently this pass conservatively
/// assumes that these preexisting waits are required for correctness.
bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
    WaitcntBrackets &ScoreBrackets, MachineInstr &OldWaitcntInstr,
    AMDGPU::Waitcnt &Wait, MachineBasicBlock::instr_iterator It) const { … }

/// Generate S_WAIT_*CNT instructions for any required counters in \p Wait.
bool WaitcntGeneratorGFX12Plus::createNewWaitcnt(
    MachineBasicBlock &Block, MachineBasicBlock::instr_iterator It,
    AMDGPU::Waitcnt Wait) { … }

static bool readsVCCZ(const MachineInstr &MI) { … }

/// \returns true if the callee inserts an s_waitcnt 0 on function entry.
static bool callWaitsOnFunctionEntry(const MachineInstr &MI) { … }

/// \returns true if the callee is expected to wait for any outstanding waits
/// before returning.
static bool callWaitsOnFunctionReturn(const MachineInstr &MI) { … }
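// To make the split between the two WaitcntGenerator flavours above concrete
// (the mnemonics below are illustrative; the exact encodings depend on the
// target's ISA level): for the same set of required waits, the pre-gfx12
// generator emits combined instructions such as
//
//   s_waitcnt vmcnt(0) lgkmcnt(0)
//
// while the gfx12+ generator expresses each counter with its own wait, e.g.
//
//   s_wait_loadcnt 0x0
//   s_wait_dscnt 0x0
//
// which is why instruction generation is abstracted behind WaitcntGenerator
// rather than being inlined into the analysis.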
/// Generate s_waitcnt instruction to be placed before cur_Inst.
/// Instructions of a given type are returned in order,
/// but instructions of different types can complete out of order.
/// We rely on this in-order completion
/// and simply assign a score to the memory access instructions.
/// We keep track of the active "score bracket" to determine
/// if a memory access requires an s_waitcnt
/// and if so what the value of each counter is.
/// The "score bracket" is bound by the lower bound and upper bound
/// scores (*_score_LB and *_score_ub respectively).
/// If FlushVmCnt is true, that means that we want to generate an s_waitcnt to
/// flush the vmcnt counter here.
bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
                                                 WaitcntBrackets &ScoreBrackets,
                                                 MachineInstr *OldWaitcntInstr,
                                                 bool FlushVmCnt) { … }

bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait,
                                       MachineBasicBlock::instr_iterator It,
                                       MachineBasicBlock &Block,
                                       WaitcntBrackets &ScoreBrackets,
                                       MachineInstr *OldWaitcntInstr) { … }

// This is a flat memory operation. Check to see if it has memory tokens other
// than LDS. Other address spaces supported by flat memory operations involve
// global memory.
bool SIInsertWaitcnts::mayAccessVMEMThroughFlat(const MachineInstr &MI) const { … }

// This is a flat memory operation. Check to see if it has memory tokens for
// either LDS or FLAT.
bool SIInsertWaitcnts::mayAccessLDSThroughFlat(const MachineInstr &MI) const { … }

// This is a flat memory operation. Check to see if it has memory tokens for
// either scratch or FLAT.
bool SIInsertWaitcnts::mayAccessScratchThroughFlat(
    const MachineInstr &MI) const { … }

static bool isCacheInvOrWBInst(MachineInstr &Inst) { … }

void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
                                               WaitcntBrackets *ScoreBrackets) { … }

bool WaitcntBrackets::mergeScore(const MergeInfo &M, unsigned &Score,
                                 unsigned OtherScore) { … }

/// Merge the pending events and associated score brackets of \p Other into
/// this bracket's status.
///
/// Returns whether the merge resulted in a change that requires tighter waits
/// (i.e. the merged brackets strictly dominate the original brackets).
bool WaitcntBrackets::merge(const WaitcntBrackets &Other) { … }

static bool isWaitInstr(MachineInstr &Inst) { … }

// Generate s_waitcnt instructions where needed.
bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
                                            MachineBasicBlock &Block,
                                            WaitcntBrackets &ScoreBrackets) { … }

// Return true if the given machine basic block is a preheader of a loop in
// which we want to flush the vmcnt counter, and false otherwise.
bool SIInsertWaitcnts::isPreheaderToFlush(MachineBasicBlock &MBB,
                                          WaitcntBrackets &ScoreBrackets) { … }

bool SIInsertWaitcnts::isVMEMOrFlatVMEM(const MachineInstr &MI) const { … }

// Return true if it is better to flush the vmcnt counter in the preheader of
// the given loop. We currently decide to flush in two situations:
// 1. The loop contains vmem store(s), no vmem load and at least one use of a
//    vgpr containing a value that is loaded outside of the loop. (Only on
//    targets with no vscnt counter).
// 2. The loop contains vmem load(s), but the loaded values are not used in the
//    loop, and at least one use of a vgpr containing a value that is loaded
//    outside of the loop.
bool SIInsertWaitcnts::shouldFlushVmCnt(MachineLoop *ML,
                                        WaitcntBrackets &Brackets) { … }

bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) { … }