//===- SIInsertWaitcnts.cpp - Insert Wait Instructions --------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Insert wait instructions for memory reads and writes.
///
/// Memory reads and writes are issued asynchronously, so we need to insert
/// S_WAITCNT instructions when we want to access any of their results or
/// overwrite any register that's used asynchronously.
///
/// TODO: This pass currently keeps one timeline per hardware counter. A more
/// finely-grained approach that keeps one timeline per event type could
/// sometimes get away with generating weaker s_waitcnt instructions. For
/// example, when both SMEM and LDS are in flight and we need to wait for
/// the i-th-last LDS instruction, then an lgkmcnt(i) is actually sufficient,
/// but the pass will currently generate a conservative lgkmcnt(0) because
/// multiple event types are in flight.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/Sequence.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachinePostDominators.h"
#include "llvm/InitializePasses.h"
#include "llvm/Support/DebugCounter.h"
#include "llvm/TargetParser/TargetParser.h"

using namespace llvm;

#define DEBUG_TYPE …

DEBUG_COUNTER(ForceExpCounter, DEBUG_TYPE "-forceexp",
              "Force emit s_waitcnt expcnt(0) instrs");
DEBUG_COUNTER(ForceLgkmCounter, DEBUG_TYPE "-forcelgkm",
              "Force emit s_waitcnt lgkmcnt(0) instrs");
DEBUG_COUNTER(ForceVMCounter, DEBUG_TYPE "-forcevm",
              "Force emit s_waitcnt vmcnt(0) instrs");

static cl::opt<bool> ForceEmitZeroFlag(
    "amdgpu-waitcnt-forcezero",
    cl::desc("Force all waitcnt instrs to be emitted as "
             "s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"),
    cl::init(false), cl::Hidden);

namespace {
// Class of object that encapsulates latest instruction counter score
// associated with the operand. Used for determining whether
// s_waitcnt instruction needs to be emitted.
enum InstCounterType { … };
} // namespace

namespace llvm {
template <> struct enum_iteration_traits<InstCounterType> { … };
} // namespace llvm

namespace {
// Return an iterator over all counters between LOAD_CNT (the first counter)
// and \c MaxCounter (exclusive, default value yields an enumeration over
// all counters).
auto inst_counter_types(InstCounterType MaxCounter = NUM_INST_CNTS) { … }

using RegInterval = std::pair<int, int>;

struct HardwareLimits { … };

struct RegisterEncoding { … };

enum WaitEventType { … };

// The mapping is:
//  0                .. SQ_MAX_PGM_VGPRS-1               real VGPRs
//  SQ_MAX_PGM_VGPRS .. NUM_ALL_VGPRS-1                  extra VGPR-like slots
//  NUM_ALL_VGPRS    .. NUM_ALL_VGPRS+SQ_MAX_PGM_SGPRS-1 real SGPRs
// We reserve a fixed number of VGPR slots in the scoring tables for
// special tokens like SCMEM_LDS (needed for buffer load to LDS).
enum RegisterMapping { … };
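// As a rough illustration of the numbering above (the helper below is
// hypothetical and for exposition only, not part of the pass): all registers
// share one flat index space, with SGPRs placed after the VGPR-like slots, so
// a single score table per counter can cover both register files.
//
//   unsigned flatRegSlot(bool IsSGPR, unsigned RegNo) {
//     // VGPR v<N> -> slot N; SGPR s<N> -> slot NUM_ALL_VGPRS + N.
//     return IsSGPR ? NUM_ALL_VGPRS + RegNo : RegNo;
//   }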
// Enumerate different types of result-returning VMEM operations. Although
// s_waitcnt orders them all with a single vmcnt counter, in the absence of
// s_waitcnt only instructions of the same VmemType are guaranteed to write
// their results in order -- so there is no need to insert an s_waitcnt between
// two instructions of the same type that write the same vgpr.
enum VmemType { … };

// Maps values of InstCounterType to the instruction that waits on that
// counter. Only used if GCNSubtarget::hasExtendedWaitCounts()
// returns true.
static const unsigned instrsForExtendedCounterTypes[NUM_EXTENDED_INST_CNTS] = …;

static bool updateVMCntOnly(const MachineInstr &Inst) { … }

#ifndef NDEBUG
static bool isNormalMode(InstCounterType MaxCounter) {
  return MaxCounter == NUM_NORMAL_INST_CNTS;
}
#endif // NDEBUG

VmemType getVmemType(const MachineInstr &Inst) { … }

unsigned &getCounterRef(AMDGPU::Waitcnt &Wait, InstCounterType T) { … }

void addWait(AMDGPU::Waitcnt &Wait, InstCounterType T, unsigned Count) { … }

void setNoWait(AMDGPU::Waitcnt &Wait, InstCounterType T) { … }

unsigned getWait(AMDGPU::Waitcnt &Wait, InstCounterType T) { … }

// Mapping from event to counter according to the table masks.
InstCounterType eventCounter(const unsigned *masks, WaitEventType E) { … }

// This object maintains the current score brackets of each wait counter, and
// a per-register scoreboard for each wait counter.
//
// We also maintain the latest score for every event type that can change the
// waitcnt in order to know if there are multiple types of events within
// the brackets. When multiple types of event happen in the bracket,
// wait count may get decreased out of order, therefore we need to put in
// "s_waitcnt 0" before use.
class WaitcntBrackets { … };

// This abstracts the logic for generating and updating S_WAIT* instructions
// away from the analysis that determines where they are needed. This was
// done because the set of counters and instructions for waiting on them
// underwent a major shift with gfx12, sufficiently so that having this
// abstraction allows the main analysis logic to be simpler than it would
// otherwise have had to become.
class WaitcntGenerator { … };

class WaitcntGeneratorPreGFX12 : public WaitcntGenerator { … };

class WaitcntGeneratorGFX12Plus : public WaitcntGenerator { … };

class SIInsertWaitcnts : public MachineFunctionPass { … };

} // end anonymous namespace

RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI,
                                            const MachineRegisterInfo *MRI,
                                            const SIRegisterInfo *TRI,
                                            const MachineOperand &Op) const { … }

void WaitcntBrackets::setScoreByInterval(RegInterval Interval,
                                         InstCounterType CntTy,
                                         unsigned Score) { … }

void WaitcntBrackets::setScoreByOperand(const MachineInstr *MI,
                                        const SIRegisterInfo *TRI,
                                        const MachineRegisterInfo *MRI,
                                        const MachineOperand &Op,
                                        InstCounterType CntTy,
                                        unsigned Score) { … }

void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
                                    const SIRegisterInfo *TRI,
                                    const MachineRegisterInfo *MRI,
                                    WaitEventType E, MachineInstr &Inst) { … }

void WaitcntBrackets::print(raw_ostream &OS) { … }
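// A worked example of the score-bracket idea described on WaitcntBrackets
// (the numbers are illustrative and the sketch is a simplification of what
// determineWait computes): if three VMEM loads have been issued since the
// last wait, they receive scores UB-2, UB-1 and UB for the load counter. To
// consume the result of the oldest one it is enough to let the two younger
// loads stay outstanding, i.e.
//
//   // Pending ops of counter type T that may remain in flight while the
//   // operation that produced RegScore is guaranteed complete:
//   unsigned neededCount(unsigned UB, unsigned RegScore) {
//     return UB - RegScore; // here UB - (UB - 2) == 2, i.e. vmcnt(2)
//   }
//
// so the pass can emit s_waitcnt vmcnt(2) rather than a full vmcnt(0),
// unless the counter is decremented out of order (see counterOutOfOrder).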
/// Simplify the waitcnt, in the sense of removing redundant counts, and return
/// whether a waitcnt instruction is needed at all.
void WaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const { … }

void WaitcntBrackets::simplifyWaitcnt(InstCounterType T,
                                      unsigned &Count) const { … }

void WaitcntBrackets::determineWait(InstCounterType T, RegInterval Interval,
                                    AMDGPU::Waitcnt &Wait) const { … }

void WaitcntBrackets::applyWaitcnt(const AMDGPU::Waitcnt &Wait) { … }

void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) { … }

// Where there are multiple types of event in the bracket of a counter,
// the decrement may go out of order.
bool WaitcntBrackets::counterOutOfOrder(InstCounterType T) const { … }

INITIALIZE_PASS_BEGIN(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
                      false)
INITIALIZE_PASS_DEPENDENCY(MachineLoopInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTreeWrapperPass)
INITIALIZE_PASS_END(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
                    false)

char SIInsertWaitcnts::ID = …;

char &llvm::SIInsertWaitcntsID = …;

FunctionPass *llvm::createSIInsertWaitcntsPass() { … }

static bool updateOperandIfDifferent(MachineInstr &MI, uint16_t OpName,
                                     unsigned NewEnc) { … }

/// Determine if \p MI is a gfx12+ single-counter S_WAIT_*CNT instruction,
/// and if so, which counter it is waiting on.
static std::optional<InstCounterType> counterTypeForInstr(unsigned Opcode) { … }

bool WaitcntGenerator::promoteSoftWaitCnt(MachineInstr *Waitcnt) const { … }

/// Combine consecutive S_WAITCNT and S_WAITCNT_VSCNT instructions that
/// precede \p It and follow \p OldWaitcntInstr and apply any extra waits
/// from \p Wait that were added by previous passes. Currently this pass
/// conservatively assumes that these preexisting waits are required for
/// correctness.
bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt(
    WaitcntBrackets &ScoreBrackets, MachineInstr &OldWaitcntInstr,
    AMDGPU::Waitcnt &Wait, MachineBasicBlock::instr_iterator It) const { … }

/// Generate S_WAITCNT and/or S_WAITCNT_VSCNT instructions for any
/// required counters in \p Wait.
bool WaitcntGeneratorPreGFX12::createNewWaitcnt(
    MachineBasicBlock &Block, MachineBasicBlock::instr_iterator It,
    AMDGPU::Waitcnt Wait) { … }

AMDGPU::Waitcnt
WaitcntGeneratorPreGFX12::getAllZeroWaitcnt(bool IncludeVSCnt) const { … }

AMDGPU::Waitcnt
WaitcntGeneratorGFX12Plus::getAllZeroWaitcnt(bool IncludeVSCnt) const { … }

/// Combine consecutive S_WAIT_*CNT instructions that precede \p It and
/// follow \p OldWaitcntInstr and apply any extra waits from \p Wait that
/// were added by previous passes. Currently this pass conservatively
/// assumes that these preexisting waits are required for correctness.
bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
    WaitcntBrackets &ScoreBrackets, MachineInstr &OldWaitcntInstr,
    AMDGPU::Waitcnt &Wait, MachineBasicBlock::instr_iterator It) const { … }

/// Generate S_WAIT_*CNT instructions for any required counters in \p Wait.
bool WaitcntGeneratorGFX12Plus::createNewWaitcnt(
    MachineBasicBlock &Block, MachineBasicBlock::instr_iterator It,
    AMDGPU::Waitcnt Wait) { … }

static bool readsVCCZ(const MachineInstr &MI) { … }

/// \returns true if the callee inserts an s_waitcnt 0 on function entry.
static bool callWaitsOnFunctionEntry(const MachineInstr &MI) { … }

/// \returns true if the callee is expected to wait for any outstanding waits
/// before returning.
static bool callWaitsOnFunctionReturn(const MachineInstr &MI) { … }
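// To make the split between the two WaitcntGenerator flavours above concrete
// (the mnemonics below are illustrative; the exact encodings depend on the
// target's ISA level): for the same set of required waits, the pre-gfx12
// generator emits combined instructions such as
//
//   s_waitcnt vmcnt(0) lgkmcnt(0)
//
// while the gfx12+ generator expresses each counter with its own wait, e.g.
//
//   s_wait_loadcnt 0x0
//   s_wait_dscnt 0x0
//
// which is why instruction generation is abstracted behind WaitcntGenerator
// rather than being inlined into the analysis.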
/// Generate s_waitcnt instruction to be placed before cur_Inst.
/// Instructions of a given type are returned in order,
/// but instructions of different types can complete out of order.
/// We rely on this in-order completion
/// and simply assign a score to the memory access instructions.
/// We keep track of the active "score bracket" to determine
/// if a memory access requires an s_waitcnt
/// and if so what the value of each counter is.
/// The "score bracket" is bound by the lower bound and upper bound
/// scores (*_score_LB and *_score_ub respectively).
/// If FlushVmCnt is true, that means that we want to generate an s_waitcnt to
/// flush the vmcnt counter here.
bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
                                                 WaitcntBrackets &ScoreBrackets,
                                                 MachineInstr *OldWaitcntInstr,
                                                 bool FlushVmCnt) { … }

bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait,
                                       MachineBasicBlock::instr_iterator It,
                                       MachineBasicBlock &Block,
                                       WaitcntBrackets &ScoreBrackets,
                                       MachineInstr *OldWaitcntInstr) { … }

// This is a flat memory operation. Check to see if it has memory tokens other
// than LDS. Other address spaces supported by flat memory operations involve
// global memory.
bool SIInsertWaitcnts::mayAccessVMEMThroughFlat(const MachineInstr &MI) const { … }

// This is a flat memory operation. Check to see if it has memory tokens for
// either LDS or FLAT.
bool SIInsertWaitcnts::mayAccessLDSThroughFlat(const MachineInstr &MI) const { … }

// This is a flat memory operation. Check to see if it has memory tokens for
// either scratch or FLAT.
bool SIInsertWaitcnts::mayAccessScratchThroughFlat(
    const MachineInstr &MI) const { … }

static bool isCacheInvOrWBInst(MachineInstr &Inst) { … }

void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
                                               WaitcntBrackets *ScoreBrackets) { … }

bool WaitcntBrackets::mergeScore(const MergeInfo &M, unsigned &Score,
                                 unsigned OtherScore) { … }

/// Merge the pending events and associated score brackets of \p Other into
/// this bracket's status.
///
/// Returns whether the merge resulted in a change that requires tighter waits
/// (i.e. the merged brackets strictly dominate the original brackets).
bool WaitcntBrackets::merge(const WaitcntBrackets &Other) { … }

static bool isWaitInstr(MachineInstr &Inst) { … }

// Generate s_waitcnt instructions where needed.
bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
                                            MachineBasicBlock &Block,
                                            WaitcntBrackets &ScoreBrackets) { … }

// Return true if the given machine basic block is a preheader of a loop in
// which we want to flush the vmcnt counter, and false otherwise.
bool SIInsertWaitcnts::isPreheaderToFlush(MachineBasicBlock &MBB,
                                          WaitcntBrackets &ScoreBrackets) { … }

bool SIInsertWaitcnts::isVMEMOrFlatVMEM(const MachineInstr &MI) const { … }

// Return true if it is better to flush the vmcnt counter in the preheader of
// the given loop. We currently decide to flush in two situations:
// 1. The loop contains vmem store(s), no vmem load and at least one use of a
//    vgpr containing a value that is loaded outside of the loop. (Only on
//    targets with no vscnt counter).
// 2. The loop contains vmem load(s), but the loaded values are not used in the
//    loop, and at least one use of a vgpr containing a value that is loaded
//    outside of the loop.
bool SIInsertWaitcnts::shouldFlushVmCnt(MachineLoop *ML,
                                        WaitcntBrackets &Brackets) { … }

bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) { … }