//===-- SIMachineScheduler.cpp - SI Scheduler Interface -------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// SI Machine Scheduler interface
//
//===----------------------------------------------------------------------===//

#include "SIMachineScheduler.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIInstrInfo.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"

using namespace llvm;

#define DEBUG_TYPE …

// This scheduler implements a different scheduling algorithm than
// GenericScheduler.
//
// There are several architecture-specific behaviours that can't be modelled
// by GenericScheduler:
// . When accessing the result of an SGPR load instruction, you have to wait
//   for all the SGPR load instructions before your current instruction to
//   have finished.
// . When accessing the result of a VGPR load instruction, you have to wait
//   for all the VGPR load instructions previous to the VGPR load instruction
//   you are interested in to finish.
// . The lower the register pressure, the better load latencies are hidden.
//
// Moreover, some specificities (like the fact that a lot of instructions in
// the shader have few dependencies) make the generic scheduler behave
// unpredictably. For example, when register pressure becomes high, it can
// either manage to prevent register pressure from going too high, or it can
// increase register pressure even more than if it hadn't taken register
// pressure into account at all.
//
// It also generates some other bad behaviours, like loading a constant into
// a VGPR at the beginning of the shader even though it won't be needed until
// the end of the shader.
//
// The scheduling problem for SI can be divided into three main parts:
// . Hiding high latencies (texture sampling, etc.)
// . Hiding low latencies (SGPR constant loading, etc.)
// . Keeping register usage low for better latency hiding and general
//   performance
//
// Some other things can also affect performance, but are hard to predict
// (cache usage, the fact that the HW can issue several instructions from
// different wavefronts if they are of different types, etc.)
//
// This scheduler tries to solve the scheduling problem by dividing it into
// simpler sub-problems. It divides the instructions into blocks, schedules
// locally inside the blocks where it takes care of low latencies, and then
// chooses the order of the blocks by taking care of high latencies.
// Dividing the instructions into blocks also helps keep register usage low.
//
// First, the instructions are put into blocks.
// We want the blocks to help control register usage and hide high latencies
// later. To help control register usage, we typically want all local
// computations, when for example you create a result that can be consumed
// right away, to be contained in a block. Block inputs and outputs would
// typically be important results that are needed in several locations of
// the shader. Since we do want blocks to help hide high latencies, we want
// the instructions inside the block to have a minimal set of dependencies
// on high latencies. That makes it easy to pick blocks to hide specific
// high latencies (see the illustrative sketch below).
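//
// A minimal sketch (illustrative only, not code from this file) of the kind
// of classification block creation relies on: tag each SUnit whose
// instruction has a high latency, so blocks can be formed with a minimal
// set of intra-block dependencies on them. The helper name and the
// IsHighLatencySU side table are assumptions made for the example.
#if 0
static void exampleTagHighLatencies(SIScheduleDAGMI *DAG,
                                    const SIInstrInfo *SITII,
                                    std::vector<unsigned> &IsHighLatencySU) {
  for (SUnit &SU : DAG->SUnits)
    // Assumed query: SIInstrInfo classifies high latency instructions
    // (e.g. texture sampling); the table is indexed by the SUnit's NodeNum.
    IsHighLatencySU[SU.NodeNum] =
        SITII->isHighLatencyInstruction(*SU.getInstr());
}
#endif
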
// The block creation algorithm is divided into several steps, and several
// variants can be tried during the scheduling process.
//
// Second, the order of the instructions inside the blocks is chosen.
// At that step we take into account only register usage and the hiding of
// low latency instructions.
//
// Third, the block order is chosen. There we try to hide high latencies
// and keep register usage low.
//
// After the third step, a pass is done to improve the hiding of low
// latencies.
//
// Actually, when talking about 'low latency' or 'high latency', this
// includes both the latency for the cache (or global memory) data to reach
// the register, and the bandwidth limitations.
// Increasing the number of active wavefronts helps hide the former, but it
// doesn't solve the latter, which is why, even if the wavefront count is
// high, we have to try to have as many instructions hiding high latencies
// as possible.
// The OpenCL doc gives, for example, a latency of 400 cycles for a global
// memory access, which is hidden by 10 instructions if the wavefront count
// is 10.
//
// Some figures taken from AMD docs:
// Both texture and constant L1 caches are 4-way associative with 64-byte
// lines.
// The constant cache is shared between 4 CUs.
// For texture sampling, the address generation unit receives 4 texture
// addresses per cycle, thus we could expect texture sampling latency to be
// equivalent to 4 instructions in the very best case (a VGPR is 64 work
// items, instructions in a wavefront group are executed every 4 cycles),
// or 16 instructions if the other wavefronts associated with the 3 other
// VALUs of the CU do texture sampling too. (Don't take these figures too
// seriously, as I'm not 100% sure of the computation.)
// Data exports should get similar latency.
// For constant loading, the cache is shared between 4 CUs.
// The doc says "a throughput of 16B/cycle for each of the 4 Compute Unit".
// I guess if the other CUs don't read the cache, it can go up to 64B/cycle.
// It means a simple s_buffer_load should take one instruction to hide, as
// well as an s_buffer_loadx2, and potentially an s_buffer_loadx8 if on the
// same cache line.
//
// As of today the driver doesn't preload the constants in cache, thus the
// first loads get extra latency. The doc says global memory access can take
// 300-600 cycles. We do not specially take that into account when
// scheduling, as we expect the driver to be able to preload the constants
// soon.

// common code //

#ifndef NDEBUG

static const char *getReasonStr(SIScheduleCandReason Reason) {
  switch (Reason) {
  case NoCand:         return "NOCAND";
  case RegUsage:       return "REGUSAGE";
  case Latency:        return "LATENCY";
  case Successor:      return "SUCCESSOR";
  case Depth:          return "DEPTH";
  case NodeOrder:      return "ORDER";
  }
  llvm_unreachable("Unknown reason!");
}

#endif

namespace llvm::SISched {
static bool tryLess(int TryVal, int CandVal,
                    SISchedulerCandidate &TryCand,
                    SISchedulerCandidate &Cand,
                    SIScheduleCandReason Reason) { … }

static bool tryGreater(int TryVal, int CandVal,
                       SISchedulerCandidate &TryCand,
                       SISchedulerCandidate &Cand,
                       SIScheduleCandReason Reason) { … }
} // end namespace llvm::SISched

// SIScheduleBlock //

void SIScheduleBlock::addUnit(SUnit *SU) { … }

#ifndef NDEBUG
void SIScheduleBlock::traceCandidate(const SISchedCandidate &Cand) {
  dbgs() << " SU(" << Cand.SU->NodeNum << ") " << getReasonStr(Cand.Reason);
  dbgs() << '\n';
}
#endif

void SIScheduleBlock::tryCandidateTopDown(SISchedCandidate &Cand,
                                          SISchedCandidate &TryCand) { … }

SUnit* SIScheduleBlock::pickNode() { … }
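
// A minimal sketch (not the elided implementations above) of the
// tie-breaking pattern the SISched::tryLess/tryGreater helpers follow when
// tryCandidateTopDown compares candidates: prefer the candidate that wins on
// the current criterion, record the deciding reason, and fall through to the
// next criterion on a tie. `tryLessSketch` is a hypothetical name used only
// for illustration.
#if 0
static bool tryLessSketch(int TryVal, int CandVal,
                          SISchedulerCandidate &TryCand,
                          SISchedulerCandidate &Cand,
                          SIScheduleCandReason Reason) {
  if (TryVal < CandVal) {
    TryCand.Reason = Reason; // TryCand wins on this criterion.
    return true;
  }
  if (TryVal > CandVal) {
    if (Cand.Reason > Reason)
      Cand.Reason = Reason;  // Cand wins; remember the strongest reason.
    return true;
  }
  return false;              // Tie: let the next criterion decide.
}
#endif
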
// Schedule something valid.
void SIScheduleBlock::fastSchedule() { … }

// Returns whether the register was set between First and Last.
static bool isDefBetween(unsigned Reg,
                         SlotIndex First, SlotIndex Last,
                         const MachineRegisterInfo *MRI,
                         const LiveIntervals *LIS) { … }

void SIScheduleBlock::initRegPressure(MachineBasicBlock::iterator BeginBlock,
                                      MachineBasicBlock::iterator EndBlock) { … }

void SIScheduleBlock::schedule(MachineBasicBlock::iterator BeginBlock,
                               MachineBasicBlock::iterator EndBlock) { … }

void SIScheduleBlock::undoSchedule() { … }

void SIScheduleBlock::undoReleaseSucc(SUnit *SU, SDep *SuccEdge) { … }

void SIScheduleBlock::releaseSucc(SUnit *SU, SDep *SuccEdge) { … }

/// Release the successors of SU, either those inside the block or those
/// outside of it, depending on InOrOutBlock.
void SIScheduleBlock::releaseSuccessors(SUnit *SU, bool InOrOutBlock) { … }

void SIScheduleBlock::nodeScheduled(SUnit *SU) { … }

void SIScheduleBlock::finalizeUnits() { … }

// We maintain ascending order of IDs.
void SIScheduleBlock::addPred(SIScheduleBlock *Pred) { … }

void SIScheduleBlock::addSucc(SIScheduleBlock *Succ,
                              SIScheduleBlockLinkKind Kind) { … }

#ifndef NDEBUG
void SIScheduleBlock::printDebug(bool full) {
  dbgs() << "Block (" << ID << ")\n";
  if (!full)
    return;

  dbgs() << "\nContains High Latency Instruction: "
         << HighLatencyBlock << '\n';
  dbgs() << "\nDepends On:\n";
  for (SIScheduleBlock* P : Preds) {
    P->printDebug(false);
  }

  dbgs() << "\nSuccessors:\n";
  for (std::pair<SIScheduleBlock*, SIScheduleBlockLinkKind> S : Succs) {
    if (S.second == SIScheduleBlockLinkKind::Data)
      dbgs() << "(Data Dep) ";
    S.first->printDebug(false);
  }

  if (Scheduled) {
    dbgs() << "LiveInPressure "
           << LiveInPressure[AMDGPU::RegisterPressureSets::SReg_32] << ' '
           << LiveInPressure[AMDGPU::RegisterPressureSets::VGPR_32] << '\n';
    dbgs() << "LiveOutPressure "
           << LiveOutPressure[AMDGPU::RegisterPressureSets::SReg_32] << ' '
           << LiveOutPressure[AMDGPU::RegisterPressureSets::VGPR_32] << "\n\n";
    dbgs() << "LiveIns:\n";
    for (unsigned Reg : LiveInRegs)
      dbgs() << printVRegOrUnit(Reg, DAG->getTRI()) << ' ';

    dbgs() << "\nLiveOuts:\n";
    for (unsigned Reg : LiveOutRegs)
      dbgs() << printVRegOrUnit(Reg, DAG->getTRI()) << ' ';
  }

  dbgs() << "\nInstructions:\n";
  for (const SUnit* SU : SUnits)
    DAG->dumpNode(*SU);

  dbgs() << "///////////////////////\n";
}
#endif

// SIScheduleBlockCreator //

SIScheduleBlockCreator::SIScheduleBlockCreator(SIScheduleDAGMI *DAG)
    : … { … }

SIScheduleBlocks
SIScheduleBlockCreator::getBlocks(SISchedulerBlockCreatorVariant BlockVariant) { … }

bool SIScheduleBlockCreator::isSUInBlock(SUnit *SU, unsigned ID) { … }

void SIScheduleBlockCreator::colorHighLatenciesAlone() { … }

static bool hasDataDependencyPred(const SUnit &SU, const SUnit &FromSU) { … }

void SIScheduleBlockCreator::colorHighLatenciesGroups() { … }

void SIScheduleBlockCreator::colorComputeReservedDependencies() { … }

void SIScheduleBlockCreator::colorAccordingToReservedDependencies() { … }

void SIScheduleBlockCreator::colorEndsAccordingToDependencies() { … }

void SIScheduleBlockCreator::colorForceConsecutiveOrderInGroup() { … }

void SIScheduleBlockCreator::colorMergeConstantLoadsNextGroup() { … }

void SIScheduleBlockCreator::colorMergeIfPossibleNextGroup() { … }

void SIScheduleBlockCreator::colorMergeIfPossibleNextGroupOnlyForReserved() { … }

void SIScheduleBlockCreator::colorMergeIfPossibleSmallGroupsToNextGroup() { … }

void SIScheduleBlockCreator::cutHugeBlocks() { … }

void SIScheduleBlockCreator::regroupNoUserInstructions() { … }

void SIScheduleBlockCreator::colorExports() { … }
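
// A minimal sketch, under assumed simplified types, of the "ascending order
// of IDs" invariant noted above for addPred: insert at the sorted position
// and skip duplicates. `addPredSorted` is a hypothetical free function for
// illustration; the real member works on SIScheduleBlock's own fields.
#if 0
static void addPredSorted(std::vector<SIScheduleBlock *> &Preds,
                          SIScheduleBlock *Pred) {
  auto It = llvm::lower_bound(
      Preds, Pred, [](const SIScheduleBlock *A, const SIScheduleBlock *B) {
        return A->getID() < B->getID();
      });
  if (It == Preds.end() || (*It)->getID() != Pred->getID())
    Preds.insert(It, Pred); // Keeps the list sorted and duplicate-free.
}
#endif
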
void SIScheduleBlockCreator::createBlocksForVariant(
    SISchedulerBlockCreatorVariant BlockVariant) { … }

// Two functions taken from Codegen/MachineScheduler.cpp

/// Non-const version.
static MachineBasicBlock::iterator
nextIfDebug(MachineBasicBlock::iterator I,
            MachineBasicBlock::const_iterator End) { … }

void SIScheduleBlockCreator::topologicalSort() { … }

void SIScheduleBlockCreator::scheduleInsideBlocks() { … }

void SIScheduleBlockCreator::fillStats() { … }

// SIScheduleBlockScheduler //

SIScheduleBlockScheduler::SIScheduleBlockScheduler(
    SIScheduleDAGMI *DAG, SISchedulerBlockSchedulerVariant Variant,
    SIScheduleBlocks BlocksStruct)
    : … { … }

bool SIScheduleBlockScheduler::tryCandidateLatency(
    SIBlockSchedCandidate &Cand, SIBlockSchedCandidate &TryCand) { … }

bool SIScheduleBlockScheduler::tryCandidateRegUsage(
    SIBlockSchedCandidate &Cand, SIBlockSchedCandidate &TryCand) { … }

SIScheduleBlock *SIScheduleBlockScheduler::pickBlock() { … }

// Tracking of currently live registers to determine VGPR usage.

void SIScheduleBlockScheduler::addLiveRegs(std::set<unsigned> &Regs) { … }

void SIScheduleBlockScheduler::decreaseLiveRegs(SIScheduleBlock *Block,
                                                std::set<unsigned> &Regs) { … }

void SIScheduleBlockScheduler::releaseBlockSuccs(SIScheduleBlock *Parent) { … }

void SIScheduleBlockScheduler::blockScheduled(SIScheduleBlock *Block) { … }

std::vector<int>
SIScheduleBlockScheduler::checkRegUsageImpact(std::set<unsigned> &InRegs,
                                              std::set<unsigned> &OutRegs) { … }

// SIScheduler //

struct SIScheduleBlockResult
SIScheduler::scheduleVariant(SISchedulerBlockCreatorVariant BlockVariant,
                             SISchedulerBlockSchedulerVariant ScheduleVariant) { … }

// SIScheduleDAGMI //

SIScheduleDAGMI::SIScheduleDAGMI(MachineSchedContext *C)
    : … { … }

SIScheduleDAGMI::~SIScheduleDAGMI() = default;

// Code adapted from scheduleDAG.cpp
// Does a topological sort over the SUs.
// Both TopDown and BottomUp.
void SIScheduleDAGMI::topologicalSort() { … }

// Move low latencies further from their user without increasing SGPR usage
// (in general). This is to be replaced by a better pass that would take
// SGPR usage into account (based on VGPR usage and the corresponding
// wavefront count), and that would try to merge groups of loads if it makes
// sense, etc.
void SIScheduleDAGMI::moveLowLatencies() { … }

void SIScheduleDAGMI::restoreSULinksLeft() { … }

// Return the VGPR and SGPR usage corresponding to some virtual registers.
template<typename _Iterator> void
SIScheduleDAGMI::fillVgprSgprCost(_Iterator First, _Iterator End,
                                  unsigned &VgprUsage,
                                  unsigned &SgprUsage) { … }

void SIScheduleDAGMI::schedule() { … }
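
// A minimal sketch, assuming the interfaces declared in SIMachineScheduler.h,
// of how the pieces above compose for one variant pair: block creation
// (phase 1), intra-block scheduling (phase 2), and inter-block ordering
// (phase 3), yielding the final instruction order. The specific enum values
// and the `exampleVariantRun` helper are illustrative assumptions, not the
// actual driver logic inside schedule().
#if 0
static std::vector<unsigned> exampleVariantRun(SIScheduleDAGMI &DAG) {
  SIScheduler Scheduler(&DAG);
  // scheduleVariant runs all three phases for the requested variants.
  SIScheduleBlockResult Res = Scheduler.scheduleVariant(
      SISchedulerBlockCreatorVariant::LatenciesGrouped,
      SISchedulerBlockSchedulerVariant::BlockLatencyRegUsage);
  return Res.SUs; // SUnit NodeNums in their final scheduled order.
}
#endif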