llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp

//===-- SIMachineScheduler.cpp - SI Scheduler Interface -------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// SI Machine Scheduler interface
//
//===----------------------------------------------------------------------===//

#include "SIMachineScheduler.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIInstrInfo.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"

using namespace llvm;

#define DEBUG_TYPE "machine-scheduler"

// This scheduler implements a different scheduling algorithm than
// GenericScheduler.
//
// There are several specific architecture behaviours that can't be modelled
// for GenericScheduler:
// . When accessing the result of an SGPR load instruction, you have to wait
// for all the SGPR load instructions before your current instruction to
// have finished.
// . When accessing the result of a VGPR load instruction, you have to wait
// for all the VGPR load instructions previous to the VGPR load instruction
// you are interested in to finish.
// . The lower the register pressure, the better the load latencies are hidden.
//
// Moreover, some specificities (like the fact that a lot of instructions in
// the shader have few dependencies) make the generic scheduler behave
// unpredictably. For example, when register pressure becomes high, it can
// either manage to keep it from going too high, or it can increase it even
// more than if it hadn't taken register pressure into account at all.
//
// It also produces other bad behaviours, like loading a constant into a VGPR
// at the beginning of the shader when it won't be needed until the end.
//
// The scheduling problem for SI can be split into three main parts:
// . Hiding high latencies (texture sampling, etc)
// . Hiding low latencies (SGPR constant loading, etc)
// . Keeping register usage low for better latency hiding and general
//   performance
//
// Some other things can also affect performance but are hard to predict
// (cache usage, the fact that the HW can issue several instructions from
// different wavefronts if they are of different types, etc.).
//
// This scheduler tries to solve the scheduling problem by dividing it into
// simpler sub-problems. It divides the instructions into blocks, schedules
// locally inside the blocks where it takes care of low latencies, and then
// chooses the order of the blocks by taking care of high latencies.
// Dividing the instructions into blocks helps keep register usage under
// control.
//
// First, the instructions are put into blocks.
//   We want the blocks to help control register usage and hide high latencies
//   later. To help control register usage, we typically want all local
//   computations, for example when you create a result that can be consumed
//   right away, to be contained in a block. Block inputs and outputs would
//   typically be important results that are needed in several locations of
//   the shader. Since we do want blocks to help hide high latencies, we want
//   the instructions inside a block to have a minimal set of dependencies
//   on high latencies. That makes it easy to pick blocks to hide specific
//   high latencies.
//   The block creation algorithm is divided into several steps, and several
//   variants can be tried during the scheduling process.
//
// Second, the order of the instructions inside the blocks is chosen.
//   At this step we only take into account register usage and hiding
//   low-latency instructions.
//
// Third, the block order is chosen; there we try to hide high latencies
// and keep register usage low.
//
// After the third step, a pass is done to improve the hiding of low
// latencies.
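//
// As a rough sketch of how those steps map onto the classes below
// (a simplified view; see SIScheduler::scheduleVariant for the actual
// driver):
//
//   SIScheduleBlocks Blocks = BlockCreator.getBlocks(BlockVariant); // 1 & 2
//   SIScheduleBlockScheduler Scheduler(DAG, ScheduleVariant, Blocks); // 3
//   DAG->moveLowLatencies(); // final low-latency improvement pass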
//
// Note that 'low latency' and 'high latency' here include both the latency
// for the data to travel from the cache (or global memory) to the register,
// and the bandwidth limitations.
// Increasing the number of active wavefronts helps hide the former, but it
// doesn't solve the latter, which is why, even if the wavefront count is
// high, we have to try to have as many instructions hiding high latencies
// as possible.
// The OpenCL doc gives, for example, a latency of 400 cycles for a global
// memory access, which is hidden by 10 instructions if the wavefront count
// is 10.
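// (One way to read that figure, with our own back-of-the-envelope arithmetic:
// a wavefront issues an instruction every 4 cycles, so with 10 wavefronts
// sharing a SIMD each instruction of a given wavefront covers about
// 4 * 10 = 40 cycles of wall-clock time, and 10 such instructions cover the
// 400 cycles.)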

// Some figures taken from AMD docs:
// Both texture and constant L1 caches are 4-way associative with 64-byte
// lines.
// Constant cache is shared with 4 CUs.
// For texture sampling, the address generation unit receives 4 texture
// addresses per cycle, thus we could expect texture sampling latency to be
// equivalent to 4 instructions in the very best case (a VGPR is 64 work items,
// instructions in a wavefront group are executed every 4 cycles),
// or 16 instructions if the other wavefronts associated with the 3 other
// VALUs of the CU do texture sampling too. (Don't take these figures too
// seriously, as I'm not 100% sure of the computation.)
// Data exports should get similar latency.
// For constant loading, the cache is shared with 4 CUs.
// The doc says "a throughput of 16B/cycle for each of the 4 Compute Unit".
// I guess if the other CUs don't read the cache, it can go up to 64B/cycle.
// It means a simple s_buffer_load should take one instruction to hide, as
// well as an s_buffer_loadx2 and potentially an s_buffer_loadx8 if on the
// same cache line.
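// (Back-of-the-envelope, our own arithmetic: at 16B/cycle and 4 cycles per
// issued wavefront instruction, one instruction slot covers 16 * 4 = 64B of
// constant-cache bandwidth, enough for the 4B, 8B, or 32B read by
// s_buffer_load, s_buffer_loadx2, and s_buffer_loadx8 respectively.)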
//
// As of today the driver doesn't preload the constants in cache, thus the
// first loads get extra latency. The doc says global memory access can be
// 300-600 cycles. We do not specifically take that into account when
// scheduling, as we expect the driver to be able to preload the constants
// soon.

// common code //

#ifndef NDEBUG

static const char *getReasonStr(SIScheduleCandReason Reason) {
  switch (Reason) {
  case NoCand:         return "NOCAND";
  case RegUsage:       return "REGUSAGE";
  case Latency:        return "LATENCY";
  case Successor:      return "SUCCESSOR";
  case Depth:          return "DEPTH";
  case NodeOrder:      return "ORDER";
  }
  llvm_unreachable("Unknown reason!");
}

#endif

namespace llvm::SISched {
static bool tryLess(int TryVal, int CandVal,
                    SISchedulerCandidate &TryCand,
                    SISchedulerCandidate &Cand,
                    SIScheduleCandReason Reason) {}

static bool tryGreater(int TryVal, int CandVal,
                       SISchedulerCandidate &TryCand,
                       SISchedulerCandidate &Cand,
                       SIScheduleCandReason Reason) {}
} // end namespace llvm::SISched
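
// For reference, the usual shape of such a tie-breaker, following
// GenericScheduler's tryLess/tryGreater in CodeGen/MachineScheduler.cpp
// (a sketch of the common pattern, not necessarily this scheduler's exact
// logic):
//
//   if (TryVal < CandVal) {
//     TryCand.Reason = Reason;   // TryCand wins on this criterion.
//     return true;
//   }
//   if (TryVal > CandVal) {
//     if (Cand.Reason > Reason)
//       Cand.Reason = Reason;    // Cand wins; record the stronger reason.
//     return true;
//   }
//   return false;                // Tie: fall through to the next criterion.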

// SIScheduleBlock //

void SIScheduleBlock::addUnit(SUnit *SU) {}

#ifndef NDEBUG
void SIScheduleBlock::traceCandidate(const SISchedCandidate &Cand) {
  dbgs() << "  SU(" << Cand.SU->NodeNum << ") " << getReasonStr(Cand.Reason);
  dbgs() << '\n';
}
#endif

void SIScheduleBlock::tryCandidateTopDown(SISchedCandidate &Cand,
                                          SISchedCandidate &TryCand) {}

SUnit* SIScheduleBlock::pickNode() {}


// Schedule something valid.
void SIScheduleBlock::fastSchedule() {}

// Returns true if the register was set between First and Last.
static bool isDefBetween(unsigned Reg,
                         SlotIndex First, SlotIndex Last,
                         const MachineRegisterInfo *MRI,
                         const LiveIntervals *LIS) {}
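
// One possible implementation, sketched with the standard MachineRegisterInfo
// and LiveIntervals APIs (illustrative, assuming a scan over the defs of Reg
// is acceptable):
//
//   for (MachineInstr &MI : MRI->def_instructions(Reg)) {
//     if (MI.isDebugValue())
//       continue;
//     SlotIndex S = LIS->getInstructionIndex(MI).getRegSlot();
//     if (S >= First && S <= Last)
//       return true;
//   }
//   return false;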

void SIScheduleBlock::initRegPressure(MachineBasicBlock::iterator BeginBlock,
                                      MachineBasicBlock::iterator EndBlock) {}

void SIScheduleBlock::schedule(MachineBasicBlock::iterator BeginBlock,
                               MachineBasicBlock::iterator EndBlock) {}

void SIScheduleBlock::undoSchedule() {}

void SIScheduleBlock::undoReleaseSucc(SUnit *SU, SDep *SuccEdge) {}

void SIScheduleBlock::releaseSucc(SUnit *SU, SDep *SuccEdge) {}

/// Release the successors of SU that are inside or outside the block,
/// depending on InOrOutBlock.
void SIScheduleBlock::releaseSuccessors(SUnit *SU, bool InOrOutBlock) {}

void SIScheduleBlock::nodeScheduled(SUnit *SU) {}

void SIScheduleBlock::finalizeUnits() {}

// We maintain ascending order of IDs.
void SIScheduleBlock::addPred(SIScheduleBlock *Pred) {}

void SIScheduleBlock::addSucc(SIScheduleBlock *Succ,
                              SIScheduleBlockLinkKind Kind) {}

#ifndef NDEBUG
void SIScheduleBlock::printDebug(bool full) {
  dbgs() << "Block (" << ID << ")\n";
  if (!full)
    return;

  dbgs() << "\nContains High Latency Instruction: "
         << HighLatencyBlock << '\n';
  dbgs() << "\nDepends On:\n";
  for (SIScheduleBlock* P : Preds) {
    P->printDebug(false);
  }

  dbgs() << "\nSuccessors:\n";
  for (std::pair<SIScheduleBlock*, SIScheduleBlockLinkKind> S : Succs) {
    if (S.second == SIScheduleBlockLinkKind::Data)
      dbgs() << "(Data Dep) ";
    S.first->printDebug(false);
  }

  if (Scheduled) {
    dbgs() << "LiveInPressure "
           << LiveInPressure[AMDGPU::RegisterPressureSets::SReg_32] << ' '
           << LiveInPressure[AMDGPU::RegisterPressureSets::VGPR_32] << '\n';
    dbgs() << "LiveOutPressure "
           << LiveOutPressure[AMDGPU::RegisterPressureSets::SReg_32] << ' '
           << LiveOutPressure[AMDGPU::RegisterPressureSets::VGPR_32] << "\n\n";
    dbgs() << "LiveIns:\n";
    for (unsigned Reg : LiveInRegs)
      dbgs() << printVRegOrUnit(Reg, DAG->getTRI()) << ' ';

    dbgs() << "\nLiveOuts:\n";
    for (unsigned Reg : LiveOutRegs)
      dbgs() << printVRegOrUnit(Reg, DAG->getTRI()) << ' ';
  }

  dbgs() << "\nInstructions:\n";
  for (const SUnit* SU : SUnits)
    DAG->dumpNode(*SU);

  dbgs() << "///////////////////////\n";
}
#endif

// SIScheduleBlockCreator //

SIScheduleBlockCreator::SIScheduleBlockCreator(SIScheduleDAGMI *DAG)
    : DAG(DAG) {}

SIScheduleBlocks
SIScheduleBlockCreator::getBlocks(SISchedulerBlockCreatorVariant BlockVariant) {}

bool SIScheduleBlockCreator::isSUInBlock(SUnit *SU, unsigned ID) {}

void SIScheduleBlockCreator::colorHighLatenciesAlone() {}

static bool
hasDataDependencyPred(const SUnit &SU, const SUnit &FromSU) {}

void SIScheduleBlockCreator::colorHighLatenciesGroups() {}

void SIScheduleBlockCreator::colorComputeReservedDependencies() {}

void SIScheduleBlockCreator::colorAccordingToReservedDependencies() {}

void SIScheduleBlockCreator::colorEndsAccordingToDependencies() {}


void SIScheduleBlockCreator::colorForceConsecutiveOrderInGroup() {}

void SIScheduleBlockCreator::colorMergeConstantLoadsNextGroup() {}

void SIScheduleBlockCreator::colorMergeIfPossibleNextGroup() {}

void SIScheduleBlockCreator::colorMergeIfPossibleNextGroupOnlyForReserved() {}

void SIScheduleBlockCreator::colorMergeIfPossibleSmallGroupsToNextGroup() {}

void SIScheduleBlockCreator::cutHugeBlocks() {}

void SIScheduleBlockCreator::regroupNoUserInstructions() {}

void SIScheduleBlockCreator::colorExports() {}

void SIScheduleBlockCreator::createBlocksForVariant(SISchedulerBlockCreatorVariant BlockVariant) {}

// Helper taken from CodeGen/MachineScheduler.cpp.

static MachineBasicBlock::iterator
nextIfDebug(MachineBasicBlock::iterator I,
            MachineBasicBlock::const_iterator End) {}
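
// The helper's intended behaviour, sketched (mirrors the MachineScheduler.cpp
// original): advance I past any leading debug instructions.
//
//   for (; I != End; ++I)
//     if (!I->isDebugInstr())
//       break;
//   return I;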

void SIScheduleBlockCreator::topologicalSort() {}

void SIScheduleBlockCreator::scheduleInsideBlocks() {}

void SIScheduleBlockCreator::fillStats() {}

// SIScheduleBlockScheduler //

SIScheduleBlockScheduler::SIScheduleBlockScheduler(SIScheduleDAGMI *DAG,
                                                   SISchedulerBlockSchedulerVariant Variant,
                                                   SIScheduleBlocks BlocksStruct)
    : DAG(DAG), Variant(Variant), Blocks(BlocksStruct.Blocks) {}

bool SIScheduleBlockScheduler::tryCandidateLatency(SIBlockSchedCandidate &Cand,
                                                   SIBlockSchedCandidate &TryCand) {}

bool SIScheduleBlockScheduler::tryCandidateRegUsage(SIBlockSchedCandidate &Cand,
                                                    SIBlockSchedCandidate &TryCand) {}

SIScheduleBlock *SIScheduleBlockScheduler::pickBlock() {}

// Tracking of currently alive registers to determine VGPR Usage.

void SIScheduleBlockScheduler::addLiveRegs(std::set<unsigned> &Regs) {}

void SIScheduleBlockScheduler::decreaseLiveRegs(SIScheduleBlock *Block,
                                                std::set<unsigned> &Regs) {}

void SIScheduleBlockScheduler::releaseBlockSuccs(SIScheduleBlock *Parent) {}

void SIScheduleBlockScheduler::blockScheduled(SIScheduleBlock *Block) {}

std::vector<int>
SIScheduleBlockScheduler::checkRegUsageImpact(std::set<unsigned> &InRegs,
                                              std::set<unsigned> &OutRegs) {}
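
// A sketch of what checkRegUsageImpact computes (an illustrative
// simplification): for every register pressure set, the delta caused by
// scheduling a block that consumes InRegs and produces OutRegs.
//
//   std::vector<int> Diff(DAG->getTRI()->getNumRegPressureSets(), 0);
//   for (unsigned Reg : InRegs)   // Last consumption frees the register.
//     for (PSetIterator P = MRI->getPressureSets(Reg); P.isValid(); ++P)
//       Diff[*P] -= P.getWeight();
//   for (unsigned Reg : OutRegs)  // A new definition occupies the sets.
//     for (PSetIterator P = MRI->getPressureSets(Reg); P.isValid(); ++P)
//       Diff[*P] += P.getWeight();
//   return Diff;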

// SIScheduler //

struct SIScheduleBlockResult
SIScheduler::scheduleVariant(SISchedulerBlockCreatorVariant BlockVariant,
                             SISchedulerBlockSchedulerVariant ScheduleVariant) {}

// SIScheduleDAGMI //

SIScheduleDAGMI::SIScheduleDAGMI(MachineSchedContext *C)
    : ScheduleDAGMILive(C, std::make_unique<GenericScheduler>(C)) {}

SIScheduleDAGMI::~SIScheduleDAGMI() = default;

// Code adapted from ScheduleDAG.cpp.
// Does a topological sort over the SUs, computing both the TopDown and
// BottomUp orders.
void SIScheduleDAGMI::topologicalSort() {}
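
// Conceptually a Kahn-style sort (a sketch of the general technique with
// illustrative names, not necessarily the exact adaptation used here):
// repeatedly emit SUs whose predecessors have all been emitted, for the
// TopDown order.
//
//   WorkList = { SU : SU has no predecessor };
//   while (!WorkList.empty()) {
//     SUnit *SU = WorkList.pop();
//     Order.push_back(SU->NodeNum);
//     for (SDep &D : SU->Succs)
//       if (--PredsLeft[D.getSUnit()->NodeNum] == 0)
//         WorkList.push(D.getSUnit());
//   }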

// Move low-latency instructions further from their users without
// increasing SGPR usage (in general).
// This is to be replaced by a better pass that would take into account
// SGPR usage (based on VGPR usage and the corresponding wavefront
// count), that would try to merge groups of loads if it makes sense,
// etc.
void SIScheduleDAGMI::moveLowLatencies() {}

void SIScheduleDAGMI::restoreSULinksLeft() {}

// Return the Vgpr and Sgpr usage corresponding to some virtual registers.
template<typename _Iterator> void
SIScheduleDAGMI::fillVgprSgprCost(_Iterator First, _Iterator End,
                                  unsigned &VgprUsage, unsigned &SgprUsage) {}
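
// A sketch of the idea (illustrative), using the pressure-set enumerators
// that also appear in printDebug() above; MRI is the DAG's
// MachineRegisterInfo:
//
//   for (; First != End; ++First) {
//     unsigned Reg = *First;
//     if (!Register::isVirtualRegister(Reg))
//       continue; // Only virtual registers are tracked.
//     PSetIterator PSetI = MRI.getPressureSets(Reg);
//     for (; PSetI.isValid(); ++PSetI) {
//       if (*PSetI == AMDGPU::RegisterPressureSets::SReg_32)
//         SgprUsage += PSetI.getWeight();
//       else if (*PSetI == AMDGPU::RegisterPressureSets::VGPR_32)
//         VgprUsage += PSetI.getWeight();
//     }
//   }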

void SIScheduleDAGMI::schedule() {}