MachineScheduler.cpp | Explore in Territory

//===- MachineScheduler.cpp - Machine Instruction Scheduler ---------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// MachineScheduler schedules machine instructions after phi elimination. It
// preserves LiveIntervals so it can be invoked before register allocation.
//
//===----------------------------------------------------------------------===//

#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/PriorityQueue.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/CodeGen/LiveInterval.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachinePassRegistry.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/RegisterClassInfo.h"
#include "llvm/CodeGen/RegisterPressure.h"
#include "llvm/CodeGen/ScheduleDAG.h"
#include "llvm/CodeGen/ScheduleDAGInstrs.h"
#include "llvm/CodeGen/ScheduleDAGMutation.h"
#include "llvm/CodeGen/ScheduleDFS.h"
#include "llvm/CodeGen/ScheduleHazardRecognizer.h"
#include "llvm/CodeGen/SlotIndexes.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSchedule.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/CodeGenTypes/MachineValueType.h"
#include "llvm/Config/llvm-config.h"
#include "llvm/InitializePasses.h"
#include "llvm/MC/LaneBitmask.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/GraphWriter.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <iterator>
#include <limits>
#include <memory>
#include <string>
#include <tuple>
#include <utility>
#include <vector>

usingnamespacellvm;

#define DEBUG_TYPE …

STATISTIC(NumClustered, "Number of load/store pairs clustered");

namespace llvm {

cl::opt<bool> ForceTopDown("misched-topdown", cl::Hidden,
                           cl::desc("Force top-down list scheduling"));
cl::opt<bool> ForceBottomUp("misched-bottomup", cl::Hidden,
                            cl::desc("Force bottom-up list scheduling"));
namespace MISchedPostRASched {
enum Direction { … };
} // end namespace MISchedPostRASched
cl::opt<MISchedPostRASched::Direction> PostRADirection(
    "misched-postra-direction", cl::Hidden,
    cl::desc("Post reg-alloc list scheduling direction"),
    // Default to top-down because it was implemented first and existing targets
    // expect that behavior by default.
    cl::init(MISchedPostRASched::TopDown),
    cl::values(
        clEnumValN(MISchedPostRASched::TopDown, "topdown",
                   "Force top-down post reg-alloc list scheduling"),
        clEnumValN(MISchedPostRASched::BottomUp, "bottomup",
                   "Force bottom-up post reg-alloc list scheduling"),
        clEnumValN(MISchedPostRASched::Bidirectional, "bidirectional",
                   "Force bidirectional post reg-alloc list scheduling")));
cl::opt<bool>
DumpCriticalPathLength("misched-dcpl", cl::Hidden,
                       cl::desc("Print critical path length to stdout"));

cl::opt<bool> VerifyScheduling(
    "verify-misched", cl::Hidden,
    cl::desc("Verify machine instrs before and after machine scheduling"));

#ifndef NDEBUG
cl::opt<bool> ViewMISchedDAGs(
    "view-misched-dags", cl::Hidden,
    cl::desc("Pop up a window to show MISched dags after they are processed"));
cl::opt<bool> PrintDAGs("misched-print-dags", cl::Hidden,
                        cl::desc("Print schedule DAGs"));
cl::opt<bool> MISchedDumpReservedCycles(
    "misched-dump-reserved-cycles", cl::Hidden, cl::init(false),
    cl::desc("Dump resource usage at schedule boundary."));
cl::opt<bool> MischedDetailResourceBooking(
    "misched-detail-resource-booking", cl::Hidden, cl::init(false),
    cl::desc("Show details of invoking getNextResoufceCycle."));
#else
const bool ViewMISchedDAGs = …;
const bool PrintDAGs = …;
const bool MischedDetailResourceBooking = …;
#ifdef LLVM_ENABLE_DUMP
const bool MISchedDumpReservedCycles = false;
#endif // LLVM_ENABLE_DUMP
#endif // NDEBUG

} // end namespace llvm

#ifndef NDEBUG
/// In some situations a few uninteresting nodes depend on nearly all other
/// nodes in the graph, provide a cutoff to hide them.
static cl::opt<unsigned> ViewMISchedCutoff("view-misched-cutoff", cl::Hidden,
  cl::desc("Hide nodes with more predecessor/successor than cutoff"));

static cl::opt<unsigned> MISchedCutoff("misched-cutoff", cl::Hidden,
  cl::desc("Stop scheduling after N instructions"), cl::init(~0U));

static cl::opt<std::string> SchedOnlyFunc("misched-only-func", cl::Hidden,
  cl::desc("Only schedule this function"));
static cl::opt<unsigned> SchedOnlyBlock("misched-only-block", cl::Hidden,
                                        cl::desc("Only schedule this MBB#"));
#endif // NDEBUG

/// Avoid quadratic complexity in unusually large basic blocks by limiting the
/// size of the ready lists.
static cl::opt<unsigned> ReadyListLimit("misched-limit", cl::Hidden,
  cl::desc("Limit ready list to N instructions"), cl::init(256));

static cl::opt<bool> EnableRegPressure("misched-regpressure", cl::Hidden,
  cl::desc("Enable register pressure scheduling."), cl::init(true));

static cl::opt<bool> EnableCyclicPath("misched-cyclicpath", cl::Hidden,
  cl::desc("Enable cyclic critical path analysis."), cl::init(true));

static cl::opt<bool> EnableMemOpCluster("misched-cluster", cl::Hidden,
                                        cl::desc("Enable memop clustering."),
                                        cl::init(true));
static cl::opt<bool>
    ForceFastCluster("force-fast-cluster", cl::Hidden,
                     cl::desc("Switch to fast cluster algorithm with the lost "
                              "of some fusion opportunities"),
                     cl::init(false));
static cl::opt<unsigned>
    FastClusterThreshold("fast-cluster-threshold", cl::Hidden,
                         cl::desc("The threshold for fast cluster"),
                         cl::init(1000));

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
static cl::opt<bool> MISchedDumpScheduleTrace(
    "misched-dump-schedule-trace", cl::Hidden, cl::init(false),
    cl::desc("Dump resource usage at schedule boundary."));
static cl::opt<unsigned>
    HeaderColWidth("misched-dump-schedule-trace-col-header-width", cl::Hidden,
                   cl::desc("Set width of the columns with "
                            "the resources and schedule units"),
                   cl::init(19));
static cl::opt<unsigned>
    ColWidth("misched-dump-schedule-trace-col-width", cl::Hidden,
             cl::desc("Set width of the columns showing resource booking."),
             cl::init(5));
static cl::opt<bool> MISchedSortResourcesInTrace(
    "misched-sort-resources-in-trace", cl::Hidden, cl::init(true),
    cl::desc("Sort the resources printed in the dump trace"));
#endif

static cl::opt<unsigned>
    MIResourceCutOff("misched-resource-cutoff", cl::Hidden,
                     cl::desc("Number of intervals to track"), cl::init(10));

// DAG subtrees must have at least this many nodes.
static const unsigned MinSubtreeSize = …;

// Pin the vtables to this file.
void MachineSchedStrategy::anchor() { … }

void ScheduleDAGMutation::anchor() { … }

//===----------------------------------------------------------------------===//
// Machine Instruction Scheduling Pass and Registry
//===----------------------------------------------------------------------===//

MachineSchedContext::MachineSchedContext() { … }

MachineSchedContext::~MachineSchedContext() { … }

namespace {

/// Base class for a machine scheduler class that can run at any point.
class MachineSchedulerBase : public MachineSchedContext,
                             public MachineFunctionPass { … };

/// MachineScheduler runs after coalescing and before register allocation.
class MachineScheduler : public MachineSchedulerBase { … };

/// PostMachineScheduler runs after shortly before code emission.
class PostMachineScheduler : public MachineSchedulerBase { … };

} // end anonymous namespace

char MachineScheduler::ID = …;

char &llvm::MachineSchedulerID = …;

INITIALIZE_PASS_BEGIN(MachineScheduler, DEBUG_TYPE,
                      "Machine Instruction Scheduler", false, false)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(MachineLoopInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(SlotIndexesWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LiveIntervalsWrapperPass)
INITIALIZE_PASS_END(MachineScheduler, DEBUG_TYPE,
                    "Machine Instruction Scheduler", false, false)

MachineScheduler::MachineScheduler() : … { … }

void MachineScheduler::getAnalysisUsage(AnalysisUsage &AU) const { … }

char PostMachineScheduler::ID = …;

char &llvm::PostMachineSchedulerID = …;

INITIALIZE_PASS_BEGIN(PostMachineScheduler, "postmisched",
                      "PostRA Machine Instruction Scheduler", false, false)
INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(MachineLoopInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_END(PostMachineScheduler, "postmisched",
                    "PostRA Machine Instruction Scheduler", false, false)

PostMachineScheduler::PostMachineScheduler() : … { … }

void PostMachineScheduler::getAnalysisUsage(AnalysisUsage &AU) const { … }

MachinePassRegistry<MachineSchedRegistry::ScheduleDAGCtor>
    MachineSchedRegistry::Registry;

/// A dummy default scheduler factory indicates whether the scheduler
/// is overridden on the command line.
static ScheduleDAGInstrs *useDefaultMachineSched(MachineSchedContext *C) { … }

/// MachineSchedOpt allows command line selection of the scheduler.
static cl::opt<MachineSchedRegistry::ScheduleDAGCtor, false,
               RegisterPassParser<MachineSchedRegistry>>
MachineSchedOpt("misched",
                cl::init(&useDefaultMachineSched), cl::Hidden,
                cl::desc("Machine instruction scheduler to use"));

static MachineSchedRegistry
DefaultSchedRegistry("default", "Use the target's default scheduler choice.",
                     useDefaultMachineSched);

static cl::opt<bool> EnableMachineSched(
    "enable-misched",
    cl::desc("Enable the machine instruction scheduling pass."), cl::init(true),
    cl::Hidden);

static cl::opt<bool> EnablePostRAMachineSched(
    "enable-post-misched",
    cl::desc("Enable the post-ra machine instruction scheduling pass."),
    cl::init(true), cl::Hidden);

/// Decrement this iterator until reaching the top or a non-debug instr.
static MachineBasicBlock::const_iterator
priorNonDebug(MachineBasicBlock::const_iterator I,
              MachineBasicBlock::const_iterator Beg) { … }

/// Non-const version.
static MachineBasicBlock::iterator
priorNonDebug(MachineBasicBlock::iterator I,
              MachineBasicBlock::const_iterator Beg) { … }

/// If this iterator is a debug value, increment until reaching the End or a
/// non-debug instruction.
static MachineBasicBlock::const_iterator
nextIfDebug(MachineBasicBlock::const_iterator I,
            MachineBasicBlock::const_iterator End) { … }

/// Non-const version.
static MachineBasicBlock::iterator
nextIfDebug(MachineBasicBlock::iterator I,
            MachineBasicBlock::const_iterator End) { … }

/// Instantiate a ScheduleDAGInstrs that will be owned by the caller.
ScheduleDAGInstrs *MachineScheduler::createMachineScheduler() { … }

/// Instantiate a ScheduleDAGInstrs for PostRA scheduling that will be owned by
/// the caller. We don't have a command line option to override the postRA
/// scheduler. The Target must configure it.
ScheduleDAGInstrs *PostMachineScheduler::createPostMachineScheduler() { … }

/// Top-level MachineScheduler pass driver.
///
/// Visit blocks in function order. Divide each block into scheduling regions
/// and visit them bottom-up. Visiting regions bottom-up is not required, but is
/// consistent with the DAG builder, which traverses the interior of the
/// scheduling regions bottom-up.
///
/// This design avoids exposing scheduling boundaries to the DAG builder,
/// simplifying the DAG builder's support for "special" target instructions.
/// At the same time the design allows target schedulers to operate across
/// scheduling boundaries, for example to bundle the boundary instructions
/// without reordering them. This creates complexity, because the target
/// scheduler must update the RegionBegin and RegionEnd positions cached by
/// ScheduleDAGInstrs whenever adding or removing instructions. A much simpler
/// design would be to split blocks at scheduling boundaries, but LLVM has a
/// general bias against block splitting purely for implementation simplicity.
bool MachineScheduler::runOnMachineFunction(MachineFunction &mf) { … }

bool PostMachineScheduler::runOnMachineFunction(MachineFunction &mf) { … }

/// Return true of the given instruction should not be included in a scheduling
/// region.
///
/// MachineScheduler does not currently support scheduling across calls. To
/// handle calls, the DAG builder needs to be modified to create register
/// anti/output dependencies on the registers clobbered by the call's regmask
/// operand. In PreRA scheduling, the stack pointer adjustment already prevents
/// scheduling across calls. In PostRA scheduling, we need the isCall to enforce
/// the boundary, but there would be no benefit to postRA scheduling across
/// calls this late anyway.
static bool isSchedBoundary(MachineBasicBlock::iterator MI,
                            MachineBasicBlock *MBB,
                            MachineFunction *MF,
                            const TargetInstrInfo *TII) { … }

/// A region of an MBB for scheduling.
namespace {
struct SchedRegion { … };
} // end anonymous namespace

MBBRegionsVector;

static void
getSchedRegions(MachineBasicBlock *MBB,
                MBBRegionsVector &Regions,
                bool RegionsTopDown) { … }

/// Main driver for both MachineScheduler and PostMachineScheduler.
void MachineSchedulerBase::scheduleRegions(ScheduleDAGInstrs &Scheduler,
                                           bool FixKillFlags) { … }

void MachineSchedulerBase::print(raw_ostream &O, const Module* m) const { … }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
LLVM_DUMP_METHOD void ReadyQueue::dump() const {
  dbgs() << "Queue " << Name << ": ";
  for (const SUnit *SU : Queue)
    dbgs() << SU->NodeNum << " ";
  dbgs() << "\n";
}
#endif

//===----------------------------------------------------------------------===//
// ScheduleDAGMI - Basic machine instruction scheduling. This is
// independent of PreRA/PostRA scheduling and involves no extra book-keeping for
// virtual registers.
// ===----------------------------------------------------------------------===/

// Provide a vtable anchor.
ScheduleDAGMI::~ScheduleDAGMI() = default;

/// ReleaseSucc - Decrement the NumPredsLeft count of a successor. When
/// NumPredsLeft reaches zero, release the successor node.
///
/// FIXME: Adjust SuccSU height based on MinLatency.
void ScheduleDAGMI::releaseSucc(SUnit *SU, SDep *SuccEdge) { … }

/// releaseSuccessors - Call releaseSucc on each of SU's successors.
void ScheduleDAGMI::releaseSuccessors(SUnit *SU) { … }

/// ReleasePred - Decrement the NumSuccsLeft count of a predecessor. When
/// NumSuccsLeft reaches zero, release the predecessor node.
///
/// FIXME: Adjust PredSU height based on MinLatency.
void ScheduleDAGMI::releasePred(SUnit *SU, SDep *PredEdge) { … }

/// releasePredecessors - Call releasePred on each of SU's predecessors.
void ScheduleDAGMI::releasePredecessors(SUnit *SU) { … }

void ScheduleDAGMI::startBlock(MachineBasicBlock *bb) { … }

void ScheduleDAGMI::finishBlock() { … }

/// enterRegion - Called back from PostMachineScheduler::runOnMachineFunction
/// after crossing a scheduling boundary. [begin, end) includes all instructions
/// in the region, including the boundary itself and single-instruction regions
/// that don't get scheduled.
void ScheduleDAGMI::enterRegion(MachineBasicBlock *bb,
                                     MachineBasicBlock::iterator begin,
                                     MachineBasicBlock::iterator end,
                                     unsigned regioninstrs)
{ … }

/// This is normally called from the main scheduler loop but may also be invoked
/// by the scheduling strategy to perform additional code motion.
void ScheduleDAGMI::moveInstruction(
  MachineInstr *MI, MachineBasicBlock::iterator InsertPos) { … }

bool ScheduleDAGMI::checkSchedLimit() { … }

/// Per-region scheduling driver, called back from
/// PostMachineScheduler::runOnMachineFunction. This is a simplified driver
/// that does not consider liveness or register pressure. It is useful for
/// PostRA scheduling and potentially other custom schedulers.
void ScheduleDAGMI::schedule() { … }

/// Apply each ScheduleDAGMutation step in order.
void ScheduleDAGMI::postProcessDAG() { … }

void ScheduleDAGMI::
findRootsAndBiasEdges(SmallVectorImpl<SUnit*> &TopRoots,
                      SmallVectorImpl<SUnit*> &BotRoots) { … }

/// Identify DAG roots and setup scheduler queues.
void ScheduleDAGMI::initQueues(ArrayRef<SUnit*> TopRoots,
                               ArrayRef<SUnit*> BotRoots) { … }

/// Update scheduler queues after scheduling an instruction.
void ScheduleDAGMI::updateQueues(SUnit *SU, bool IsTopNode) { … }

/// Reinsert any remaining debug_values, just like the PostRA scheduler.
void ScheduleDAGMI::placeDebugValues() { … }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
static const char *scheduleTableLegend = "  i: issue\n  x: resource booked";

LLVM_DUMP_METHOD void ScheduleDAGMI::dumpScheduleTraceTopDown() const {
  // Bail off when there is no schedule model to query.
  if (!SchedModel.hasInstrSchedModel())
    return;

  //  Nothing to show if there is no or just one instruction.
  if (BB->size() < 2)
    return;

  dbgs() << " * Schedule table (TopDown):\n";
  dbgs() << scheduleTableLegend << "\n";
  const unsigned FirstCycle = getSUnit(&*(std::begin(*this)))->TopReadyCycle;
  unsigned LastCycle = getSUnit(&*(std::prev(std::end(*this))))->TopReadyCycle;
  for (MachineInstr &MI : *this) {
    SUnit *SU = getSUnit(&MI);
    if (!SU)
      continue;
    const MCSchedClassDesc *SC = getSchedClass(SU);
    for (TargetSchedModel::ProcResIter PI = SchedModel.getWriteProcResBegin(SC),
                                       PE = SchedModel.getWriteProcResEnd(SC);
         PI != PE; ++PI) {
      if (SU->TopReadyCycle + PI->ReleaseAtCycle - 1 > LastCycle)
        LastCycle = SU->TopReadyCycle + PI->ReleaseAtCycle - 1;
    }
  }
  // Print the header with the cycles
  dbgs() << llvm::left_justify("Cycle", HeaderColWidth);
  for (unsigned C = FirstCycle; C <= LastCycle; ++C)
    dbgs() << llvm::left_justify("| " + std::to_string(C), ColWidth);
  dbgs() << "|\n";

  for (MachineInstr &MI : *this) {
    SUnit *SU = getSUnit(&MI);
    if (!SU) {
      dbgs() << "Missing SUnit\n";
      continue;
    }
    std::string NodeName("SU(");
    NodeName += std::to_string(SU->NodeNum) + ")";
    dbgs() << llvm::left_justify(NodeName, HeaderColWidth);
    unsigned C = FirstCycle;
    for (; C <= LastCycle; ++C) {
      if (C == SU->TopReadyCycle)
        dbgs() << llvm::left_justify("| i", ColWidth);
      else
        dbgs() << llvm::left_justify("|", ColWidth);
    }
    dbgs() << "|\n";
    const MCSchedClassDesc *SC = getSchedClass(SU);

    SmallVector<MCWriteProcResEntry, 4> ResourcesIt(
        make_range(SchedModel.getWriteProcResBegin(SC),
                   SchedModel.getWriteProcResEnd(SC)));

    if (MISchedSortResourcesInTrace)
      llvm::stable_sort(ResourcesIt,
                        [](const MCWriteProcResEntry &LHS,
                           const MCWriteProcResEntry &RHS) -> bool {
                          return LHS.AcquireAtCycle < RHS.AcquireAtCycle ||
                                 (LHS.AcquireAtCycle == RHS.AcquireAtCycle &&
                                  LHS.ReleaseAtCycle < RHS.ReleaseAtCycle);
                        });
    for (const MCWriteProcResEntry &PI : ResourcesIt) {
      C = FirstCycle;
      const std::string ResName =
          SchedModel.getResourceName(PI.ProcResourceIdx);
      dbgs() << llvm::right_justify(ResName + " ", HeaderColWidth);
      for (; C < SU->TopReadyCycle + PI.AcquireAtCycle; ++C) {
        dbgs() << llvm::left_justify("|", ColWidth);
      }
      for (unsigned I = 0, E = PI.ReleaseAtCycle - PI.AcquireAtCycle; I != E;
           ++I, ++C)
        dbgs() << llvm::left_justify("| x", ColWidth);
      while (C++ <= LastCycle)
        dbgs() << llvm::left_justify("|", ColWidth);
      // Place end char
      dbgs() << "| \n";
    }
  }
}

LLVM_DUMP_METHOD void ScheduleDAGMI::dumpScheduleTraceBottomUp() const {
  // Bail off when there is no schedule model to query.
  if (!SchedModel.hasInstrSchedModel())
    return;

  //  Nothing to show if there is no or just one instruction.
  if (BB->size() < 2)
    return;

  dbgs() << " * Schedule table (BottomUp):\n";
  dbgs() << scheduleTableLegend << "\n";

  const int FirstCycle = getSUnit(&*(std::begin(*this)))->BotReadyCycle;
  int LastCycle = getSUnit(&*(std::prev(std::end(*this))))->BotReadyCycle;
  for (MachineInstr &MI : *this) {
    SUnit *SU = getSUnit(&MI);
    if (!SU)
      continue;
    const MCSchedClassDesc *SC = getSchedClass(SU);
    for (TargetSchedModel::ProcResIter PI = SchedModel.getWriteProcResBegin(SC),
                                       PE = SchedModel.getWriteProcResEnd(SC);
         PI != PE; ++PI) {
      if ((int)SU->BotReadyCycle - PI->ReleaseAtCycle + 1 < LastCycle)
        LastCycle = (int)SU->BotReadyCycle - PI->ReleaseAtCycle + 1;
    }
  }
  // Print the header with the cycles
  dbgs() << llvm::left_justify("Cycle", HeaderColWidth);
  for (int C = FirstCycle; C >= LastCycle; --C)
    dbgs() << llvm::left_justify("| " + std::to_string(C), ColWidth);
  dbgs() << "|\n";

  for (MachineInstr &MI : *this) {
    SUnit *SU = getSUnit(&MI);
    if (!SU) {
      dbgs() << "Missing SUnit\n";
      continue;
    }
    std::string NodeName("SU(");
    NodeName += std::to_string(SU->NodeNum) + ")";
    dbgs() << llvm::left_justify(NodeName, HeaderColWidth);
    int C = FirstCycle;
    for (; C >= LastCycle; --C) {
      if (C == (int)SU->BotReadyCycle)
        dbgs() << llvm::left_justify("| i", ColWidth);
      else
        dbgs() << llvm::left_justify("|", ColWidth);
    }
    dbgs() << "|\n";
    const MCSchedClassDesc *SC = getSchedClass(SU);
    SmallVector<MCWriteProcResEntry, 4> ResourcesIt(
        make_range(SchedModel.getWriteProcResBegin(SC),
                   SchedModel.getWriteProcResEnd(SC)));

    if (MISchedSortResourcesInTrace)
      llvm::stable_sort(ResourcesIt,
                        [](const MCWriteProcResEntry &LHS,
                           const MCWriteProcResEntry &RHS) -> bool {
                          return LHS.AcquireAtCycle < RHS.AcquireAtCycle ||
                                 (LHS.AcquireAtCycle == RHS.AcquireAtCycle &&
                                  LHS.ReleaseAtCycle < RHS.ReleaseAtCycle);
                        });
    for (const MCWriteProcResEntry &PI : ResourcesIt) {
      C = FirstCycle;
      const std::string ResName =
          SchedModel.getResourceName(PI.ProcResourceIdx);
      dbgs() << llvm::right_justify(ResName + " ", HeaderColWidth);
      for (; C > ((int)SU->BotReadyCycle - (int)PI.AcquireAtCycle); --C) {
        dbgs() << llvm::left_justify("|", ColWidth);
      }
      for (unsigned I = 0, E = PI.ReleaseAtCycle - PI.AcquireAtCycle; I != E;
           ++I, --C)
        dbgs() << llvm::left_justify("| x", ColWidth);
      while (C-- >= LastCycle)
        dbgs() << llvm::left_justify("|", ColWidth);
      // Place end char
      dbgs() << "| \n";
    }
  }
}
#endif

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
LLVM_DUMP_METHOD void ScheduleDAGMI::dumpSchedule() const {
  if (MISchedDumpScheduleTrace) {
    if (DumpDir == DumpDirection::TopDown)
      dumpScheduleTraceTopDown();
    else if (DumpDir == DumpDirection::BottomUp)
      dumpScheduleTraceBottomUp();
    else if (DumpDir == DumpDirection::Bidirectional) {
      dbgs() << "* Schedule table (Bidirectional): not implemented\n";
    } else {
      dbgs() << "* Schedule table: DumpDirection not set.\n";
    }
  }

  for (MachineInstr &MI : *this) {
    if (SUnit *SU = getSUnit(&MI))
      dumpNode(*SU);
    else
      dbgs() << "Missing SUnit\n";
  }
}
#endif

//===----------------------------------------------------------------------===//
// ScheduleDAGMILive - Base class for MachineInstr scheduling with LiveIntervals
// preservation.
//===----------------------------------------------------------------------===//

ScheduleDAGMILive::~ScheduleDAGMILive() { … }

void ScheduleDAGMILive::collectVRegUses(SUnit &SU) { … }

/// enterRegion - Called back from MachineScheduler::runOnMachineFunction after
/// crossing a scheduling boundary. [begin, end) includes all instructions in
/// the region, including the boundary itself and single-instruction regions
/// that don't get scheduled.
void ScheduleDAGMILive::enterRegion(MachineBasicBlock *bb,
                                MachineBasicBlock::iterator begin,
                                MachineBasicBlock::iterator end,
                                unsigned regioninstrs)
{ … }

// Setup the register pressure trackers for the top scheduled and bottom
// scheduled regions.
void ScheduleDAGMILive::initRegPressure() { … }

void ScheduleDAGMILive::
updateScheduledPressure(const SUnit *SU,
                        const std::vector<unsigned> &NewMaxPressure) { … }

/// Update the PressureDiff array for liveness after scheduling this
/// instruction.
void ScheduleDAGMILive::updatePressureDiffs(
    ArrayRef<RegisterMaskPair> LiveUses) { … }

void ScheduleDAGMILive::dump() const { … }

/// schedule - Called back from MachineScheduler::runOnMachineFunction
/// after setting up the current scheduling region. [RegionBegin, RegionEnd)
/// only includes instructions that have DAG nodes, not scheduling boundaries.
///
/// This is a skeletal driver, with all the functionality pushed into helpers,
/// so that it can be easily extended by experimental schedulers. Generally,
/// implementing MachineSchedStrategy should be sufficient to implement a new
/// scheduling algorithm. However, if a scheduler further subclasses
/// ScheduleDAGMILive then it will want to override this virtual method in order
/// to update any specialized state.
void ScheduleDAGMILive::schedule() { … }

/// Build the DAG and setup three register pressure trackers.
void ScheduleDAGMILive::buildDAGWithRegPressure() { … }

void ScheduleDAGMILive::computeDFSResult() { … }

/// Compute the max cyclic critical path through the DAG. The scheduling DAG
/// only provides the critical path for single block loops. To handle loops that
/// span blocks, we could use the vreg path latencies provided by
/// MachineTraceMetrics instead. However, MachineTraceMetrics is not currently
/// available for use in the scheduler.
///
/// The cyclic path estimation identifies a def-use pair that crosses the back
/// edge and considers the depth and height of the nodes. For example, consider
/// the following instruction sequence where each instruction has unit latency
/// and defines an eponymous virtual register:
///
/// a->b(a,c)->c(b)->d(c)->exit
///
/// The cyclic critical path is a two cycles: b->c->b
/// The acyclic critical path is four cycles: a->b->c->d->exit
/// LiveOutHeight = height(c) = len(c->d->exit) = 2
/// LiveOutDepth = depth(c) + 1 = len(a->b->c) + 1 = 3
/// LiveInHeight = height(b) + 1 = len(b->c->d->exit) + 1 = 4
/// LiveInDepth = depth(b) = len(a->b) = 1
///
/// LiveOutDepth - LiveInDepth = 3 - 1 = 2
/// LiveInHeight - LiveOutHeight = 4 - 2 = 2
/// CyclicCriticalPath = min(2, 2) = 2
///
/// This could be relevant to PostRA scheduling, but is currently implemented
/// assuming LiveIntervals.
unsigned ScheduleDAGMILive::computeCyclicCriticalPath() { … }

/// Release ExitSU predecessors and setup scheduler queues. Re-position
/// the Top RP tracker in case the region beginning has changed.
void ScheduleDAGMILive::initQueues(ArrayRef<SUnit*> TopRoots,
                                   ArrayRef<SUnit*> BotRoots) { … }

/// Move an instruction and update register pressure.
void ScheduleDAGMILive::scheduleMI(SUnit *SU, bool IsTopNode) { … }

//===----------------------------------------------------------------------===//
// BaseMemOpClusterMutation - DAG post-processing to cluster loads or stores.
//===----------------------------------------------------------------------===//

namespace {

/// Post-process the DAG to create cluster edges between neighboring
/// loads or between neighboring stores.
class BaseMemOpClusterMutation : public ScheduleDAGMutation { … };

class StoreClusterMutation : public BaseMemOpClusterMutation { … };

class LoadClusterMutation : public BaseMemOpClusterMutation { … };

} // end anonymous namespace

namespace llvm {

std::unique_ptr<ScheduleDAGMutation>
createLoadClusterDAGMutation(const TargetInstrInfo *TII,
                             const TargetRegisterInfo *TRI,
                             bool ReorderWhileClustering) { … }

std::unique_ptr<ScheduleDAGMutation>
createStoreClusterDAGMutation(const TargetInstrInfo *TII,
                              const TargetRegisterInfo *TRI,
                              bool ReorderWhileClustering) { … }

} // end namespace llvm

// Sorting all the loads/stores first, then for each load/store, checking the
// following load/store one by one, until reach the first non-dependent one and
// call target hook to see if they can cluster.
// If FastCluster is enabled, we assume that, all the loads/stores have been
// preprocessed and now, they didn't have dependencies on each other.
void BaseMemOpClusterMutation::clusterNeighboringMemOps(
    ArrayRef<MemOpInfo> MemOpRecords, bool FastCluster,
    ScheduleDAGInstrs *DAG) { … }

void BaseMemOpClusterMutation::collectMemOpRecords(
    std::vector<SUnit> &SUnits, SmallVectorImpl<MemOpInfo> &MemOpRecords) { … }

bool BaseMemOpClusterMutation::groupMemOps(
    ArrayRef<MemOpInfo> MemOps, ScheduleDAGInstrs *DAG,
    DenseMap<unsigned, SmallVector<MemOpInfo, 32>> &Groups) { … }

/// Callback from DAG postProcessing to create cluster edges for loads/stores.
void BaseMemOpClusterMutation::apply(ScheduleDAGInstrs *DAG) { … }

//===----------------------------------------------------------------------===//
// CopyConstrain - DAG post-processing to encourage copy elimination.
//===----------------------------------------------------------------------===//

namespace {

/// Post-process the DAG to create weak edges from all uses of a copy to
/// the one use that defines the copy's source vreg, most likely an induction
/// variable increment.
class CopyConstrain : public ScheduleDAGMutation { … };

} // end anonymous namespace

namespace llvm {

std::unique_ptr<ScheduleDAGMutation>
createCopyConstrainDAGMutation(const TargetInstrInfo *TII,
                               const TargetRegisterInfo *TRI) { … }

} // end namespace llvm

/// constrainLocalCopy handles two possibilities:
/// 1) Local src:
/// I0:     = dst
/// I1: src = ...
/// I2:     = dst
/// I3: dst = src (copy)
/// (create pred->succ edges I0->I1, I2->I1)
///
/// 2) Local copy:
/// I0: dst = src (copy)
/// I1:     = dst
/// I2: src = ...
/// I3:     = dst
/// (create pred->succ edges I1->I2, I3->I2)
///
/// Although the MachineScheduler is currently constrained to single blocks,
/// this algorithm should handle extended blocks. An EBB is a set of
/// contiguously numbered blocks such that the previous block in the EBB is
/// always the single predecessor.
void CopyConstrain::constrainLocalCopy(SUnit *CopySU, ScheduleDAGMILive *DAG) { … }

/// Callback from DAG postProcessing to create weak edges to encourage
/// copy elimination.
void CopyConstrain::apply(ScheduleDAGInstrs *DAGInstrs) { … }

//===----------------------------------------------------------------------===//
// MachineSchedStrategy helpers used by GenericScheduler, GenericPostScheduler
// and possibly other custom schedulers.
//===----------------------------------------------------------------------===//

static const unsigned InvalidCycle = …;

SchedBoundary::~SchedBoundary() { … }

/// Given a Count of resource usage and a Latency value, return true if a
/// SchedBoundary becomes resource limited.
/// If we are checking after scheduling a node, we should return true when
/// we just reach the resource limit.
static bool checkResourceLimit(unsigned LFactor, unsigned Count,
                               unsigned Latency, bool AfterSchedNode) { … }

void SchedBoundary::reset() { … }

void SchedRemainder::
init(ScheduleDAGMI *DAG, const TargetSchedModel *SchedModel) { … }

void SchedBoundary::
init(ScheduleDAGMI *dag, const TargetSchedModel *smodel, SchedRemainder *rem) { … }

/// Compute the stall cycles based on this SUnit's ready time. Heuristics treat
/// these "soft stalls" differently than the hard stall cycles based on CPU
/// resources and computed by checkHazard(). A fully in-order model
/// (MicroOpBufferSize==0) will not make use of this since instructions are not
/// available for scheduling until they are ready. However, a weaker in-order
/// model may use this for heuristics. For example, if a processor has in-order
/// behavior when reading certain resources, this may come into play.
unsigned SchedBoundary::getLatencyStallCycles(SUnit *SU) { … }

/// Compute the next cycle at which the given processor resource unit
/// can be scheduled.
unsigned SchedBoundary::getNextResourceCycleByInstance(unsigned InstanceIdx,
                                                       unsigned ReleaseAtCycle,
                                                       unsigned AcquireAtCycle) { … }

/// Compute the next cycle at which the given processor resource can be
/// scheduled.  Returns the next cycle and the index of the processor resource
/// instance in the reserved cycles vector.
std::pair<unsigned, unsigned>
SchedBoundary::getNextResourceCycle(const MCSchedClassDesc *SC, unsigned PIdx,
                                    unsigned ReleaseAtCycle,
                                    unsigned AcquireAtCycle) { … }

/// Does this SU have a hazard within the current instruction group.
///
/// The scheduler supports two modes of hazard recognition. The first is the
/// ScheduleHazardRecognizer API. It is a fully general hazard recognizer that
/// supports highly complicated in-order reservation tables
/// (ScoreboardHazardRecognizer) and arbitrary target-specific logic.
///
/// The second is a streamlined mechanism that checks for hazards based on
/// simple counters that the scheduler itself maintains. It explicitly checks
/// for instruction dispatch limitations, including the number of micro-ops that
/// can dispatch per cycle.
///
/// TODO: Also check whether the SU must start a new group.
bool SchedBoundary::checkHazard(SUnit *SU) { … }

// Find the unscheduled node in ReadySUs with the highest latency.
unsigned SchedBoundary::
findMaxLatency(ArrayRef<SUnit*> ReadySUs) { … }

// Count resources in this zone and the remaining unscheduled
// instruction. Return the max count, scaled. Set OtherCritIdx to the critical
// resource index, or zero if the zone is issue limited.
unsigned SchedBoundary::
getOtherResourceCount(unsigned &OtherCritIdx) { … }

void SchedBoundary::releaseNode(SUnit *SU, unsigned ReadyCycle, bool InPQueue,
                                unsigned Idx) { … }

/// Move the boundary of scheduled code by one cycle.
void SchedBoundary::bumpCycle(unsigned NextCycle) { … }

void SchedBoundary::incExecutedResources(unsigned PIdx, unsigned Count) { … }

/// Add the given processor resource to this scheduled zone.
///
/// \param ReleaseAtCycle indicates the number of consecutive (non-pipelined)
/// cycles during which this resource is released.
///
/// \param AcquireAtCycle indicates the number of consecutive (non-pipelined)
/// cycles at which the resource is aquired after issue (assuming no stalls).
///
/// \return the next cycle at which the instruction may execute without
/// oversubscribing resources.
unsigned SchedBoundary::countResource(const MCSchedClassDesc *SC, unsigned PIdx,
                                      unsigned ReleaseAtCycle,
                                      unsigned NextCycle,
                                      unsigned AcquireAtCycle) { … }

/// Move the boundary of scheduled code by one SUnit.
void SchedBoundary::bumpNode(SUnit *SU) { … }

/// Release pending ready nodes in to the available queue. This makes them
/// visible to heuristics.
void SchedBoundary::releasePending() { … }

/// Remove SU from the ready set for this boundary.
void SchedBoundary::removeReady(SUnit *SU) { … }

/// If this queue only has one ready candidate, return it. As a side effect,
/// defer any nodes that now hit a hazard, and advance the cycle until at least
/// one node is ready. If multiple instructions are ready, return NULL.
SUnit *SchedBoundary::pickOnlyChoice() { … }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)

/// Dump the content of the \ref ReservedCycles vector for the
/// resources that are used in the basic block.
///
LLVM_DUMP_METHOD void SchedBoundary::dumpReservedCycles() const {
  if (!SchedModel->hasInstrSchedModel())
    return;

  unsigned ResourceCount = SchedModel->getNumProcResourceKinds();
  unsigned StartIdx = 0;

  for (unsigned ResIdx = 0; ResIdx < ResourceCount; ++ResIdx) {
    const unsigned NumUnits = SchedModel->getProcResource(ResIdx)->NumUnits;
    std::string ResName = SchedModel->getResourceName(ResIdx);
    for (unsigned UnitIdx = 0; UnitIdx < NumUnits; ++UnitIdx) {
      dbgs() << ResName << "(" << UnitIdx << ") = ";
      if (SchedModel && SchedModel->enableIntervals()) {
        if (ReservedResourceSegments.count(StartIdx + UnitIdx))
          dbgs() << ReservedResourceSegments.at(StartIdx + UnitIdx);
        else
          dbgs() << "{ }\n";
      } else
        dbgs() << ReservedCycles[StartIdx + UnitIdx] << "\n";
    }
    StartIdx += NumUnits;
  }
}

// This is useful information to dump after bumpNode.
// Note that the Queue contents are more useful before pickNodeFromQueue.
LLVM_DUMP_METHOD void SchedBoundary::dumpScheduledState() const {
  unsigned ResFactor;
  unsigned ResCount;
  if (ZoneCritResIdx) {
    ResFactor = SchedModel->getResourceFactor(ZoneCritResIdx);
    ResCount = getResourceCount(ZoneCritResIdx);
  } else {
    ResFactor = SchedModel->getMicroOpFactor();
    ResCount = RetiredMOps * ResFactor;
  }
  unsigned LFactor = SchedModel->getLatencyFactor();
  dbgs() << Available.getName() << " @" << CurrCycle << "c\n"
         << "  Retired: " << RetiredMOps;
  dbgs() << "\n  Executed: " << getExecutedCount() / LFactor << "c";
  dbgs() << "\n  Critical: " << ResCount / LFactor << "c, "
         << ResCount / ResFactor << " "
         << SchedModel->getResourceName(ZoneCritResIdx)
         << "\n  ExpectedLatency: " << ExpectedLatency << "c\n"
         << (IsResourceLimited ? "  - Resource" : "  - Latency")
         << " limited.\n";
  if (MISchedDumpReservedCycles)
    dumpReservedCycles();
}
#endif

//===----------------------------------------------------------------------===//
// GenericScheduler - Generic implementation of MachineSchedStrategy.
//===----------------------------------------------------------------------===//

void GenericSchedulerBase::SchedCandidate::
initResourceDelta(const ScheduleDAGMI *DAG,
                  const TargetSchedModel *SchedModel) { … }

/// Compute remaining latency. We need this both to determine whether the
/// overall schedule has become latency-limited and whether the instructions
/// outside this zone are resource or latency limited.
///
/// The "dependent" latency is updated incrementally during scheduling as the
/// max height/depth of scheduled nodes minus the cycles since it was
/// scheduled:
///   DLat = max (N.depth - (CurrCycle - N.ReadyCycle) for N in Zone
///
/// The "independent" latency is the max ready queue depth:
///   ILat = max N.depth for N in Available|Pending
///
/// RemainingLatency is the greater of independent and dependent latency.
///
/// These computations are expensive, especially in DAGs with many edges, so
/// only do them if necessary.
static unsigned computeRemLatency(SchedBoundary &CurrZone) { … }

/// Returns true if the current cycle plus remaning latency is greater than
/// the critical path in the scheduling region.
bool GenericSchedulerBase::shouldReduceLatency(const CandPolicy &Policy,
                                               SchedBoundary &CurrZone,
                                               bool ComputeRemLatency,
                                               unsigned &RemLatency) const { … }

/// Set the CandPolicy given a scheduling zone given the current resources and
/// latencies inside and outside the zone.
void GenericSchedulerBase::setPolicy(CandPolicy &Policy, bool IsPostRA,
                                     SchedBoundary &CurrZone,
                                     SchedBoundary *OtherZone) { … }

#ifndef NDEBUG
const char *GenericSchedulerBase::getReasonStr(
  GenericSchedulerBase::CandReason Reason) {
  switch (Reason) {
  case NoCand:         return "NOCAND    ";
  case Only1:          return "ONLY1     ";
  case PhysReg:        return "PHYS-REG  ";
  case RegExcess:      return "REG-EXCESS";
  case RegCritical:    return "REG-CRIT  ";
  case Stall:          return "STALL     ";
  case Cluster:        return "CLUSTER   ";
  case Weak:           return "WEAK      ";
  case RegMax:         return "REG-MAX   ";
  case ResourceReduce: return "RES-REDUCE";
  case ResourceDemand: return "RES-DEMAND";
  case TopDepthReduce: return "TOP-DEPTH ";
  case TopPathReduce:  return "TOP-PATH  ";
  case BotHeightReduce:return "BOT-HEIGHT";
  case BotPathReduce:  return "BOT-PATH  ";
  case NextDefUse:     return "DEF-USE   ";
  case NodeOrder:      return "ORDER     ";
  };
  llvm_unreachable("Unknown reason!");
}

void GenericSchedulerBase::traceCandidate(const SchedCandidate &Cand) {
  PressureChange P;
  unsigned ResIdx = 0;
  unsigned Latency = 0;
  switch (Cand.Reason) {
  default:
    break;
  case RegExcess:
    P = Cand.RPDelta.Excess;
    break;
  case RegCritical:
    P = Cand.RPDelta.CriticalMax;
    break;
  case RegMax:
    P = Cand.RPDelta.CurrentMax;
    break;
  case ResourceReduce:
    ResIdx = Cand.Policy.ReduceResIdx;
    break;
  case ResourceDemand:
    ResIdx = Cand.Policy.DemandResIdx;
    break;
  case TopDepthReduce:
    Latency = Cand.SU->getDepth();
    break;
  case TopPathReduce:
    Latency = Cand.SU->getHeight();
    break;
  case BotHeightReduce:
    Latency = Cand.SU->getHeight();
    break;
  case BotPathReduce:
    Latency = Cand.SU->getDepth();
    break;
  }
  dbgs() << "  Cand SU(" << Cand.SU->NodeNum << ") " << getReasonStr(Cand.Reason);
  if (P.isValid())
    dbgs() << " " << TRI->getRegPressureSetName(P.getPSet())
           << ":" << P.getUnitInc() << " ";
  else
    dbgs() << "      ";
  if (ResIdx)
    dbgs() << " " << SchedModel->getProcResource(ResIdx)->Name << " ";
  else
    dbgs() << "         ";
  if (Latency)
    dbgs() << " " << Latency << " cycles ";
  else
    dbgs() << "          ";
  dbgs() << '\n';
}
#endif

namespace llvm {
/// Return true if this heuristic determines order.
/// TODO: Consider refactor return type of these functions as integer or enum,
/// as we may need to differentiate whether TryCand is better than Cand.
bool tryLess(int TryVal, int CandVal,
             GenericSchedulerBase::SchedCandidate &TryCand,
             GenericSchedulerBase::SchedCandidate &Cand,
             GenericSchedulerBase::CandReason Reason) { … }

bool tryGreater(int TryVal, int CandVal,
                GenericSchedulerBase::SchedCandidate &TryCand,
                GenericSchedulerBase::SchedCandidate &Cand,
                GenericSchedulerBase::CandReason Reason) { … }

bool tryLatency(GenericSchedulerBase::SchedCandidate &TryCand,
                GenericSchedulerBase::SchedCandidate &Cand,
                SchedBoundary &Zone) { … }
} // end namespace llvm

static void tracePick(GenericSchedulerBase::CandReason Reason, bool IsTop) { … }

static void tracePick(const GenericSchedulerBase::SchedCandidate &Cand) { … }

void GenericScheduler::initialize(ScheduleDAGMI *dag) { … }

/// Initialize the per-region scheduling policy.
void GenericScheduler::initPolicy(MachineBasicBlock::iterator Begin,
                                  MachineBasicBlock::iterator End,
                                  unsigned NumRegionInstrs) { … }

void GenericScheduler::dumpPolicy() const { … }

/// Set IsAcyclicLatencyLimited if the acyclic path is longer than the cyclic
/// critical path by more cycles than it takes to drain the instruction buffer.
/// We estimate an upper bounds on in-flight instructions as:
///
/// CyclesPerIteration = max( CyclicPath, Loop-Resource-Height )
/// InFlightIterations = AcyclicPath / CyclesPerIteration
/// InFlightResources = InFlightIterations * LoopResources
///
/// TODO: Check execution resources in addition to IssueCount.
void GenericScheduler::checkAcyclicLatency() { … }

void GenericScheduler::registerRoots() { … }

namespace llvm {
bool tryPressure(const PressureChange &TryP,
                 const PressureChange &CandP,
                 GenericSchedulerBase::SchedCandidate &TryCand,
                 GenericSchedulerBase::SchedCandidate &Cand,
                 GenericSchedulerBase::CandReason Reason,
                 const TargetRegisterInfo *TRI,
                 const MachineFunction &MF) { … }

unsigned getWeakLeft(const SUnit *SU, bool isTop) { … }

/// Minimize physical register live ranges. Regalloc wants them adjacent to
/// their physreg def/use.
///
/// FIXME: This is an unnecessary check on the critical path. Most are root/leaf
/// copies which can be prescheduled. The rest (e.g. x86 MUL) could be bundled
/// with the operation that produces or consumes the physreg. We'll do this when
/// regalloc has support for parallel copies.
int biasPhysReg(const SUnit *SU, bool isTop) { … }
} // end namespace llvm

void GenericScheduler::initCandidate(SchedCandidate &Cand, SUnit *SU,
                                     bool AtTop,
                                     const RegPressureTracker &RPTracker,
                                     RegPressureTracker &TempTracker) { … }

/// Apply a set of heuristics to a new candidate. Heuristics are currently
/// hierarchical. This may be more efficient than a graduated cost model because
/// we don't need to evaluate all aspects of the model for each node in the
/// queue. But it's really done to make the heuristics easier to debug and
/// statistically analyze.
///
/// \param Cand provides the policy and current best candidate.
/// \param TryCand refers to the next SUnit candidate, otherwise uninitialized.
/// \param Zone describes the scheduled zone that we are extending, or nullptr
///             if Cand is from a different zone than TryCand.
/// \return \c true if TryCand is better than Cand (Reason is NOT NoCand)
bool GenericScheduler::tryCandidate(SchedCandidate &Cand,
                                    SchedCandidate &TryCand,
                                    SchedBoundary *Zone) const { … }

/// Pick the best candidate from the queue.
///
/// TODO: getMaxPressureDelta results can be mostly cached for each SUnit during
/// DAG building. To adjust for the current scheduling location we need to
/// maintain the number of vreg uses remaining to be top-scheduled.
void GenericScheduler::pickNodeFromQueue(SchedBoundary &Zone,
                                         const CandPolicy &ZonePolicy,
                                         const RegPressureTracker &RPTracker,
                                         SchedCandidate &Cand) { … }

/// Pick the best candidate node from either the top or bottom queue.
SUnit *GenericScheduler::pickNodeBidirectional(bool &IsTopNode) { … }

/// Pick the best node to balance the schedule. Implements MachineSchedStrategy.
SUnit *GenericScheduler::pickNode(bool &IsTopNode) { … }

void GenericScheduler::reschedulePhysReg(SUnit *SU, bool isTop) { … }

/// Update the scheduler's state after scheduling a node. This is the same node
/// that was just returned by pickNode(). However, ScheduleDAGMILive needs to
/// update it's state based on the current cycle before MachineSchedStrategy
/// does.
///
/// FIXME: Eventually, we may bundle physreg copies rather than rescheduling
/// them here. See comments in biasPhysReg.
void GenericScheduler::schedNode(SUnit *SU, bool IsTopNode) { … }

/// Create the standard converging machine scheduler. This will be used as the
/// default scheduler if the target does not set a default.
ScheduleDAGMILive *llvm::createGenericSchedLive(MachineSchedContext *C) { … }

static ScheduleDAGInstrs *createConvergingSched(MachineSchedContext *C) { … }

static MachineSchedRegistry
GenericSchedRegistry("converge", "Standard converging scheduler.",
                     createConvergingSched);

//===----------------------------------------------------------------------===//
// PostGenericScheduler - Generic PostRA implementation of MachineSchedStrategy.
//===----------------------------------------------------------------------===//

void PostGenericScheduler::initialize(ScheduleDAGMI *Dag) { … }

void PostGenericScheduler::initPolicy(MachineBasicBlock::iterator Begin,
                                      MachineBasicBlock::iterator End,
                                      unsigned NumRegionInstrs) { … }

void PostGenericScheduler::registerRoots() { … }

/// Apply a set of heuristics to a new candidate for PostRA scheduling.
///
/// \param Cand provides the policy and current best candidate.
/// \param TryCand refers to the next SUnit candidate, otherwise uninitialized.
/// \return \c true if TryCand is better than Cand (Reason is NOT NoCand)
bool PostGenericScheduler::tryCandidate(SchedCandidate &Cand,
                                        SchedCandidate &TryCand) { … }

void PostGenericScheduler::pickNodeFromQueue(SchedBoundary &Zone,
                                             SchedCandidate &Cand) { … }

/// Pick the best candidate node from either the top or bottom queue.
SUnit *PostGenericScheduler::pickNodeBidirectional(bool &IsTopNode) { … }

/// Pick the next node to schedule.
SUnit *PostGenericScheduler::pickNode(bool &IsTopNode) { … }

/// Called after ScheduleDAGMI has scheduled an instruction and updated
/// scheduled/remaining flags in the DAG nodes.
void PostGenericScheduler::schedNode(SUnit *SU, bool IsTopNode) { … }

ScheduleDAGMI *llvm::createGenericSchedPostRA(MachineSchedContext *C) { … }

//===----------------------------------------------------------------------===//
// ILP Scheduler. Currently for experimental analysis of heuristics.
//===----------------------------------------------------------------------===//

namespace {

/// Order nodes by the ILP metric.
struct ILPOrder { … };

/// Schedule based on the ILP metric.
class ILPScheduler : public MachineSchedStrategy { … };

} // end anonymous namespace

static ScheduleDAGInstrs *createILPMaxScheduler(MachineSchedContext *C) { … }
static ScheduleDAGInstrs *createILPMinScheduler(MachineSchedContext *C) { … }

static MachineSchedRegistry ILPMaxRegistry(
  "ilpmax", "Schedule bottom-up for max ILP", createILPMaxScheduler);
static MachineSchedRegistry ILPMinRegistry(
  "ilpmin", "Schedule bottom-up for min ILP", createILPMinScheduler);

//===----------------------------------------------------------------------===//
// Machine Instruction Shuffler for Correctness Testing
//===----------------------------------------------------------------------===//

#ifndef NDEBUG
namespace {

/// Apply a less-than relation on the node order, which corresponds to the
/// instruction order prior to scheduling. IsReverse implements greater-than.
template<bool IsReverse>
struct SUnitOrder {
  bool operator()(SUnit *A, SUnit *B) const {
    if (IsReverse)
      return A->NodeNum > B->NodeNum;
    else
      return A->NodeNum < B->NodeNum;
  }
};

/// Reorder instructions as much as possible.
class InstructionShuffler : public MachineSchedStrategy {
  bool IsAlternating;
  bool IsTopDown;

  // Using a less-than relation (SUnitOrder<false>) for the TopQ priority
  // gives nodes with a higher number higher priority causing the latest
  // instructions to be scheduled first.
  PriorityQueue<SUnit*, std::vector<SUnit*>, SUnitOrder<false>>
    TopQ;

  // When scheduling bottom-up, use greater-than as the queue priority.
  PriorityQueue<SUnit*, std::vector<SUnit*>, SUnitOrder<true>>
    BottomQ;

public:
  InstructionShuffler(bool alternate, bool topdown)
    : IsAlternating(alternate), IsTopDown(topdown) {}

  void initialize(ScheduleDAGMI*) override {
    TopQ.clear();
    BottomQ.clear();
  }

  /// Implement MachineSchedStrategy interface.
  /// -----------------------------------------

  SUnit *pickNode(bool &IsTopNode) override {
    SUnit *SU;
    if (IsTopDown) {
      do {
        if (TopQ.empty()) return nullptr;
        SU = TopQ.top();
        TopQ.pop();
      } while (SU->isScheduled);
      IsTopNode = true;
    } else {
      do {
        if (BottomQ.empty()) return nullptr;
        SU = BottomQ.top();
        BottomQ.pop();
      } while (SU->isScheduled);
      IsTopNode = false;
    }
    if (IsAlternating)
      IsTopDown = !IsTopDown;
    return SU;
  }

  void schedNode(SUnit *SU, bool IsTopNode) override {}

  void releaseTopNode(SUnit *SU) override {
    TopQ.push(SU);
  }
  void releaseBottomNode(SUnit *SU) override {
    BottomQ.push(SU);
  }
};

} // end anonymous namespace

static ScheduleDAGInstrs *createInstructionShuffler(MachineSchedContext *C) {
  bool Alternate = !ForceTopDown && !ForceBottomUp;
  bool TopDown = !ForceBottomUp;
  assert((TopDown || !ForceTopDown) &&
         "-misched-topdown incompatible with -misched-bottomup");
  return new ScheduleDAGMILive(
      C, std::make_unique<InstructionShuffler>(Alternate, TopDown));
}

static MachineSchedRegistry ShufflerRegistry(
  "shuffle", "Shuffle machine instructions alternating directions",
  createInstructionShuffler);
#endif // !NDEBUG

//===----------------------------------------------------------------------===//
// GraphWriter support for ScheduleDAGMILive.
//===----------------------------------------------------------------------===//

#ifndef NDEBUG
namespace llvm {

template<> struct GraphTraits<
  ScheduleDAGMI*> : public GraphTraits<ScheduleDAG*> {};

template<>
struct DOTGraphTraits<ScheduleDAGMI*> : public DefaultDOTGraphTraits {
  DOTGraphTraits(bool isSimple = false) : DefaultDOTGraphTraits(isSimple) {}

  static std::string getGraphName(const ScheduleDAG *G) {
    return std::string(G->MF.getName());
  }

  static bool renderGraphFromBottomUp() {
    return true;
  }

  static bool isNodeHidden(const SUnit *Node, const ScheduleDAG *G) {
    if (ViewMISchedCutoff == 0)
      return false;
    return (Node->Preds.size() > ViewMISchedCutoff
         || Node->Succs.size() > ViewMISchedCutoff);
  }

  /// If you want to override the dot attributes printed for a particular
  /// edge, override this method.
  static std::string getEdgeAttributes(const SUnit *Node,
                                       SUnitIterator EI,
                                       const ScheduleDAG *Graph) {
    if (EI.isArtificialDep())
      return "color=cyan,style=dashed";
    if (EI.isCtrlDep())
      return "color=blue,style=dashed";
    return "";
  }

  static std::string getNodeLabel(const SUnit *SU, const ScheduleDAG *G) {
    std::string Str;
    raw_string_ostream SS(Str);
    const ScheduleDAGMI *DAG = static_cast<const ScheduleDAGMI*>(G);
    const SchedDFSResult *DFS = DAG->hasVRegLiveness() ?
      static_cast<const ScheduleDAGMILive*>(G)->getDFSResult() : nullptr;
    SS << "SU:" << SU->NodeNum;
    if (DFS)
      SS << " I:" << DFS->getNumInstrs(SU);
    return Str;
  }

  static std::string getNodeDescription(const SUnit *SU, const ScheduleDAG *G) {
    return G->getGraphNodeLabel(SU);
  }

  static std::string getNodeAttributes(const SUnit *N, const ScheduleDAG *G) {
    std::string Str("shape=Mrecord");
    const ScheduleDAGMI *DAG = static_cast<const ScheduleDAGMI*>(G);
    const SchedDFSResult *DFS = DAG->hasVRegLiveness() ?
      static_cast<const ScheduleDAGMILive*>(G)->getDFSResult() : nullptr;
    if (DFS) {
      Str += ",style=filled,fillcolor=\"#";
      Str += DOT::getColorString(DFS->getSubtreeID(N));
      Str += '"';
    }
    return Str;
  }
};

} // end namespace llvm
#endif // NDEBUG

/// viewGraph - Pop up a ghostview window with the reachable parts of the DAG
/// rendered using 'dot'.
void ScheduleDAGMI::viewGraph(const Twine &Name, const Twine &Title) { … }

/// Out-of-line implementation with no arguments is handy for gdb.
void ScheduleDAGMI::viewGraph() { … }

/// Sort predicate for the intervals stored in an instance of
/// ResourceSegments. Intervals are always disjoint (no intersection
/// for any pairs of intervals), therefore we can sort the totality of
/// the intervals by looking only at the left boundary.
static bool sortIntervals(const ResourceSegments::IntervalTy &A,
                          const ResourceSegments::IntervalTy &B) { … }

unsigned ResourceSegments::getFirstAvailableAt(
    unsigned CurrCycle, unsigned AcquireAtCycle, unsigned ReleaseAtCycle,
    std::function<ResourceSegments::IntervalTy(unsigned, unsigned, unsigned)>
        IntervalBuilder) const { … }

void ResourceSegments::add(ResourceSegments::IntervalTy A,
                           const unsigned CutOff) { … }

bool ResourceSegments::intersects(ResourceSegments::IntervalTy A,
                                  ResourceSegments::IntervalTy B) { … }

void ResourceSegments::sortAndMerge() { … }
llvm/llvm/lib/CodeGen/MachineScheduler.cpp