AMDGPUIGroupLP.cpp | Explore in Territory

//===--- AMDGPUIGroupLP.cpp - AMDGPU IGroupLP  ------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// \file This file defines a set of schedule DAG mutations that can be used to
// override default scheduler behavior to enforce specific scheduling patterns.
// They should be used in cases where runtime performance considerations such as
// inter-wavefront interactions mean that compile-time heuristics cannot
// predict the optimal instruction ordering, or in kernels where optimum
// instruction scheduling is important enough to warrant manual intervention.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUIGroupLP.h"
#include "AMDGPUTargetMachine.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/BitmaskEnum.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/TargetOpcodes.h"

usingnamespacellvm;

#define DEBUG_TYPE …

namespace {

// Hidden boolean option "amdgpu-igrouplp-exact-solver". When set, requests the
// exponential time solver for fitting instructions to the pipeline; it is off
// by default (cl::init(false)).
static cl::opt<bool> EnableExactSolver(
    "amdgpu-igrouplp-exact-solver", cl::Hidden,
    cl::desc("Whether to use the exponential time solver to fit "
             "the instructions to the pipeline as closely as "
             "possible."),
    cl::init(false));

// Hidden unsigned option "amdgpu-igrouplp-exact-solver-cutoff" (default 0).
// Gives the largest number of scheduling group conflicts for which the
// exponential time exact solver is attempted; larger problems fall back to the
// greedy algorithm. Per the description, explicitly selecting a solver takes
// precedence over this size-based selection.
//
// Note: the pieces of the description are adjacent string literals that the
// compiler concatenates verbatim, so every piece that ends mid-sentence must
// carry its own trailing space.
static cl::opt<unsigned> CutoffForExact(
    "amdgpu-igrouplp-exact-solver-cutoff", cl::init(0), cl::Hidden,
    cl::desc("The maximum number of scheduling group conflicts "
             "which we attempt to solve with the exponential time "
             "exact solver. Problem sizes greater than this will "
             "be solved by the less accurate greedy algorithm. Selecting "
             "solver by size is superseded by manually selecting "
             "the solver (e.g. by amdgpu-igrouplp-exact-solver)"));

// Hidden uint64_t option "amdgpu-igrouplp-exact-solver-max-branches"
// (default 0). Bounds how many branches the exact algorithm may explore before
// giving up.
// NOTE(review): how the default value 0 is interpreted is decided by the
// solver code that reads this option, which is not visible here -- confirm.
//
// Note: the pieces of the description are adjacent string literals that the
// compiler concatenates verbatim, so the first piece needs a trailing space.
static cl::opt<uint64_t> MaxBranchesExplored(
    "amdgpu-igrouplp-exact-solver-max-branches", cl::init(0), cl::Hidden,
    cl::desc("The amount of branches that we are willing to explore with "
             "the exact algorithm before giving up."));

// Hidden boolean option "amdgpu-igrouplp-exact-solver-cost-heur", on by default
// (cl::init(true)). Selects how the exact solver orders its choices while
// traversing the search space: by the cost heuristic when on, or by node order
// (later nodes into later sched groups) when off.
static cl::opt<bool> UseCostHeur(
    "amdgpu-igrouplp-exact-solver-cost-heur", cl::init(true), cl::Hidden,
    cl::desc("Whether to use the cost heuristic to make choices as we "
             "traverse the search space using the exact solver. Defaulted "
             "to on, and if turned off, we will use the node order -- "
             "attempting to put the later nodes in the later sched groups. "
             "Experimentally, results are mixed, so this should be set on a "
             "case-by-case basis."));

// Components of the mask that determines which instruction types may be
// classified into a SchedGroup.
enum class SchedGroupMask { … };

class SchedGroup;

// InstructionRule class is used to enact a filter which determines whether or
// not an SU maps to a given SchedGroup. It contains complementary data
// structures (e.g Cache) to help those filters.
class InstructionRule { … };

SUnitsToCandidateSGsMap;

// Classify instructions into groups to enable fine tuned control over the
// scheduler. These groups may be more specific than current SchedModel
// instruction classes.
class SchedGroup { … };

// Remove all existing edges from a SCHED_BARRIER or SCHED_GROUP_BARRIER.
static void resetEdges(SUnit &SU, ScheduleDAGInstrs *DAG) { … }

SUToCandSGsPair;
SUsToCandSGsVec;

// The PipelineSolver is used to assign SUnits to SchedGroups in a pipeline
// in non-trivial cases. For example, if the requested pipeline is
// {VMEM_READ, VALU, MFMA, VMEM_READ} and we encounter a VMEM_READ instruction
// in the DAG, then we will have an instruction that can not be trivially
// assigned to a SchedGroup. The PipelineSolver class implements two algorithms
// to find a good solution to the pipeline -- a greedy algorithm and an exact
// algorithm. The exact algorithm has an exponential time complexity and should
// only be used for small sized problems or medium sized problems where an exact
// solution is highly desired.
class PipelineSolver { … };

void PipelineSolver::reset() { … }

void PipelineSolver::convertSyncMapsToArrays() { … }

template <typename T> void PipelineSolver::linkSchedGroups(T I, T E) { … }

void PipelineSolver::makePipeline() { … }

template <typename T>
int PipelineSolver::linkSUnit(
    SUnit *SU, int SGID, std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges,
    T I, T E) { … }

int PipelineSolver::addEdges(
    SmallVectorImpl<SchedGroup> &SyncPipeline, SUnit *SU, int SGID,
    std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges) { … }

void PipelineSolver::removeEdges(
    const std::vector<std::pair<SUnit *, SUnit *>> &EdgesToRemove) { … }

void PipelineSolver::advancePosition() { … }

void PipelineSolver::retreatPosition() { … }

bool PipelineSolver::checkOptimal() { … }

template <typename T>
void PipelineSolver::populateReadyList(
    SmallVectorImpl<std::pair<int, int>> &ReadyList, T I, T E) { … }

bool PipelineSolver::solveExact() { … }

template <typename T>
void PipelineSolver::greedyFind(
    std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges, T I, T E) { … }

bool PipelineSolver::solveGreedy() { … }

unsigned PipelineSolver::computeProblemSize() { … }

void PipelineSolver::solve() { … }

enum IGLPStrategyID : int { … };

// Implement a IGLP scheduling strategy.
class IGLPStrategy { … };

class MFMASmallGemmOpt final : public IGLPStrategy { … };

bool MFMASmallGemmOpt::applyIGLPStrategy(
    DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
    DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
    AMDGPU::SchedulingPhase Phase) { … }

class MFMAExpInterleaveOpt final : public IGLPStrategy { … };

unsigned MFMAExpInterleaveOpt::TransPipeCount = …;
unsigned MFMAExpInterleaveOpt::MFMAPipeCount = …;
unsigned MFMAExpInterleaveOpt::AddPipeCount = …;
unsigned MFMAExpInterleaveOpt::MFMAEnablement = …;
unsigned MFMAExpInterleaveOpt::ExpRequirement = …;
unsigned MFMAExpInterleaveOpt::MFMAChains = …;
unsigned MFMAExpInterleaveOpt::MFMAChainLength = …;
bool MFMAExpInterleaveOpt::HasCvt = …;
bool MFMAExpInterleaveOpt::HasChainBetweenCvt = …;
std::optional<unsigned> MFMAExpInterleaveOpt::FirstPipeDSR = …;

bool MFMAExpInterleaveOpt::analyzeDAG(const SIInstrInfo *TII) { … }

bool MFMAExpInterleaveOpt::shouldApplyStrategy(ScheduleDAGInstrs *DAG,
                                               AMDGPU::SchedulingPhase Phase) { … }

bool MFMAExpInterleaveOpt::applyIGLPStrategy(
    DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
    DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
    AMDGPU::SchedulingPhase Phase) { … }

class MFMASmallGemmSingleWaveOpt final : public IGLPStrategy { … };

static unsigned DSWCount = …;
static unsigned DSWWithPermCount = …;
static unsigned DSWWithSharedVMEMCount = …;

bool MFMASmallGemmSingleWaveOpt::applyIGLPStrategy(
    DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
    DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
    AMDGPU::SchedulingPhase Phase) { … }

static std::unique_ptr<IGLPStrategy>
createIGLPStrategy(IGLPStrategyID ID, ScheduleDAGInstrs *DAG,
                   const SIInstrInfo *TII) { … }

class IGroupLPDAGMutation : public ScheduleDAGMutation { … };

unsigned SchedGroup::NumSchedGroups = …;

bool SchedGroup::tryAddEdge(SUnit *A, SUnit *B) { … }

bool SchedGroup::canAddMI(const MachineInstr &MI) const { … }

int SchedGroup::link(SUnit &SU, bool MakePred,
                     std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges) { … }

void SchedGroup::link(SUnit &SU, bool MakePred) { … }

void SchedGroup::link(SUnit &SU,
                      function_ref<bool(const SUnit *A, const SUnit *B)> P) { … }

void SchedGroup::link(SchedGroup &OtherGroup) { … }

bool SchedGroup::canAddSU(SUnit &SU) const { … }

void SchedGroup::initSchedGroup() { … }

void SchedGroup::initSchedGroup(std::vector<SUnit>::reverse_iterator RIter,
                                SUnitsToCandidateSGsMap &SyncedInstrs) { … }

void SchedGroup::initSchedGroup(SUnitsToCandidateSGsMap &SyncedInstrs) { … }

void IGroupLPDAGMutation::apply(ScheduleDAGInstrs *DAGInstrs) { … }

void IGroupLPDAGMutation::addSchedBarrierEdges(SUnit &SchedBarrier) { … }

SchedGroupMask
IGroupLPDAGMutation::invertSchedBarrierMask(SchedGroupMask Mask) const { … }

void IGroupLPDAGMutation::initSchedGroupBarrierPipelineStage(
    std::vector<SUnit>::reverse_iterator RIter) { … }

bool IGroupLPDAGMutation::initIGLPOpt(SUnit &SU) { … }

} // namespace

namespace llvm {

/// \p Phase specifies whether or not this is a reentry into the
/// IGroupLPDAGMutation. Since there may be multiple scheduling passes on the
/// same scheduling region (e.g. pre and post-RA scheduling / multiple
/// scheduling "phases"), we can reenter this mutation framework more than once
/// for a given region.
std::unique_ptr<ScheduleDAGMutation>
createIGroupLPDAGMutation(AMDGPU::SchedulingPhase Phase) { … }

} // end namespace llvm
llvm/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp