AMDGPUSplitModule.cpp | Explore in Territory

//===- AMDGPUSplitModule.cpp ----------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file Implements a module splitting algorithm designed to support the
/// FullLTO --lto-partitions option for parallel codegen.
///
/// The role of this module splitting pass is the same as
/// lib/Transforms/Utils/SplitModule.cpp: load-balance the module's functions
/// across a set of N partitions to allow for parallel codegen.
///
/// The similarities mostly end here, as this pass achieves load-balancing in a
/// more elaborate fashion which is targeted towards AMDGPU modules. It can take
/// advantage of the structure of AMDGPU modules (which are mostly
/// self-contained) to allow for more efficient splitting without affecting
/// codegen negatively, or causing inaccurate resource usage analysis.
///
/// High-level pass overview:
///   - SplitGraph & associated classes
///      - Graph representation of the module and of the dependencies that
///      matter for splitting.
///   - RecursiveSearchSplitting
///     - Core splitting algorithm.
///   - SplitProposal
///     - Represents a suggested solution for splitting the input module. These
///     solutions can be scored to determine the best one when multiple
///     solutions are available.
///   - Driver/pass "run" function glues everything together.

#include "AMDGPUSplitModule.h"
#include "AMDGPUTargetMachine.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/EquivalenceClasses.h"
#include "llvm/ADT/GraphTraits.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/CallGraph.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/Allocator.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/DOTGraphTraits.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/GraphWriter.h"
#include "llvm/Support/Path.h"
#include "llvm/Support/Timer.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include <algorithm>
#include <cassert>
#include <cmath>
#include <iterator>
#include <memory>
#include <utility>
#include <vector>

#ifndef NDEBUG
#include "llvm/Support/LockFileManager.h"
#endif

#define DEBUG_TYPE …

namespace llvm {
namespace {

/// Maximum depth of the partition search (initial value 8). Per its
/// description, 0 forces a greedy approach and the algorithm is up to O(2^N)
/// in this depth.
static cl::opt<unsigned> MaxDepth(
    "amdgpu-module-splitting-max-depth",
    cl::desc(
        "maximum search depth. 0 forces a greedy approach. "
        "warning: the algorithm is up to O(2^N), where N is the max depth."),
    cl::init(8));

/// Factor of the ideal partition size (initial value 2.0) used, once max depth
/// is reached, to decide whether a function is worth merging into an already
/// existing partition.
static cl::opt<float> LargeFnFactor(
    "amdgpu-module-splitting-large-threshold", cl::init(2.0f), cl::Hidden,
    cl::desc(
        "when max depth is reached and we can no longer branch out, this "
        "value determines if a function is worth merging into an already "
        "existing partition to reduce code duplication. This is a factor "
        "of the ideal partition size, e.g. 2.0 means we consider the "
        "function for merging if its cost (including its callees) is 2x the "
        "size of an ideal partition."));

/// Share (initial value 0.7) of the code reachable from a merge candidate that
/// must already be present in a partition for the merge to be done.
static cl::opt<float> LargeFnOverlapForMerge(
    "amdgpu-module-splitting-merge-threshold", cl::init(0.7f), cl::Hidden,
    cl::desc("when a function is considered for merging into a partition that "
             "already contains some of its callees, do the merge if at least "
             "n% of the code it can reach is already present inside the "
             "partition; e.g. 0.7 means only merge >70%"));

/// When set, global variables with local linkage are not externalized; the
/// description warns this may duplicate globals and grow the binary.
static cl::opt<bool> NoExternalizeGlobals(
    "amdgpu-module-splitting-no-externalize-globals", cl::Hidden,
    cl::desc("disables externalization of global variable with local linkage; "
             "may cause globals to be duplicated which increases binary size"));

/// Output file for the dotgraph representation of the input module.
static cl::opt<std::string>
    ModuleDotCfgOutput("amdgpu-module-splitting-print-module-dotcfg",
                       cl::Hidden,
                       cl::desc("output file to write out the dotgraph "
                                "representation of the input module"));

/// Output file for a summary of the partitions created for each module.
static cl::opt<std::string> PartitionSummariesOutput(
    "amdgpu-module-splitting-print-partition-summaries", cl::Hidden,
    cl::desc("output file to write out a summary of "
             "the partitions created for each module"));

// The two options below are only declared when NDEBUG is not defined.
#ifndef NDEBUG
/// Requests the use of a lock file so that only one process at a time runs
/// this pass, which keeps debug output from being interleaved.
static cl::opt<bool>
    UseLockFile("amdgpu-module-splitting-serial-execution", cl::Hidden,
                cl::desc("use a lock file so only one process in the system "
                         "can run this pass at once. useful to avoid mangled "
                         "debug output in multithreaded environments."));

/// Requests printing of every proposal received, along with whether it was
/// rejected or accepted.
static cl::opt<bool>
    DebugProposalSearch("amdgpu-module-splitting-debug-proposal-search",
                        cl::Hidden,
                        cl::desc("print all proposals received and whether "
                                 "they were rejected or accepted"));
#endif

struct SplitModuleTimer : NamedRegionTimer { … };

//===----------------------------------------------------------------------===//
// Utils
//===----------------------------------------------------------------------===//

CostType;
FunctionsCostMap;
GetTTIFn;
static constexpr unsigned InvalidPID = …;

/// \param Num numerator
/// \param Dem denominator
/// \returns a printable object to print (Num/Dem) using "%0.2f".
static auto formatRatioOf(CostType Num, CostType Dem) { … }

/// Checks whether a given function is non-copyable.
///
/// Non-copyable functions cannot be cloned into multiple partitions, and only
/// one copy of the function can be present across all partitions.
///
/// External functions fall into this category. If we were to clone them, we
/// would end up with multiple symbol definitions and a very unhappy linker.
static bool isNonCopyable(const Function &F) { … }

/// If \p GV has local linkage, make it external + hidden.
static void externalize(GlobalValue &GV) { … }

/// Cost analysis function. Calculates the cost of each function in \p M
///
/// \param GetTTI Abstract getter for TargetTransformInfo.
/// \param M Module to analyze.
/// \param[out] CostMap Resulting Function -> Cost map.
/// \return The module's total cost.
static CostType calculateFunctionCosts(GetTTIFn GetTTI, Module &M,
                                       FunctionsCostMap &CostMap) { … }

/// \return true if \p F can be indirectly called
static bool canBeIndirectlyCalled(const Function &F) { … }

//===----------------------------------------------------------------------===//
// Graph-based Module Representation
//===----------------------------------------------------------------------===//

/// AMDGPUSplitModule's view of the source Module, as a graph of all components
/// that can be split into different modules.
///
/// The most trivial instance of this graph is just the CallGraph of the module,
/// but it is not guaranteed that the graph is strictly equal to the CG. It
/// currently always is but it's designed in a way that would eventually allow
/// us to create abstract nodes, or nodes for different entities such as global
/// variables or any other meaningful constraint we must consider.
///
/// The graph is only mutable by this class, and is generally not modified
/// after \ref SplitGraph::buildGraph runs. No consumers of the graph can
/// mutate it.
class SplitGraph { … };

/// Nodes in the SplitGraph contain both incoming, and outgoing edges.
/// Incoming edges have this node as their Dst, and Outgoing ones have this node
/// as their Src.
///
/// Edge objects are shared by both nodes in Src/Dst. They provide immediate
/// feedback on how two nodes are related, and in which direction they are
/// related, which is valuable information to make splitting decisions.
///
/// Nodes are fundamentally abstract, and any consumers of the graph should
/// treat them as such. While a node will be a function most of the time, we
/// could also create nodes for any other reason. In the future, we could have
/// single nodes for multiple functions, or nodes for GVs, etc.
class SplitGraph::Node { … };

void SplitGraph::Node::visitAllDependencies(
    std::function<void(const Node &)> Visitor) const { … }

void SplitGraph::buildGraph(CallGraph &CG) { … }

#ifndef NDEBUG
bool SplitGraph::verifyGraph() const {
  unsigned ExpectedID = 0;
  // Exceptionally using a set here in case IDs are messed up.
  DenseSet<const Node *> SeenNodes;
  DenseSet<const Function *> SeenFunctionNodes;
  for (const Node *N : Nodes) {
    if (N->getID() != (ExpectedID++)) {
      errs() << "Node IDs are incorrect!\n";
      return false;
    }

    if (!SeenNodes.insert(N).second) {
      errs() << "Node seen more than once!\n";
      return false;
    }

    if (&getNode(N->getID()) != N) {
      errs() << "getNode doesn't return the right node\n";
      return false;
    }

    for (const Edge *E : N->IncomingEdges) {
      if (!E->Src || !E->Dst || (E->Dst != N) ||
          (find(E->Src->OutgoingEdges, E) == E->Src->OutgoingEdges.end())) {
        errs() << "ill-formed incoming edges\n";
        return false;
      }
    }

    for (const Edge *E : N->OutgoingEdges) {
      if (!E->Src || !E->Dst || (E->Src != N) ||
          (find(E->Dst->IncomingEdges, E) == E->Dst->IncomingEdges.end())) {
        errs() << "ill-formed outgoing edges\n";
        return false;
      }
    }

    const Function &Fn = N->getFunction();
    if (AMDGPU::isEntryFunctionCC(Fn.getCallingConv())) {
      if (N->hasAnyIncomingEdges()) {
        errs() << "Kernels cannot have incoming edges\n";
        return false;
      }
    }

    if (Fn.isDeclaration()) {
      errs() << "declarations shouldn't have nodes!\n";
      return false;
    }

    auto [It, Inserted] = SeenFunctionNodes.insert(&Fn);
    if (!Inserted) {
      errs() << "one function has multiple nodes!\n";
      return false;
    }
  }

  if (ExpectedID != Nodes.size()) {
    errs() << "Node IDs out of sync!\n";
    return false;
  }

  if (createNodesBitVector().size() != getNumNodes()) {
    errs() << "nodes bit vector doesn't have the right size!\n";
    return false;
  }

  // Check we respect the promise of Node::isKernel
  BitVector BV = createNodesBitVector();
  for (const Node *N : nodes()) {
    if (N->isGraphEntryPoint())
      N->getDependencies(BV);
  }

  // Ensure each function in the module has an associated node.
  for (const auto &Fn : M) {
    if (!Fn.isDeclaration()) {
      if (!SeenFunctionNodes.contains(&Fn)) {
        errs() << "Fn has no associated node in the graph!\n";
        return false;
      }
    }
  }

  if (!BV.all()) {
    errs() << "not all nodes are reachable through the graph's entry points!\n";
    return false;
  }

  return true;
}
#endif

CostType SplitGraph::calculateCost(const BitVector &BV) const { … }

SplitGraph::Node &
SplitGraph::getNode(DenseMap<const GlobalValue *, Node *> &Cache,
                    const GlobalValue &GV) { … }

const SplitGraph::Edge &SplitGraph::createEdge(Node &Src, Node &Dst,
                                               EdgeKind EK) { … }

//===----------------------------------------------------------------------===//
// Split Proposals
//===----------------------------------------------------------------------===//

/// Represents a module splitting proposal.
///
/// Proposals are made of N BitVectors, one for each partition, where each bit
/// set indicates that the node is present and should be copied inside that
/// partition.
///
/// Proposals have several metrics attached so they can be compared/sorted,
/// which allows the driver to try multiple strategies resulting in multiple
/// proposals and choose the best one out of them.
class SplitProposal { … };

void SplitProposal::print(raw_ostream &OS) const { … }

unsigned SplitProposal::findCheapestPartition() const { … }

void SplitProposal::calculateScores() { … }

#ifndef NDEBUG
void SplitProposal::verifyCompleteness() const {
  if (Partitions.empty())
    return;

  BitVector Result = Partitions[0].second;
  for (const auto &P : drop_begin(Partitions))
    Result |= P.second;
  assert(Result.all() && "some nodes are missing from this proposal!");
}
#endif

//===-- RecursiveSearchStrategy -------------------------------------------===//

/// Partitioning algorithm.
///
/// This is a recursive search algorithm that can explore multiple possibilities.
///
/// When a cluster of nodes can go into more than one partition, and we haven't
/// reached maximum search depth, we recurse and explore both options and their
/// consequences. Both branches will yield a proposal, and the driver will grade
/// both and choose the best one.
///
/// If max depth is reached, we will use some heuristics to make a choice. Most
/// of the time we will just use the least-pressured (cheapest) partition, but
/// if a cluster is particularly big and there is a good amount of overlap with
/// an existing partition, we will choose that partition instead.
class RecursiveSearchSplitting { … };

RecursiveSearchSplitting::RecursiveSearchSplitting(
    const SplitGraph &SG, unsigned NumParts, SubmitProposalFn SubmitProposal)
    : … { … }

void RecursiveSearchSplitting::run() { … }

void RecursiveSearchSplitting::setupWorkList() { … }

void RecursiveSearchSplitting::pickPartition(unsigned Depth, unsigned Idx,
                                             SplitProposal SP) { … }

std::pair<unsigned, CostType>
RecursiveSearchSplitting::findMostSimilarPartition(const WorkListEntry &Entry,
                                                   const SplitProposal &SP) { … }

//===----------------------------------------------------------------------===//
// DOTGraph Printing Support
//===----------------------------------------------------------------------===//

const SplitGraph::Node *mapEdgeToDst(const SplitGraph::Edge *E) { … }

SplitGraphEdgeDstIterator;

} // namespace

template <> struct GraphTraits<SplitGraph> { … };

template <> struct DOTGraphTraits<SplitGraph> : public DefaultDOTGraphTraits { … };

//===----------------------------------------------------------------------===//
// Driver
//===----------------------------------------------------------------------===//

namespace {

// If we didn't externalize GVs, then local GVs need to be conservatively
// imported into every module (including their initializers), and then cleaned
// up afterwards.
static bool needsConservativeImport(const GlobalValue *GV) { … }

/// Prints a summary of the partition \p N, represented by module \p M, to \p
/// OS.
static void printPartitionSummary(raw_ostream &OS, unsigned N, const Module &M,
                                  unsigned PartCost, unsigned ModuleCost) { … }

static void evaluateProposal(SplitProposal &Best, SplitProposal New) { … }

/// Trivial helper to create an identical copy of \p M.
static std::unique_ptr<Module> cloneAll(const Module &M) { … }

/// Writes \p SG as a DOTGraph to \ref ModuleDotCfgOutput if requested.
static void writeDOTGraph(const SplitGraph &SG) { … }

static void splitAMDGPUModule(
    GetTTIFn GetTTI, Module &M, unsigned NumParts,
    function_ref<void(std::unique_ptr<Module> MPart)> ModuleCallback) { … }
} // namespace

PreservedAnalyses AMDGPUSplitModulePass::run(Module &M,
                                             ModuleAnalysisManager &MAM) { … }
} // namespace llvm
llvm/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp