#include "AMDGPUSplitModule.h"
#include "AMDGPUTargetMachine.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/EquivalenceClasses.h"
#include "llvm/ADT/GraphTraits.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/CallGraph.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/Allocator.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/DOTGraphTraits.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/GraphWriter.h"
#include "llvm/Support/Path.h"
#include "llvm/Support/Timer.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include <algorithm>
#include <cassert>
#include <cmath>
#include <iterator>
#include <memory>
#include <utility>
#include <vector>
#ifndef NDEBUG
#include "llvm/Support/LockFileManager.h"
#endif
#define DEBUG_TYPE …
namespace llvm {
namespace {
static cl::opt<unsigned> MaxDepth(
"amdgpu-module-splitting-max-depth",
cl::desc(
"maximum search depth. 0 forces a greedy approach. "
"warning: the algorithm is up to O(2^N), where N is the max depth."),
cl::init(8));
static cl::opt<float> LargeFnFactor(
"amdgpu-module-splitting-large-threshold", cl::init(2.0f), cl::Hidden,
cl::desc(
"when max depth is reached and we can no longer branch out, this "
"value determines if a function is worth merging into an already "
"existing partition to reduce code duplication. This is a factor "
"of the ideal partition size, e.g. 2.0 means we consider the "
"function for merging if its cost (including its callees) is 2x the "
"size of an ideal partition."));
static cl::opt<float> LargeFnOverlapForMerge(
"amdgpu-module-splitting-merge-threshold", cl::init(0.7f), cl::Hidden,
cl::desc("when a function is considered for merging into a partition that "
"already contains some of its callees, do the merge if at least "
"n% of the code it can reach is already present inside the "
"partition; e.g. 0.7 means only merge >70%"));
static cl::opt<bool> NoExternalizeGlobals(
"amdgpu-module-splitting-no-externalize-globals", cl::Hidden,
cl::desc("disables externalization of global variable with local linkage; "
"may cause globals to be duplicated which increases binary size"));
static cl::opt<std::string>
ModuleDotCfgOutput("amdgpu-module-splitting-print-module-dotcfg",
cl::Hidden,
cl::desc("output file to write out the dotgraph "
"representation of the input module"));
static cl::opt<std::string> PartitionSummariesOutput(
"amdgpu-module-splitting-print-partition-summaries", cl::Hidden,
cl::desc("output file to write out a summary of "
"the partitions created for each module"));
#ifndef NDEBUG
static cl::opt<bool>
UseLockFile("amdgpu-module-splitting-serial-execution", cl::Hidden,
cl::desc("use a lock file so only one process in the system "
"can run this pass at once. useful to avoid mangled "
"debug output in multithreaded environments."));
static cl::opt<bool>
DebugProposalSearch("amdgpu-module-splitting-debug-proposal-search",
cl::Hidden,
cl::desc("print all proposals received and whether "
"they were rejected or accepted"));
#endif
struct SplitModuleTimer : NamedRegionTimer { … };
CostType;
FunctionsCostMap;
GetTTIFn;
static constexpr unsigned InvalidPID = …;
static auto formatRatioOf(CostType Num, CostType Dem) { … }
static bool isNonCopyable(const Function &F) { … }
static void externalize(GlobalValue &GV) { … }
static CostType calculateFunctionCosts(GetTTIFn GetTTI, Module &M,
FunctionsCostMap &CostMap) { … }
static bool canBeIndirectlyCalled(const Function &F) { … }
class SplitGraph { … };
class SplitGraph::Node { … };
void SplitGraph::Node::visitAllDependencies(
std::function<void(const Node &)> Visitor) const { … }
void SplitGraph::buildGraph(CallGraph &CG) { … }
#ifndef NDEBUG
bool SplitGraph::verifyGraph() const {
unsigned ExpectedID = 0;
DenseSet<const Node *> SeenNodes;
DenseSet<const Function *> SeenFunctionNodes;
for (const Node *N : Nodes) {
if (N->getID() != (ExpectedID++)) {
errs() << "Node IDs are incorrect!\n";
return false;
}
if (!SeenNodes.insert(N).second) {
errs() << "Node seen more than once!\n";
return false;
}
if (&getNode(N->getID()) != N) {
errs() << "getNode doesn't return the right node\n";
return false;
}
for (const Edge *E : N->IncomingEdges) {
if (!E->Src || !E->Dst || (E->Dst != N) ||
(find(E->Src->OutgoingEdges, E) == E->Src->OutgoingEdges.end())) {
errs() << "ill-formed incoming edges\n";
return false;
}
}
for (const Edge *E : N->OutgoingEdges) {
if (!E->Src || !E->Dst || (E->Src != N) ||
(find(E->Dst->IncomingEdges, E) == E->Dst->IncomingEdges.end())) {
errs() << "ill-formed outgoing edges\n";
return false;
}
}
const Function &Fn = N->getFunction();
if (AMDGPU::isEntryFunctionCC(Fn.getCallingConv())) {
if (N->hasAnyIncomingEdges()) {
errs() << "Kernels cannot have incoming edges\n";
return false;
}
}
if (Fn.isDeclaration()) {
errs() << "declarations shouldn't have nodes!\n";
return false;
}
auto [It, Inserted] = SeenFunctionNodes.insert(&Fn);
if (!Inserted) {
errs() << "one function has multiple nodes!\n";
return false;
}
}
if (ExpectedID != Nodes.size()) {
errs() << "Node IDs out of sync!\n";
return false;
}
if (createNodesBitVector().size() != getNumNodes()) {
errs() << "nodes bit vector doesn't have the right size!\n";
return false;
}
BitVector BV = createNodesBitVector();
for (const Node *N : nodes()) {
if (N->isGraphEntryPoint())
N->getDependencies(BV);
}
for (const auto &Fn : M) {
if (!Fn.isDeclaration()) {
if (!SeenFunctionNodes.contains(&Fn)) {
errs() << "Fn has no associated node in the graph!\n";
return false;
}
}
}
if (!BV.all()) {
errs() << "not all nodes are reachable through the graph's entry points!\n";
return false;
}
return true;
}
#endif
CostType SplitGraph::calculateCost(const BitVector &BV) const { … }
SplitGraph::Node &
SplitGraph::getNode(DenseMap<const GlobalValue *, Node *> &Cache,
const GlobalValue &GV) { … }
const SplitGraph::Edge &SplitGraph::createEdge(Node &Src, Node &Dst,
EdgeKind EK) { … }
class SplitProposal { … };
void SplitProposal::print(raw_ostream &OS) const { … }
unsigned SplitProposal::findCheapestPartition() const { … }
void SplitProposal::calculateScores() { … }
#ifndef NDEBUG
void SplitProposal::verifyCompleteness() const {
if (Partitions.empty())
return;
BitVector Result = Partitions[0].second;
for (const auto &P : drop_begin(Partitions))
Result |= P.second;
assert(Result.all() && "some nodes are missing from this proposal!");
}
#endif
class RecursiveSearchSplitting { … };
RecursiveSearchSplitting::RecursiveSearchSplitting(
const SplitGraph &SG, unsigned NumParts, SubmitProposalFn SubmitProposal)
: … { … }
void RecursiveSearchSplitting::run() { … }
void RecursiveSearchSplitting::setupWorkList() { … }
void RecursiveSearchSplitting::pickPartition(unsigned Depth, unsigned Idx,
SplitProposal SP) { … }
std::pair<unsigned, CostType>
RecursiveSearchSplitting::findMostSimilarPartition(const WorkListEntry &Entry,
const SplitProposal &SP) { … }
const SplitGraph::Node *mapEdgeToDst(const SplitGraph::Edge *E) { … }
SplitGraphEdgeDstIterator;
}
template <> struct GraphTraits<SplitGraph> { … };
template <> struct DOTGraphTraits<SplitGraph> : public DefaultDOTGraphTraits { … };
namespace {
static bool needsConservativeImport(const GlobalValue *GV) { … }
static void printPartitionSummary(raw_ostream &OS, unsigned N, const Module &M,
unsigned PartCost, unsigned ModuleCost) { … }
static void evaluateProposal(SplitProposal &Best, SplitProposal New) { … }
static std::unique_ptr<Module> cloneAll(const Module &M) { … }
static void writeDOTGraph(const SplitGraph &SG) { … }
static void splitAMDGPUModule(
GetTTIFn GetTTI, Module &M, unsigned NumParts,
function_ref<void(std::unique_ptr<Module> MPart)> ModuleCallback) { … }
}
PreservedAnalyses AMDGPUSplitModulePass::run(Module &M,
ModuleAnalysisManager &MAM) { … }
}