CodeLayout.cpp | Explore in Territory

//===- CodeLayout.cpp - Implementation of code layout algorithms ----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// The file implements "cache-aware" layout algorithms of basic blocks and
// functions in a binary.
//
// The algorithm tries to find a layout of nodes (basic blocks) of a given CFG
// optimizing jump locality and thus processor I-cache utilization. This is
// achieved via increasing the number of fall-through jumps and co-locating
// frequently executed nodes together. The name follows the underlying
// optimization problem, Extended-TSP, which is a generalization of classical
// (maximum) Traveling Salesmen Problem.
//
// The algorithm is a greedy heuristic that works with chains (ordered lists)
// of basic blocks. Initially all chains are isolated basic blocks. On every
// iteration, we pick a pair of chains whose merging yields the biggest increase
// in the ExtTSP score, which models how i-cache "friendly" a specific chain is.
// A pair of chains giving the maximum gain is merged into a new chain. The
// procedure stops when there is only one chain left, or when merging does not
// increase ExtTSP. In the latter case, the remaining chains are sorted by
// density in the decreasing order.
//
// An important aspect is the way two chains are merged. Unlike earlier
// algorithms (e.g., based on the approach of Pettis-Hansen), two
// chains, X and Y, are first split into three, X1, X2, and Y. Then we
// consider all possible ways of gluing the three chains (e.g., X1YX2, X1X2Y,
// X2X1Y, X2YX1, YX1X2, YX2X1) and choose the one producing the largest score.
// This improves the quality of the final result (the search space is larger)
// while keeping the implementation sufficiently fast.
//
// Reference:
//   * A. Newell and S. Pupyrev, Improved Basic Block Reordering,
//     IEEE Transactions on Computers, 2020
//     https://arxiv.org/abs/1809.04676
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Utils/CodeLayout.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"

#include <cmath>
#include <set>

usingnamespacellvm;
usingnamespacellvm::codelayout;

#define DEBUG_TYPE …

namespace llvm {
cl::opt<bool> EnableExtTspBlockPlacement(
    "enable-ext-tsp-block-placement", cl::Hidden, cl::init(false),
    cl::desc("Enable machine block placement based on the ext-tsp model, "
             "optimizing I-cache utilization."));

cl::opt<bool> ApplyExtTspWithoutProfile(
    "ext-tsp-apply-without-profile",
    cl::desc("Whether to apply ext-tsp placement for instances w/o profile"),
    cl::init(true), cl::Hidden);
} // namespace llvm

// Algorithm-specific params for Ext-TSP. The values are tuned for the best
// performance of large-scale front-end bound binaries.
static cl::opt<double> ForwardWeightCond(
    "ext-tsp-forward-weight-cond", cl::ReallyHidden, cl::init(0.1),
    cl::desc("The weight of conditional forward jumps for ExtTSP value"));

static cl::opt<double> ForwardWeightUncond(
    "ext-tsp-forward-weight-uncond", cl::ReallyHidden, cl::init(0.1),
    cl::desc("The weight of unconditional forward jumps for ExtTSP value"));

static cl::opt<double> BackwardWeightCond(
    "ext-tsp-backward-weight-cond", cl::ReallyHidden, cl::init(0.1),
    cl::desc("The weight of conditional backward jumps for ExtTSP value"));

static cl::opt<double> BackwardWeightUncond(
    "ext-tsp-backward-weight-uncond", cl::ReallyHidden, cl::init(0.1),
    cl::desc("The weight of unconditional backward jumps for ExtTSP value"));

static cl::opt<double> FallthroughWeightCond(
    "ext-tsp-fallthrough-weight-cond", cl::ReallyHidden, cl::init(1.0),
    cl::desc("The weight of conditional fallthrough jumps for ExtTSP value"));

static cl::opt<double> FallthroughWeightUncond(
    "ext-tsp-fallthrough-weight-uncond", cl::ReallyHidden, cl::init(1.05),
    cl::desc("The weight of unconditional fallthrough jumps for ExtTSP value"));

static cl::opt<unsigned> ForwardDistance(
    "ext-tsp-forward-distance", cl::ReallyHidden, cl::init(1024),
    cl::desc("The maximum distance (in bytes) of a forward jump for ExtTSP"));

static cl::opt<unsigned> BackwardDistance(
    "ext-tsp-backward-distance", cl::ReallyHidden, cl::init(640),
    cl::desc("The maximum distance (in bytes) of a backward jump for ExtTSP"));

// The maximum size of a chain created by the algorithm. The size is bounded
// so that the algorithm can efficiently process extremely large instances.
static cl::opt<unsigned>
    MaxChainSize("ext-tsp-max-chain-size", cl::ReallyHidden, cl::init(512),
                 cl::desc("The maximum size of a chain to create"));

// The maximum size of a chain for splitting. Larger values of the threshold
// may yield better quality at the cost of worsen run-time.
static cl::opt<unsigned> ChainSplitThreshold(
    "ext-tsp-chain-split-threshold", cl::ReallyHidden, cl::init(128),
    cl::desc("The maximum size of a chain to apply splitting"));

// The maximum ratio between densities of two chains for merging.
static cl::opt<double> MaxMergeDensityRatio(
    "ext-tsp-max-merge-density-ratio", cl::ReallyHidden, cl::init(100),
    cl::desc("The maximum ratio between densities of two chains for merging"));

// Algorithm-specific options for CDSort.
static cl::opt<unsigned> CacheEntries("cdsort-cache-entries", cl::ReallyHidden,
                                      cl::desc("The size of the cache"));

static cl::opt<unsigned> CacheSize("cdsort-cache-size", cl::ReallyHidden,
                                   cl::desc("The size of a line in the cache"));

static cl::opt<unsigned>
    CDMaxChainSize("cdsort-max-chain-size", cl::ReallyHidden,
                   cl::desc("The maximum size of a chain to create"));

static cl::opt<double> DistancePower(
    "cdsort-distance-power", cl::ReallyHidden,
    cl::desc("The power exponent for the distance-based locality"));

static cl::opt<double> FrequencyScale(
    "cdsort-frequency-scale", cl::ReallyHidden,
    cl::desc("The scale factor for the frequency-based locality"));

namespace {

// Epsilon for comparison of doubles.
constexpr double EPS = …;

// Compute the Ext-TSP score for a given jump.
double jumpExtTSPScore(uint64_t JumpDist, uint64_t JumpMaxDist, uint64_t Count,
                       double Weight) { … }

// Compute the Ext-TSP score for a jump between a given pair of blocks,
// using their sizes, (estimated) addresses and the jump execution count.
double extTSPScore(uint64_t SrcAddr, uint64_t SrcSize, uint64_t DstAddr,
                   uint64_t Count, bool IsConditional) { … }

/// A type of merging two chains, X and Y. The former chain is split into
/// X1 and X2 and then concatenated with Y in the order specified by the type.
enum class MergeTypeT : int { … };

/// The gain of merging two chains, that is, the Ext-TSP score of the merge
/// together with the corresponding merge 'type' and 'offset'.
struct MergeGainT { … };

struct JumpT;
struct ChainT;
struct ChainEdge;

/// A node in the graph, typically corresponding to a basic block in the CFG or
/// a function in the call graph.
struct NodeT { … };

/// An arc in the graph, typically corresponding to a jump between two nodes.
struct JumpT { … };

/// A chain (ordered sequence) of nodes in the graph.
struct ChainT { … };

/// An edge in the graph representing jumps between two chains.
/// When nodes are merged into chains, the edges are combined too so that
/// there is always at most one edge between a pair of chains.
struct ChainEdge { … };

bool NodeT::isSuccessor(const NodeT *Other) const { … }

uint64_t NodeT::outCount() const { … }

uint64_t NodeT::inCount() const { … }

void ChainT::mergeEdges(ChainT *Other) { … }

NodeIter;
static std::vector<NodeT *> EmptyList;

/// A wrapper around three concatenated vectors (chains) of nodes; it is used
/// to avoid extra instantiation of the vectors.
struct MergedNodesT { … };

/// A wrapper around two concatenated vectors (chains) of jumps.
struct MergedJumpsT { … };

/// Merge two chains of nodes respecting a given 'type' and 'offset'.
///
/// If MergeType == 0, then the result is a concatenation of two chains.
/// Otherwise, the first chain is cut into two sub-chains at the offset,
/// and merged using all possible ways of concatenating three chains.
MergedNodesT mergeNodes(const std::vector<NodeT *> &X,
                        const std::vector<NodeT *> &Y, size_t MergeOffset,
                        MergeTypeT MergeType) { … }

/// The implementation of the ExtTSP algorithm.
class ExtTSPImpl { … };

/// The implementation of the Cache-Directed Sort (CDSort) algorithm for
/// ordering functions represented by a call graph.
class CDSortImpl { … };

} // end of anonymous namespace

std::vector<uint64_t>
codelayout::computeExtTspLayout(ArrayRef<uint64_t> NodeSizes,
                                ArrayRef<uint64_t> NodeCounts,
                                ArrayRef<EdgeCount> EdgeCounts) { … }

double codelayout::calcExtTspScore(ArrayRef<uint64_t> Order,
                                   ArrayRef<uint64_t> NodeSizes,
                                   ArrayRef<uint64_t> NodeCounts,
                                   ArrayRef<EdgeCount> EdgeCounts) { … }

double codelayout::calcExtTspScore(ArrayRef<uint64_t> NodeSizes,
                                   ArrayRef<uint64_t> NodeCounts,
                                   ArrayRef<EdgeCount> EdgeCounts) { … }

std::vector<uint64_t> codelayout::computeCacheDirectedLayout(
    const CDSortConfig &Config, ArrayRef<uint64_t> FuncSizes,
    ArrayRef<uint64_t> FuncCounts, ArrayRef<EdgeCount> CallCounts,
    ArrayRef<uint64_t> CallOffsets) { … }

std::vector<uint64_t> codelayout::computeCacheDirectedLayout(
    ArrayRef<uint64_t> FuncSizes, ArrayRef<uint64_t> FuncCounts,
    ArrayRef<EdgeCount> CallCounts, ArrayRef<uint64_t> CallOffsets) { … }
llvm/llvm/lib/Transforms/Utils/CodeLayout.cpp