BlockFrequencyInfoImpl.h | Explore in Territory

//==- BlockFrequencyInfoImpl.h - Block Frequency Implementation --*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Shared implementation of BlockFrequency for IR and Machine Instructions.
// See the documentation below for BlockFrequencyInfoImpl for details.
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_ANALYSIS_BLOCKFREQUENCYINFOIMPL_H
#define LLVM_ANALYSIS_BLOCKFREQUENCYINFOIMPL_H

#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/GraphTraits.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/SparseBitVector.h"
#include "llvm/ADT/Twine.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/Support/BlockFrequency.h"
#include "llvm/Support/BranchProbability.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/DOTGraphTraits.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/ScaledNumber.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <deque>
#include <iterator>
#include <limits>
#include <list>
#include <optional>
#include <queue>
#include <string>
#include <utility>
#include <vector>

#define DEBUG_TYPE …

namespace llvm {
extern llvm::cl::opt<bool> CheckBFIUnknownBlockQueries;

extern llvm::cl::opt<bool> UseIterativeBFIInference;
extern llvm::cl::opt<unsigned> IterativeBFIMaxIterationsPerBlock;
extern llvm::cl::opt<double> IterativeBFIPrecision;

class BranchProbabilityInfo;
class Function;
class Loop;
class LoopInfo;
class MachineBasicBlock;
class MachineBranchProbabilityInfo;
class MachineFunction;
class MachineLoop;
class MachineLoopInfo;

namespace bfi_detail {

struct IrreducibleGraph;

// This is part of a workaround for a GCC 4.7 crash on lambdas.
template <class BT> struct BlockEdgesAdder;

/// Mass of a block.
///
/// This class implements a sort of fixed-point fraction always between 0.0 and
/// 1.0.  getMass() == std::numeric_limits<uint64_t>::max() indicates a value of
/// 1.0.
///
/// Masses can be added and subtracted.  Simple saturation arithmetic is used,
/// so arithmetic operations never overflow or underflow.
///
/// Masses can be multiplied.  Multiplication treats full mass as 1.0 and uses
/// an inexpensive floating-point algorithm that's off-by-one (almost, but not
/// quite, maximum precision).
///
/// Masses can be scaled by \a BranchProbability at maximum precision.
class BlockMass { … };

inline BlockMass operator+(BlockMass L, BlockMass R) { … }
inline BlockMass operator-(BlockMass L, BlockMass R) { … }
inline BlockMass operator*(BlockMass L, BranchProbability R) { … }
inline BlockMass operator*(BranchProbability L, BlockMass R) { … }

inline raw_ostream &operator<<(raw_ostream &OS, BlockMass X) { … }

} // end namespace bfi_detail

/// Base class for BlockFrequencyInfoImpl
///
/// BlockFrequencyInfoImplBase has supporting data structures and some
/// algorithms for BlockFrequencyInfoImplBase.  Only algorithms that depend on
/// the block type (or that call such algorithms) are skipped here.
///
/// Nevertheless, the majority of the overall algorithm documentation lives with
/// BlockFrequencyInfoImpl.  See there for details.
class BlockFrequencyInfoImplBase { … };

namespace bfi_detail {

template <class BlockT> struct TypeMap { … };
template <> struct TypeMap<BasicBlock> { … };
template <> struct TypeMap<MachineBasicBlock> { … };

template <class BlockT, class BFIImplT>
class BFICallbackVH;

/// Get the name of a MachineBasicBlock.
///
/// Get the name of a MachineBasicBlock.  It's templated so that including from
/// CodeGen is unnecessary (that would be a layering issue).
///
/// This is used mainly for debug output.  The name is similar to
/// MachineBasicBlock::getFullName(), but skips the name of the function.
template <class BlockT> std::string getBlockName(const BlockT *BB) { … }
/// Get the name of a BasicBlock.
template <> inline std::string getBlockName(const BasicBlock *BB) { … }

/// Graph of irreducible control flow.
///
/// This graph is used for determining the SCCs in a loop (or top-level
/// function) that has irreducible control flow.
///
/// During the block frequency algorithm, the local graphs are defined in a
/// light-weight way, deferring to the \a BasicBlock or \a MachineBasicBlock
/// graphs for most edges, but getting others from \a LoopData::ExitMap.  The
/// latter only has successor information.
///
/// \a IrreducibleGraph makes this graph explicit.  It's in a form that can use
/// \a GraphTraits (so that \a analyzeIrreducible() can use \a scc_iterator),
/// and it explicitly lists predecessors and successors.  The initialization
/// that relies on \c MachineBasicBlock is defined in the header.
struct IrreducibleGraph { … };

template <class BlockEdgesAdder>
void IrreducibleGraph::initialize(const BFIBase::LoopData *OuterLoop,
                                  BlockEdgesAdder addBlockEdges) { … }

template <class BlockEdgesAdder>
void IrreducibleGraph::addEdges(const BlockNode &Node,
                                const BFIBase::LoopData *OuterLoop,
                                BlockEdgesAdder addBlockEdges) { … }

} // end namespace bfi_detail

/// Shared implementation for block frequency analysis.
///
/// This is a shared implementation of BlockFrequencyInfo and
/// MachineBlockFrequencyInfo, and calculates the relative frequencies of
/// blocks.
///
/// LoopInfo defines a loop as a "non-trivial" SCC dominated by a single block,
/// which is called the header.  A given loop, L, can have sub-loops, which are
/// loops within the subgraph of L that exclude its header.  (A "trivial" SCC
/// consists of a single block that does not have a self-edge.)
///
/// In addition to loops, this algorithm has limited support for irreducible
/// SCCs, which are SCCs with multiple entry blocks.  Irreducible SCCs are
/// discovered on the fly, and modelled as loops with multiple headers.
///
/// The headers of irreducible sub-SCCs consist of its entry blocks and all
/// nodes that are targets of a backedge within it (excluding backedges within
/// true sub-loops).  Block frequency calculations act as if a block is
/// inserted that intercepts all the edges to the headers.  All backedges and
/// entries point to this block.  Its successors are the headers, which split
/// the frequency evenly.
///
/// This algorithm leverages BlockMass and ScaledNumber to maintain precision,
/// separates mass distribution from loop scaling, and dithers to eliminate
/// probability mass loss.
///
/// The implementation is split between BlockFrequencyInfoImpl, which knows the
/// type of graph being modelled (BasicBlock vs. MachineBasicBlock), and
/// BlockFrequencyInfoImplBase, which doesn't.  The base class uses \a
/// BlockNode, a wrapper around a uint32_t.  BlockNode is numbered from 0 in
/// reverse-post order.  This gives two advantages:  it's easy to compare the
/// relative ordering of two nodes, and maps keyed on BlockT can be represented
/// by vectors.
///
/// This algorithm is O(V+E), unless there is irreducible control flow, in
/// which case it's O(V*E) in the worst case.
///
/// These are the main stages:
///
///  0. Reverse post-order traversal (\a initializeRPOT()).
///
///     Run a single post-order traversal and save it (in reverse) in RPOT.
///     All other stages make use of this ordering.  Save a lookup from BlockT
///     to BlockNode (the index into RPOT) in Nodes.
///
///  1. Loop initialization (\a initializeLoops()).
///
///     Translate LoopInfo/MachineLoopInfo into a form suitable for the rest of
///     the algorithm.  In particular, store the immediate members of each loop
///     in reverse post-order.
///
///  2. Calculate mass and scale in loops (\a computeMassInLoops()).
///
///     For each loop (bottom-up), distribute mass through the DAG resulting
///     from ignoring backedges and treating sub-loops as a single pseudo-node.
///     Track the backedge mass distributed to the loop header, and use it to
///     calculate the loop scale (number of loop iterations).  Immediate
///     members that represent sub-loops will already have been visited and
///     packaged into a pseudo-node.
///
///     Distributing mass in a loop is a reverse-post-order traversal through
///     the loop.  Start by assigning full mass to the Loop header.  For each
///     node in the loop:
///
///         - Fetch and categorize the weight distribution for its successors.
///           If this is a packaged-subloop, the weight distribution is stored
///           in \a LoopData::Exits.  Otherwise, fetch it from
///           BranchProbabilityInfo.
///
///         - Each successor is categorized as \a Weight::Local, a local edge
///           within the current loop, \a Weight::Backedge, a backedge to the
///           loop header, or \a Weight::Exit, any successor outside the loop.
///           The weight, the successor, and its category are stored in \a
///           Distribution.  There can be multiple edges to each successor.
///
///         - If there's a backedge to a non-header, there's an irreducible SCC.
///           The usual flow is temporarily aborted.  \a
///           computeIrreducibleMass() finds the irreducible SCCs within the
///           loop, packages them up, and restarts the flow.
///
///         - Normalize the distribution:  scale weights down so that their sum
///           is 32-bits, and coalesce multiple edges to the same node.
///
///         - Distribute the mass accordingly, dithering to minimize mass loss,
///           as described in \a distributeMass().
///
///     In the case of irreducible loops, instead of a single loop header,
///     there will be several. The computation of backedge masses is similar
///     but instead of having a single backedge mass, there will be one
///     backedge per loop header. In these cases, each backedge will carry
///     a mass proportional to the edge weights along the corresponding
///     path.
///
///     At the end of propagation, the full mass assigned to the loop will be
///     distributed among the loop headers proportionally according to the
///     mass flowing through their backedges.
///
///     Finally, calculate the loop scale from the accumulated backedge mass.
///
///  3. Distribute mass in the function (\a computeMassInFunction()).
///
///     Finally, distribute mass through the DAG resulting from packaging all
///     loops in the function.  This uses the same algorithm as distributing
///     mass in a loop, except that there are no exit or backedge edges.
///
///  4. Unpackage loops (\a unwrapLoops()).
///
///     Initialize each block's frequency to a floating point representation of
///     its mass.
///
///     Visit loops top-down, scaling the frequencies of its immediate members
///     by the loop's pseudo-node's frequency.
///
///  5. Convert frequencies to a 64-bit range (\a finalizeMetrics()).
///
///     Using the min and max frequencies as a guide, translate floating point
///     frequencies to an appropriate range in uint64_t.
///
/// It has some known flaws.
///
///   - The model of irreducible control flow is a rough approximation.
///
///     Modelling irreducible control flow exactly involves setting up and
///     solving a group of infinite geometric series.  Such precision is
///     unlikely to be worthwhile, since most of our algorithms give up on
///     irreducible control flow anyway.
///
///     Nevertheless, we might find that we need to get closer.  Here's a sort
///     of TODO list for the model with diminishing returns, to be completed as
///     necessary.
///
///       - The headers for the \a LoopData representing an irreducible SCC
///         include non-entry blocks.  When these extra blocks exist, they
///         indicate a self-contained irreducible sub-SCC.  We could treat them
///         as sub-loops, rather than arbitrarily shoving the problematic
///         blocks into the headers of the main irreducible SCC.
///
///       - Entry frequencies are assumed to be evenly split between the
///         headers of a given irreducible SCC, which is the only option if we
///         need to compute mass in the SCC before its parent loop.  Instead,
///         we could partially compute mass in the parent loop, and stop when
///         we get to the SCC.  Here, we have the correct ratio of entry
///         masses, which we can use to adjust their relative frequencies.
///         Compute mass in the SCC, and then continue propagation in the
///         parent.
///
///       - We can propagate mass iteratively through the SCC, for some fixed
///         number of iterations.  Each iteration starts by assigning the entry
///         blocks their backedge mass from the prior iteration.  The final
///         mass for each block (and each exit, and the total backedge mass
///         used for computing loop scale) is the sum of all iterations.
///         (Running this until fixed point would "solve" the geometric
///         series by simulation.)
template <class BT> class BlockFrequencyInfoImpl : BlockFrequencyInfoImplBase { … };

namespace bfi_detail {

BFICallbackVH<BasicBlock, BFIImplT>;

/// Dummy implementation since MachineBasicBlocks aren't Values, so ValueHandles
/// don't apply to them.
BFICallbackVH<MachineBasicBlock, BFIImplT>;

} // end namespace bfi_detail

template <class BT>
void BlockFrequencyInfoImpl<BT>::calculate(const FunctionT &F,
                                           const BranchProbabilityInfoT &BPI,
                                           const LoopInfoT &LI) { … }

template <class BT>
void BlockFrequencyInfoImpl<BT>::setBlockFreq(const BlockT *BB,
                                              BlockFrequency Freq) { … }

template <class BT> void BlockFrequencyInfoImpl<BT>::initializeRPOT() { … }

template <class BT> void BlockFrequencyInfoImpl<BT>::initializeLoops() { … }

template <class BT> void BlockFrequencyInfoImpl<BT>::computeMassInLoops() { … }

template <class BT>
bool BlockFrequencyInfoImpl<BT>::computeMassInLoop(LoopData &Loop) { … }

template <class BT>
bool BlockFrequencyInfoImpl<BT>::tryToComputeMassInFunction() { … }

template <class BT> void BlockFrequencyInfoImpl<BT>::computeMassInFunction() { … }

template <class BT>
bool BlockFrequencyInfoImpl<BT>::needIterativeInference() const { … }

template <class BT> void BlockFrequencyInfoImpl<BT>::applyIterativeInference() { … }

template <class BT>
void BlockFrequencyInfoImpl<BT>::iterativeInference(
    const ProbMatrixType &ProbMatrix, std::vector<Scaled64> &Freq) const { … }

template <class BT>
void BlockFrequencyInfoImpl<BT>::findReachableBlocks(
    std::vector<const BlockT *> &Blocks) const { … }

template <class BT>
void BlockFrequencyInfoImpl<BT>::initTransitionProbabilities(
    const std::vector<const BlockT *> &Blocks,
    const DenseMap<const BlockT *, size_t> &BlockIndex,
    ProbMatrixType &ProbMatrix) const { … }

#ifndef NDEBUG
template <class BT>
BlockFrequencyInfoImplBase::Scaled64 BlockFrequencyInfoImpl<BT>::discrepancy(
    const ProbMatrixType &ProbMatrix, const std::vector<Scaled64> &Freq) const {
  assert(Freq[0] > 0 && "Incorrectly computed frequency of the entry block");
  Scaled64 Discrepancy;
  for (size_t I = 0; I < ProbMatrix.size(); I++) {
    Scaled64 Sum;
    for (const auto &Jump : ProbMatrix[I]) {
      Sum += Freq[Jump.first] * Jump.second;
    }
    Discrepancy += Freq[I] >= Sum ? Freq[I] - Sum : Sum - Freq[I];
  }
  // Normalizing by the frequency of the entry block
  return Discrepancy / Freq[0];
}
#endif

/// \note This should be a lambda, but that crashes GCC 4.7.
namespace bfi_detail {

template <class BT> struct BlockEdgesAdder { … };

} // end namespace bfi_detail

template <class BT>
void BlockFrequencyInfoImpl<BT>::computeIrreducibleMass(
    LoopData *OuterLoop, std::list<LoopData>::iterator Insert) { … }

// A helper function that converts a branch probability into weight.
inline uint32_t getWeightFromBranchProb(const BranchProbability Prob) { … }

template <class BT>
bool
BlockFrequencyInfoImpl<BT>::propagateMassToSuccessors(LoopData *OuterLoop,
                                                      const BlockNode &Node) { … }

template <class BT>
raw_ostream &BlockFrequencyInfoImpl<BT>::print(raw_ostream &OS) const { … }

template <class BT>
void BlockFrequencyInfoImpl<BT>::verifyMatch(
    BlockFrequencyInfoImpl<BT> &Other) const { … }

// Graph trait base class for block frequency information graph
// viewer.

enum GVDAGType { … };

template <class BlockFrequencyInfoT, class BranchProbabilityInfoT>
struct BFIDOTGraphTraitsBase : public DefaultDOTGraphTraits { … };

} // end namespace llvm

#undef DEBUG_TYPE

#endif // LLVM_ANALYSIS_BLOCKFREQUENCYINFOIMPL_H
llvm/llvm/include/llvm/Analysis/BlockFrequencyInfoImpl.h