//===- MachineBlockPlacement.cpp - Basic Block Code Layout optimization ---===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // // This file implements basic block placement transformations using the CFG // structure and branch probability estimates. // // The pass strives to preserve the structure of the CFG (that is, retain // a topological ordering of basic blocks) in the absence of a *strong* signal // to the contrary from probabilities. However, within the CFG structure, it // attempts to choose an ordering which favors placing more likely sequences of // blocks adjacent to each other. // // The algorithm works from the inner-most loop within a function outward, and // at each stage walks through the basic blocks, trying to coalesce them into // sequential chains where allowed by the CFG (or demanded by heavy // probabilities). Finally, it walks the blocks in topological order, and the // first time it reaches a chain of basic blocks, it schedules them in the // function in-order. // //===----------------------------------------------------------------------===// #include "BranchFolding.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/BlockFrequencyInfoImpl.h" #include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/CodeGen/MBFIWrapper.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineBlockFrequencyInfo.h" #include "llvm/CodeGen/MachineBranchProbabilityInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/MachinePostDominators.h" #include "llvm/CodeGen/MachineSizeOpts.h" #include "llvm/CodeGen/TailDuplicator.h" #include "llvm/CodeGen/TargetInstrInfo.h" #include "llvm/CodeGen/TargetLowering.h" #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/IR/DebugLoc.h" #include "llvm/IR/Function.h" #include "llvm/IR/PrintPasses.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Allocator.h" #include "llvm/Support/BlockFrequency.h" #include "llvm/Support/BranchProbability.h" #include "llvm/Support/CodeGen.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Transforms/Utils/CodeLayout.h" #include <algorithm> #include <cassert> #include <cstdint> #include <iterator> #include <memory> #include <string> #include <tuple> #include <utility> #include <vector> usingnamespacellvm; #define DEBUG_TYPE … STATISTIC(NumCondBranches, "Number of conditional branches"); STATISTIC(NumUncondBranches, "Number of unconditional branches"); STATISTIC(CondBranchTakenFreq, "Potential frequency of taking conditional branches"); STATISTIC(UncondBranchTakenFreq, "Potential frequency of taking unconditional branches"); static cl::opt<unsigned> AlignAllBlock( "align-all-blocks", cl::desc("Force the alignment of all blocks in the function in log2 format " "(e.g 4 means align on 16B boundaries)."), cl::init(0), cl::Hidden); static cl::opt<unsigned> AlignAllNonFallThruBlocks( "align-all-nofallthru-blocks", cl::desc("Force the alignment of all blocks that have no fall-through " "predecessors (i.e. don't add nops that are executed). In log2 " "format (e.g 4 means align on 16B boundaries)."), cl::init(0), cl::Hidden); static cl::opt<unsigned> MaxBytesForAlignmentOverride( "max-bytes-for-alignment", cl::desc("Forces the maximum bytes allowed to be emitted when padding for " "alignment"), cl::init(0), cl::Hidden); // FIXME: Find a good default for this flag and remove the flag. static cl::opt<unsigned> ExitBlockBias( "block-placement-exit-block-bias", cl::desc("Block frequency percentage a loop exit block needs " "over the original exit to be considered the new exit."), cl::init(0), cl::Hidden); // Definition: // - Outlining: placement of a basic block outside the chain or hot path. static cl::opt<unsigned> LoopToColdBlockRatio( "loop-to-cold-block-ratio", cl::desc("Outline loop blocks from loop chain if (frequency of loop) / " "(frequency of block) is greater than this ratio"), cl::init(5), cl::Hidden); static cl::opt<bool> ForceLoopColdBlock("force-loop-cold-block", cl::desc("Force outlining cold blocks from loops."), cl::init(false), cl::Hidden); static cl::opt<bool> PreciseRotationCost("precise-rotation-cost", cl::desc("Model the cost of loop rotation more " "precisely by using profile data."), cl::init(false), cl::Hidden); static cl::opt<bool> ForcePreciseRotationCost("force-precise-rotation-cost", cl::desc("Force the use of precise cost " "loop rotation strategy."), cl::init(false), cl::Hidden); static cl::opt<unsigned> MisfetchCost( "misfetch-cost", cl::desc("Cost that models the probabilistic risk of an instruction " "misfetch due to a jump comparing to falling through, whose cost " "is zero."), cl::init(1), cl::Hidden); static cl::opt<unsigned> JumpInstCost("jump-inst-cost", cl::desc("Cost of jump instructions."), cl::init(1), cl::Hidden); static cl::opt<bool> TailDupPlacement("tail-dup-placement", cl::desc("Perform tail duplication during placement. " "Creates more fallthrough opportunites in " "outline branches."), cl::init(true), cl::Hidden); static cl::opt<bool> BranchFoldPlacement("branch-fold-placement", cl::desc("Perform branch folding during placement. " "Reduces code size."), cl::init(true), cl::Hidden); // Heuristic for tail duplication. static cl::opt<unsigned> TailDupPlacementThreshold( "tail-dup-placement-threshold", cl::desc("Instruction cutoff for tail duplication during layout. " "Tail merging during layout is forced to have a threshold " "that won't conflict."), cl::init(2), cl::Hidden); // Heuristic for aggressive tail duplication. static cl::opt<unsigned> TailDupPlacementAggressiveThreshold( "tail-dup-placement-aggressive-threshold", cl::desc("Instruction cutoff for aggressive tail duplication during " "layout. Used at -O3. Tail merging during layout is forced to " "have a threshold that won't conflict."), cl::init(4), cl::Hidden); // Heuristic for tail duplication. static cl::opt<unsigned> TailDupPlacementPenalty( "tail-dup-placement-penalty", cl::desc( "Cost penalty for blocks that can avoid breaking CFG by copying. " "Copying can increase fallthrough, but it also increases icache " "pressure. This parameter controls the penalty to account for that. " "Percent as integer."), cl::init(2), cl::Hidden); // Heuristic for tail duplication if profile count is used in cost model. static cl::opt<unsigned> TailDupProfilePercentThreshold( "tail-dup-profile-percent-threshold", cl::desc("If profile count information is used in tail duplication cost " "model, the gained fall through number from tail duplication " "should be at least this percent of hot count."), cl::init(50), cl::Hidden); // Heuristic for triangle chains. static cl::opt<unsigned> TriangleChainCount( "triangle-chain-count", cl::desc("Number of triangle-shaped-CFG's that need to be in a row for the " "triangle tail duplication heuristic to kick in. 0 to disable."), cl::init(2), cl::Hidden); // Use case: When block layout is visualized after MBP pass, the basic blocks // are labeled in layout order; meanwhile blocks could be numbered in a // different order. It's hard to map between the graph and pass output. // With this option on, the basic blocks are renumbered in function layout // order. For debugging only. static cl::opt<bool> RenumberBlocksBeforeView( "renumber-blocks-before-view", cl::desc( "If true, basic blocks are re-numbered before MBP layout is printed " "into a dot graph. Only used when a function is being printed."), cl::init(false), cl::Hidden); static cl::opt<unsigned> ExtTspBlockPlacementMaxBlocks( "ext-tsp-block-placement-max-blocks", cl::desc("Maximum number of basic blocks in a function to run ext-TSP " "block placement."), cl::init(UINT_MAX), cl::Hidden); // Apply the ext-tsp algorithm minimizing the size of a binary. static cl::opt<bool> ApplyExtTspForSize("apply-ext-tsp-for-size", cl::init(false), cl::Hidden, cl::desc("Use ext-tsp for size-aware block placement.")); namespace llvm { extern cl::opt<bool> EnableExtTspBlockPlacement; extern cl::opt<bool> ApplyExtTspWithoutProfile; extern cl::opt<unsigned> StaticLikelyProb; extern cl::opt<unsigned> ProfileLikelyProb; // Internal option used to control BFI display only after MBP pass. // Defined in CodeGen/MachineBlockFrequencyInfo.cpp: // -view-block-layout-with-bfi= extern cl::opt<GVDAGType> ViewBlockLayoutWithBFI; // Command line option to specify the name of the function for CFG dump // Defined in Analysis/BlockFrequencyInfo.cpp: -view-bfi-func-name= extern cl::opt<std::string> ViewBlockFreqFuncName; } // namespace llvm namespace { class BlockChain; /// Type for our function-wide basic block -> block chain mapping. BlockToChainMapType; /// A chain of blocks which will be laid out contiguously. /// /// This is the datastructure representing a chain of consecutive blocks that /// are profitable to layout together in order to maximize fallthrough /// probabilities and code locality. We also can use a block chain to represent /// a sequence of basic blocks which have some external (correctness) /// requirement for sequential layout. /// /// Chains can be built around a single basic block and can be merged to grow /// them. They participate in a block-to-chain mapping, which is updated /// automatically as chains are merged together. class BlockChain { … }; class MachineBlockPlacement : public MachineFunctionPass { … }; } // end anonymous namespace char MachineBlockPlacement::ID = …; char &llvm::MachineBlockPlacementID = …; INITIALIZE_PASS_BEGIN(MachineBlockPlacement, DEBUG_TYPE, "Branch Probability Basic Block Placement", false, false) INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(MachineBlockFrequencyInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(MachineLoopInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) INITIALIZE_PASS_END(MachineBlockPlacement, DEBUG_TYPE, "Branch Probability Basic Block Placement", false, false) #ifndef NDEBUG /// Helper to print the name of a MBB. /// /// Only used by debug logging. static std::string getBlockName(const MachineBasicBlock *BB) { std::string Result; raw_string_ostream OS(Result); OS << printMBBReference(*BB); OS << " ('" << BB->getName() << "')"; OS.flush(); return Result; } #endif /// Mark a chain's successors as having one fewer preds. /// /// When a chain is being merged into the "placed" chain, this routine will /// quickly walk the successors of each block in the chain and mark them as /// having one fewer active predecessor. It also adds any successors of this /// chain which reach the zero-predecessor state to the appropriate worklist. void MachineBlockPlacement::markChainSuccessors( const BlockChain &Chain, const MachineBasicBlock *LoopHeaderBB, const BlockFilterSet *BlockFilter) { … } /// Mark a single block's successors as having one fewer preds. /// /// Under normal circumstances, this is only called by markChainSuccessors, /// but if a block that was to be placed is completely tail-duplicated away, /// and was duplicated into the chain end, we need to redo markBlockSuccessors /// for just that block. void MachineBlockPlacement::markBlockSuccessors( const BlockChain &Chain, const MachineBasicBlock *MBB, const MachineBasicBlock *LoopHeaderBB, const BlockFilterSet *BlockFilter) { … } /// This helper function collects the set of successors of block /// \p BB that are allowed to be its layout successors, and return /// the total branch probability of edges from \p BB to those /// blocks. BranchProbability MachineBlockPlacement::collectViableSuccessors( const MachineBasicBlock *BB, const BlockChain &Chain, const BlockFilterSet *BlockFilter, SmallVector<MachineBasicBlock *, 4> &Successors) { … } /// The helper function returns the branch probability that is adjusted /// or normalized over the new total \p AdjustedSumProb. static BranchProbability getAdjustedProbability(BranchProbability OrigProb, BranchProbability AdjustedSumProb) { … } /// Check if \p BB has exactly the successors in \p Successors. static bool hasSameSuccessors(MachineBasicBlock &BB, SmallPtrSetImpl<const MachineBasicBlock *> &Successors) { … } /// Check if a block should be tail duplicated to increase fallthrough /// opportunities. /// \p BB Block to check. bool MachineBlockPlacement::shouldTailDuplicate(MachineBasicBlock *BB) { … } /// Compare 2 BlockFrequency's with a small penalty for \p A. /// In order to be conservative, we apply a X% penalty to account for /// increased icache pressure and static heuristics. For small frequencies /// we use only the numerators to improve accuracy. For simplicity, we assume /// the penalty is less than 100% /// TODO(iteratee): Use 64-bit fixed point edge frequencies everywhere. static bool greaterWithBias(BlockFrequency A, BlockFrequency B, BlockFrequency EntryFreq) { … } /// Check the edge frequencies to see if tail duplication will increase /// fallthroughs. It only makes sense to call this function when /// \p Succ would not be chosen otherwise. Tail duplication of \p Succ is /// always locally profitable if we would have picked \p Succ without /// considering duplication. bool MachineBlockPlacement::isProfitableToTailDup( const MachineBasicBlock *BB, const MachineBasicBlock *Succ, BranchProbability QProb, const BlockChain &Chain, const BlockFilterSet *BlockFilter) { … } /// Check for a trellis layout. \p BB is the upper part of a trellis if its /// successors form the lower part of a trellis. A successor set S forms the /// lower part of a trellis if all of the predecessors of S are either in S or /// have all of S as successors. We ignore trellises where BB doesn't have 2 /// successors because for fewer than 2, it's trivial, and for 3 or greater they /// are very uncommon and complex to compute optimally. Allowing edges within S /// is not strictly a trellis, but the same algorithm works, so we allow it. bool MachineBlockPlacement::isTrellis( const MachineBasicBlock *BB, const SmallVectorImpl<MachineBasicBlock *> &ViableSuccs, const BlockChain &Chain, const BlockFilterSet *BlockFilter) { … } /// Pick the highest total weight pair of edges that can both be laid out. /// The edges in \p Edges[0] are assumed to have a different destination than /// the edges in \p Edges[1]. Simple counting shows that the best pair is either /// the individual highest weight edges to the 2 different destinations, or in /// case of a conflict, one of them should be replaced with a 2nd best edge. std::pair<MachineBlockPlacement::WeightedEdge, MachineBlockPlacement::WeightedEdge> MachineBlockPlacement::getBestNonConflictingEdges( const MachineBasicBlock *BB, MutableArrayRef<SmallVector<MachineBlockPlacement::WeightedEdge, 8>> Edges) { … } /// Get the best successor from \p BB based on \p BB being part of a trellis. /// We only handle trellises with 2 successors, so the algorithm is /// straightforward: Find the best pair of edges that don't conflict. We find /// the best incoming edge for each successor in the trellis. If those conflict, /// we consider which of them should be replaced with the second best. /// Upon return the two best edges will be in \p BestEdges. If one of the edges /// comes from \p BB, it will be in \p BestEdges[0] MachineBlockPlacement::BlockAndTailDupResult MachineBlockPlacement::getBestTrellisSuccessor( const MachineBasicBlock *BB, const SmallVectorImpl<MachineBasicBlock *> &ViableSuccs, BranchProbability AdjustedSumProb, const BlockChain &Chain, const BlockFilterSet *BlockFilter) { … } /// When the option allowTailDupPlacement() is on, this method checks if the /// fallthrough candidate block \p Succ (of block \p BB) can be tail-duplicated /// into all of its unplaced, unfiltered predecessors, that are not BB. bool MachineBlockPlacement::canTailDuplicateUnplacedPreds( const MachineBasicBlock *BB, MachineBasicBlock *Succ, const BlockChain &Chain, const BlockFilterSet *BlockFilter) { … } /// Find chains of triangles where we believe it would be profitable to /// tail-duplicate them all, but a local analysis would not find them. /// There are 3 ways this can be profitable: /// 1) The post-dominators marked 50% are actually taken 55% (This shrinks with /// longer chains) /// 2) The chains are statically correlated. Branch probabilities have a very /// U-shaped distribution. /// [http://nrs.harvard.edu/urn-3:HUL.InstRepos:24015805] /// If the branches in a chain are likely to be from the same side of the /// distribution as their predecessor, but are independent at runtime, this /// transformation is profitable. (Because the cost of being wrong is a small /// fixed cost, unlike the standard triangle layout where the cost of being /// wrong scales with the # of triangles.) /// 3) The chains are dynamically correlated. If the probability that a previous /// branch was taken positively influences whether the next branch will be /// taken /// We believe that 2 and 3 are common enough to justify the small margin in 1. void MachineBlockPlacement::precomputeTriangleChains() { … } // When profile is not present, return the StaticLikelyProb. // When profile is available, we need to handle the triangle-shape CFG. static BranchProbability getLayoutSuccessorProbThreshold(const MachineBasicBlock *BB) { … } /// Checks to see if the layout candidate block \p Succ has a better layout /// predecessor than \c BB. If yes, returns true. /// \p SuccProb: The probability adjusted for only remaining blocks. /// Only used for logging /// \p RealSuccProb: The un-adjusted probability. /// \p Chain: The chain that BB belongs to and Succ is being considered for. /// \p BlockFilter: if non-null, the set of blocks that make up the loop being /// considered bool MachineBlockPlacement::hasBetterLayoutPredecessor( const MachineBasicBlock *BB, const MachineBasicBlock *Succ, const BlockChain &SuccChain, BranchProbability SuccProb, BranchProbability RealSuccProb, const BlockChain &Chain, const BlockFilterSet *BlockFilter) { … } /// Select the best successor for a block. /// /// This looks across all successors of a particular block and attempts to /// select the "best" one to be the layout successor. It only considers direct /// successors which also pass the block filter. It will attempt to avoid /// breaking CFG structure, but cave and break such structures in the case of /// very hot successor edges. /// /// \returns The best successor block found, or null if none are viable, along /// with a boolean indicating if tail duplication is necessary. MachineBlockPlacement::BlockAndTailDupResult MachineBlockPlacement::selectBestSuccessor(const MachineBasicBlock *BB, const BlockChain &Chain, const BlockFilterSet *BlockFilter) { … } /// Select the best block from a worklist. /// /// This looks through the provided worklist as a list of candidate basic /// blocks and select the most profitable one to place. The definition of /// profitable only really makes sense in the context of a loop. This returns /// the most frequently visited block in the worklist, which in the case of /// a loop, is the one most desirable to be physically close to the rest of the /// loop body in order to improve i-cache behavior. /// /// \returns The best block found, or null if none are viable. MachineBasicBlock *MachineBlockPlacement::selectBestCandidateBlock( const BlockChain &Chain, SmallVectorImpl<MachineBasicBlock *> &WorkList) { … } /// Retrieve the first unplaced basic block in the entire function. /// /// This routine is called when we are unable to use the CFG to walk through /// all of the basic blocks and form a chain due to unnatural loops in the CFG. /// We walk through the function's blocks in order, starting from the /// LastUnplacedBlockIt. We update this iterator on each call to avoid /// re-scanning the entire sequence on repeated calls to this routine. MachineBasicBlock *MachineBlockPlacement::getFirstUnplacedBlock( const BlockChain &PlacedChain, MachineFunction::iterator &PrevUnplacedBlockIt) { … } /// Retrieve the first unplaced basic block among the blocks in BlockFilter. /// /// This is similar to getFirstUnplacedBlock for the entire function, but since /// the size of BlockFilter is typically far less than the number of blocks in /// the entire function, iterating through the BlockFilter is more efficient. /// When processing the entire funciton, using the version without BlockFilter /// has a complexity of #(loops in function) * #(blocks in function), while this /// version has a complexity of sum(#(loops in block) foreach block in function) /// which is always smaller. For long function mostly sequential in structure, /// the complexity is amortized to 1 * #(blocks in function). MachineBasicBlock *MachineBlockPlacement::getFirstUnplacedBlock( const BlockChain &PlacedChain, BlockFilterSet::iterator &PrevUnplacedBlockInFilterIt, const BlockFilterSet *BlockFilter) { … } void MachineBlockPlacement::fillWorkLists( const MachineBasicBlock *MBB, SmallPtrSetImpl<BlockChain *> &UpdatedPreds, const BlockFilterSet *BlockFilter = nullptr) { … } void MachineBlockPlacement::buildChain(const MachineBasicBlock *HeadBB, BlockChain &Chain, BlockFilterSet *BlockFilter) { … } // If bottom of block BB has only one successor OldTop, in most cases it is // profitable to move it before OldTop, except the following case: // // -->OldTop<- // | . | // | . | // | . | // ---Pred | // | | // BB----- // // If BB is moved before OldTop, Pred needs a taken branch to BB, and it can't // layout the other successor below it, so it can't reduce taken branch. // In this case we keep its original layout. bool MachineBlockPlacement::canMoveBottomBlockToTop( const MachineBasicBlock *BottomBlock, const MachineBasicBlock *OldTop) { … } // Find out the possible fall through frequence to the top of a loop. BlockFrequency MachineBlockPlacement::TopFallThroughFreq(const MachineBasicBlock *Top, const BlockFilterSet &LoopBlockSet) { … } // Compute the fall through gains when move NewTop before OldTop. // // In following diagram, edges marked as "-" are reduced fallthrough, edges // marked as "+" are increased fallthrough, this function computes // // SUM(increased fallthrough) - SUM(decreased fallthrough) // // | // | - // V // --->OldTop // | . // | . // +| . + // | Pred ---> // | |- // | V // --- NewTop <--- // |- // V // BlockFrequency MachineBlockPlacement::FallThroughGains( const MachineBasicBlock *NewTop, const MachineBasicBlock *OldTop, const MachineBasicBlock *ExitBB, const BlockFilterSet &LoopBlockSet) { … } /// Helper function of findBestLoopTop. Find the best loop top block /// from predecessors of old top. /// /// Look for a block which is strictly better than the old top for laying /// out before the old top of the loop. This looks for only two patterns: /// /// 1. a block has only one successor, the old loop top /// /// Because such a block will always result in an unconditional jump, /// rotating it in front of the old top is always profitable. /// /// 2. a block has two successors, one is old top, another is exit /// and it has more than one predecessors /// /// If it is below one of its predecessors P, only P can fall through to /// it, all other predecessors need a jump to it, and another conditional /// jump to loop header. If it is moved before loop header, all its /// predecessors jump to it, then fall through to loop header. So all its /// predecessors except P can reduce one taken branch. /// At the same time, move it before old top increases the taken branch /// to loop exit block, so the reduced taken branch will be compared with /// the increased taken branch to the loop exit block. MachineBasicBlock *MachineBlockPlacement::findBestLoopTopHelper( MachineBasicBlock *OldTop, const MachineLoop &L, const BlockFilterSet &LoopBlockSet) { … } /// Find the best loop top block for layout. /// /// This function iteratively calls findBestLoopTopHelper, until no new better /// BB can be found. MachineBasicBlock * MachineBlockPlacement::findBestLoopTop(const MachineLoop &L, const BlockFilterSet &LoopBlockSet) { … } /// Find the best loop exiting block for layout. /// /// This routine implements the logic to analyze the loop looking for the best /// block to layout at the top of the loop. Typically this is done to maximize /// fallthrough opportunities. MachineBasicBlock * MachineBlockPlacement::findBestLoopExit(const MachineLoop &L, const BlockFilterSet &LoopBlockSet, BlockFrequency &ExitFreq) { … } /// Check if there is a fallthrough to loop header Top. /// /// 1. Look for a Pred that can be layout before Top. /// 2. Check if Top is the most possible successor of Pred. bool MachineBlockPlacement::hasViableTopFallthrough( const MachineBasicBlock *Top, const BlockFilterSet &LoopBlockSet) { … } /// Attempt to rotate an exiting block to the bottom of the loop. /// /// Once we have built a chain, try to rotate it to line up the hot exit block /// with fallthrough out of the loop if doing so doesn't introduce unnecessary /// branches. For example, if the loop has fallthrough into its header and out /// of its bottom already, don't rotate it. void MachineBlockPlacement::rotateLoop(BlockChain &LoopChain, const MachineBasicBlock *ExitingBB, BlockFrequency ExitFreq, const BlockFilterSet &LoopBlockSet) { … } /// Attempt to rotate a loop based on profile data to reduce branch cost. /// /// With profile data, we can determine the cost in terms of missed fall through /// opportunities when rotating a loop chain and select the best rotation. /// Basically, there are three kinds of cost to consider for each rotation: /// 1. The possibly missed fall through edge (if it exists) from BB out of /// the loop to the loop header. /// 2. The possibly missed fall through edges (if they exist) from the loop /// exits to BB out of the loop. /// 3. The missed fall through edge (if it exists) from the last BB to the /// first BB in the loop chain. /// Therefore, the cost for a given rotation is the sum of costs listed above. /// We select the best rotation with the smallest cost. void MachineBlockPlacement::rotateLoopWithProfile( BlockChain &LoopChain, const MachineLoop &L, const BlockFilterSet &LoopBlockSet) { … } /// Collect blocks in the given loop that are to be placed. /// /// When profile data is available, exclude cold blocks from the returned set; /// otherwise, collect all blocks in the loop. MachineBlockPlacement::BlockFilterSet MachineBlockPlacement::collectLoopBlockSet(const MachineLoop &L) { … } /// Forms basic block chains from the natural loop structures. /// /// These chains are designed to preserve the existing *structure* of the code /// as much as possible. We can then stitch the chains together in a way which /// both preserves the topological structure and minimizes taken conditional /// branches. void MachineBlockPlacement::buildLoopChains(const MachineLoop &L) { … } void MachineBlockPlacement::buildCFGChains() { … } void MachineBlockPlacement::optimizeBranches() { … } void MachineBlockPlacement::alignBlocks() { … } /// Tail duplicate \p BB into (some) predecessors if profitable, repeating if /// it was duplicated into its chain predecessor and removed. /// \p BB - Basic block that may be duplicated. /// /// \p LPred - Chosen layout predecessor of \p BB. /// Updated to be the chain end if LPred is removed. /// \p Chain - Chain to which \p LPred belongs, and \p BB will belong. /// \p BlockFilter - Set of blocks that belong to the loop being laid out. /// Used to identify which blocks to update predecessor /// counts. /// \p PrevUnplacedBlockIt - Iterator pointing to the last block that was /// chosen in the given order due to unnatural CFG /// only needed if \p BB is removed and /// \p PrevUnplacedBlockIt pointed to \p BB. /// @return true if \p BB was removed. bool MachineBlockPlacement::repeatedlyTailDuplicateBlock( MachineBasicBlock *BB, MachineBasicBlock *&LPred, const MachineBasicBlock *LoopHeaderBB, BlockChain &Chain, BlockFilterSet *BlockFilter, MachineFunction::iterator &PrevUnplacedBlockIt, BlockFilterSet::iterator &PrevUnplacedBlockInFilterIt) { … } /// Tail duplicate \p BB into (some) predecessors if profitable. /// \p BB - Basic block that may be duplicated /// \p LPred - Chosen layout predecessor of \p BB /// \p Chain - Chain to which \p LPred belongs, and \p BB will belong. /// \p BlockFilter - Set of blocks that belong to the loop being laid out. /// Used to identify which blocks to update predecessor /// counts. /// \p PrevUnplacedBlockIt - Iterator pointing to the last block that was /// chosen in the given order due to unnatural CFG /// only needed if \p BB is removed and /// \p PrevUnplacedBlockIt pointed to \p BB. /// \p DuplicatedToLPred - True if the block was duplicated into LPred. /// \return - True if the block was duplicated into all preds and removed. bool MachineBlockPlacement::maybeTailDuplicateBlock( MachineBasicBlock *BB, MachineBasicBlock *LPred, BlockChain &Chain, BlockFilterSet *BlockFilter, MachineFunction::iterator &PrevUnplacedBlockIt, BlockFilterSet::iterator &PrevUnplacedBlockInFilterIt, bool &DuplicatedToLPred) { … } // Count the number of actual machine instructions. static uint64_t countMBBInstruction(MachineBasicBlock *MBB) { … } // The size cost of duplication is the instruction size of the duplicated block. // So we should scale the threshold accordingly. But the instruction size is not // available on all targets, so we use the number of instructions instead. BlockFrequency MachineBlockPlacement::scaleThreshold(MachineBasicBlock *BB) { … } // Returns true if BB is Pred's best successor. bool MachineBlockPlacement::isBestSuccessor(MachineBasicBlock *BB, MachineBasicBlock *Pred, BlockFilterSet *BlockFilter) { … } // Find out the predecessors of BB and BB can be beneficially duplicated into // them. void MachineBlockPlacement::findDuplicateCandidates( SmallVectorImpl<MachineBasicBlock *> &Candidates, MachineBasicBlock *BB, BlockFilterSet *BlockFilter) { … } void MachineBlockPlacement::initTailDupThreshold() { … } bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) { … } void MachineBlockPlacement::applyExtTsp(bool OptForSize) { … } void MachineBlockPlacement::assignBlockOrder( const std::vector<const MachineBasicBlock *> &NewBlockOrder) { … } void MachineBlockPlacement::createCFGChainExtTsp() { … } namespace { /// A pass to compute block placement statistics. /// /// A separate pass to compute interesting statistics for evaluating block /// placement. This is separate from the actual placement pass so that they can /// be computed in the absence of any placement transformations or when using /// alternative placement strategies. class MachineBlockPlacementStats : public MachineFunctionPass { … }; } // end anonymous namespace char MachineBlockPlacementStats::ID = …; char &llvm::MachineBlockPlacementStatsID = …; INITIALIZE_PASS_BEGIN(MachineBlockPlacementStats, "block-placement-stats", "Basic Block Placement Stats", false, false) INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(MachineBlockFrequencyInfoWrapperPass) INITIALIZE_PASS_END(MachineBlockPlacementStats, "block-placement-stats", "Basic Block Placement Stats", false, false) bool MachineBlockPlacementStats::runOnMachineFunction(MachineFunction &F) { … }