//===- CallGraphSort.cpp --------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// /// The file is responsible for sorting sections using LLVM call graph profile /// data by placing frequently executed code sections together. The goal of the /// placement is to improve the runtime performance of the final executable by /// arranging code sections so that i-TLB misses and i-cache misses are reduced. /// /// The algorithm first builds a call graph based on the profile data and then /// iteratively merges "chains" (ordered lists) of input sections which will be /// laid out as a unit. There are two implementations for deciding how to /// merge a pair of chains: /// - a simpler one, referred to as Call-Chain Clustering (C^3), that follows /// "Optimizing Function Placement for Large-Scale Data-Center Applications" /// https://research.fb.com/wp-content/uploads/2017/01/cgo2017-hfsort-final1.pdf /// - a more advanced one, referred to as Cache-Directed-Sort (CDSort), which /// typically produces layouts with higher locality, and hence, yields fewer /// instruction cache misses on large binaries. //===----------------------------------------------------------------------===// #include "CallGraphSort.h" #include "InputFiles.h" #include "InputSection.h" #include "Symbols.h" #include "llvm/Support/FileSystem.h" #include "llvm/Transforms/Utils/CodeLayout.h" #include <numeric> usingnamespacellvm; usingnamespacelld; usingnamespacelld::elf; namespace { struct Edge { … }; struct Cluster { … }; /// Implementation of the Call-Chain Clustering (C^3). The goal of this /// algorithm is to improve runtime performance of the executable by arranging /// code sections such that page table and i-cache misses are minimized. /// /// Definitions: /// * Cluster /// * An ordered list of input sections which are laid out as a unit. At the /// beginning of the algorithm each input section has its own cluster and /// the weight of the cluster is the sum of the weight of all incoming /// edges. /// * Call-Chain Clustering (C³) Heuristic /// * Defines when and how clusters are combined. Pick the highest weighted /// input section then add it to its most likely predecessor if it wouldn't /// penalize it too much. /// * Density /// * The weight of the cluster divided by the size of the cluster. This is a /// proxy for the amount of execution time spent per byte of the cluster. /// /// It does so given a call graph profile by the following: /// * Build a weighted call graph from the call graph profile /// * Sort input sections by weight /// * For each input section starting with the highest weight /// * Find its most likely predecessor cluster /// * Check if the combined cluster would be too large, or would have too low /// a density. /// * If not, then combine the clusters. /// * Sort non-empty clusters by density class CallGraphSort { … }; // Maximum amount the combined cluster density can be worse than the original // cluster to consider merging. constexpr int MAX_DENSITY_DEGRADATION = …; // Maximum cluster size in bytes. constexpr uint64_t MAX_CLUSTER_SIZE = …; } // end anonymous namespace SectionPair; // Take the edge list in Config->CallGraphProfile, resolve symbol names to // Symbols, and generate a graph between InputSections with the provided // weights. CallGraphSort::CallGraphSort() { … } // It's bad to merge clusters which would degrade the density too much. static bool isNewDensityBad(Cluster &a, Cluster &b) { … } // Find the leader of V's belonged cluster (represented as an equivalence // class). We apply union-find path-halving technique (simple to implement) in // the meantime as it decreases depths and the time complexity. static int getLeader(int *leaders, int v) { … } static void mergeClusters(std::vector<Cluster> &cs, Cluster &into, int intoIdx, Cluster &from, int fromIdx) { … } // Group InputSections into clusters using the Call-Chain Clustering heuristic // then sort the clusters by density. DenseMap<const InputSectionBase *, int> CallGraphSort::run() { … } // Sort sections by the profile data using the Cache-Directed Sort algorithm. // The placement is done by optimizing the locality by co-locating frequently // executed code sections together. DenseMap<const InputSectionBase *, int> elf::computeCacheDirectedSortOrder() { … } // Sort sections by the profile data provided by --callgraph-profile-file. // // This first builds a call graph based on the profile data then merges sections // according either to the C³ or Cache-Directed-Sort ordering algorithm. DenseMap<const InputSectionBase *, int> elf::computeCallGraphProfileOrder() { … }