//===- LoopFusion.cpp - Code to perform loop fusion -----------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // // This file implements affine fusion. // //===----------------------------------------------------------------------===// #include "mlir/Dialect/Affine/Passes.h" #include "mlir/Dialect/Affine/Analysis/AffineStructures.h" #include "mlir/Dialect/Affine/Analysis/LoopAnalysis.h" #include "mlir/Dialect/Affine/Analysis/Utils.h" #include "mlir/Dialect/Affine/IR/AffineOps.h" #include "mlir/Dialect/Affine/LoopFusionUtils.h" #include "mlir/Dialect/Affine/LoopUtils.h" #include "mlir/Dialect/Affine/Utils.h" #include "mlir/Dialect/MemRef/IR/MemRef.h" #include "mlir/IR/AffineExpr.h" #include "mlir/IR/AffineMap.h" #include "mlir/IR/Builders.h" #include "mlir/Transforms/Passes.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DenseSet.h" #include "llvm/ADT/STLExtras.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include <iomanip> #include <optional> #include <sstream> namespace mlir { namespace affine { #define GEN_PASS_DEF_AFFINELOOPFUSION #include "mlir/Dialect/Affine/Passes.h.inc" } // namespace affine } // namespace mlir #define DEBUG_TYPE … usingnamespacemlir; usingnamespacemlir::affine; namespace { /// Loop fusion pass. This pass currently supports a greedy fusion policy, /// which fuses loop nests with single-writer/single-reader memref dependences /// with the goal of improving locality. // TODO: Support fusion of source loop nests which write to multiple // memrefs, where each memref can have multiple users (if profitable). struct LoopFusion : public affine::impl::AffineLoopFusionBase<LoopFusion> { … }; } // namespace /// Returns true if node 'srcId' can be removed after fusing it with node /// 'dstId'. The node can be removed if any of the following conditions are met: /// 1. 'srcId' has no output dependences after fusion and no escaping memrefs. /// 2. 'srcId' has no output dependences after fusion, has escaping memrefs /// and the fusion slice is maximal. /// 3. 'srcId' has output dependences after fusion, the fusion slice is /// maximal and the fusion insertion point dominates all the dependences. static bool canRemoveSrcNodeAfterFusion( unsigned srcId, unsigned dstId, const ComputationSliceState &fusionSlice, Operation *fusedLoopInsPoint, const DenseSet<Value> &escapingMemRefs, MemRefDependenceGraph *mdg) { … } /// Returns in 'srcIdCandidates' the producer fusion candidates for consumer /// 'dstId'. Candidates are sorted by node id order. This order corresponds to /// the program order when the 'mdg' is created. However, program order is not /// guaranteed and must not be required by the client. Program order won't be /// held if the 'mdg' is reused from a previous fusion step or if the node /// creation order changes in the future to support more advance cases. // TODO: Move this to a loop fusion utility once 'mdg' is also moved. static void getProducerCandidates(unsigned dstId, MemRefDependenceGraph *mdg, SmallVectorImpl<unsigned> &srcIdCandidates) { … } /// Returns in 'producerConsumerMemrefs' the memrefs involved in a /// producer-consumer dependence between 'srcId' and 'dstId'. static void gatherProducerConsumerMemrefs(unsigned srcId, unsigned dstId, MemRefDependenceGraph *mdg, DenseSet<Value> &producerConsumerMemrefs) { … } /// A memref escapes in the context of the fusion pass if either: /// 1. it (or its alias) is a block argument, or /// 2. created by an op not known to guarantee alias freedom, /// 3. it (or its alias) are used by ops other than affine dereferencing ops /// (e.g., by call op, memref load/store ops, alias creating ops, unknown ops, /// terminator ops, etc.); such ops do not deference the memref in an affine /// way. static bool isEscapingMemref(Value memref, Block *block) { … } /// Returns in 'escapingMemRefs' the memrefs from affine store ops in node 'id' /// that escape the block or are accessed in a non-affine way. static void gatherEscapingMemrefs(unsigned id, MemRefDependenceGraph *mdg, DenseSet<Value> &escapingMemRefs) { … } // Sinks all sequential loops to the innermost levels (while preserving // relative order among them) and moves all parallel loops to the // outermost (while again preserving relative order among them). // This can increase the loop depth at which we can fuse a slice, since we are // pushing loop carried dependence to a greater depth in the loop nest. static void sinkSequentialLoops(MemRefDependenceGraph::Node *node) { … } // Creates and returns a private (single-user) memref for fused loop rooted // at 'forOp', with (potentially reduced) memref size based on the // MemRefRegion written to by 'srcStoreOpInst' at depth 'dstLoopDepth'. // TODO: consider refactoring the common code from generateDma and // this one. static Value createPrivateMemRef(AffineForOp forOp, Operation *srcStoreOpInst, unsigned dstLoopDepth, std::optional<unsigned> fastMemorySpace, uint64_t localBufSizeThreshold) { … } /// Walking from node 'srcId' to node 'dstId' (exclusive of 'srcId' and /// 'dstId'), if there is any non-affine operation accessing 'memref', return /// true. Otherwise, return false. static bool hasNonAffineUsersOnThePath(unsigned srcId, unsigned dstId, Value memref, MemRefDependenceGraph *mdg) { … } /// Check whether a memref value in node 'srcId' has a non-affine that /// is between node 'srcId' and node 'dstId' (exclusive of 'srcNode' and /// 'dstNode'). static bool hasNonAffineUsersOnThePath(unsigned srcId, unsigned dstId, MemRefDependenceGraph *mdg) { … } // Checks the profitability of fusing a backwards slice of the loop nest // surrounding 'srcOpInst' into the loop nest surrounding 'dstLoadOpInsts'. // The argument 'srcStoreOpInst' is used to calculate the storage reduction on // the memref being produced and consumed, which is an input to the cost model. // For producer-consumer fusion, 'srcStoreOpInst' will be the same as // 'srcOpInst', as we are slicing w.r.t to that producer. For input-reuse // fusion, 'srcOpInst' will be the src loop nest LoadOp which reads from the // same memref as dst loop nest load ops, and 'srcStoreOpInst' will be the // unique store op in the src node, which will be used to check that the write // region is the same after input-reuse fusion. Computation slices are provided // in 'depthSliceUnions' for each legal fusion depth. The maximal depth at which // fusion is legal is provided in 'maxLegalFusionDepth'. Returns true if it is // profitable to fuse the candidate loop nests. Returns false otherwise. // `dstLoopDepth` is set to the most profitable depth at which to materialize // the source loop nest slice. // The profitability model executes the following steps: // *) Computes the backward computation slice at 'srcOpInst'. This // computation slice of the loop nest surrounding 'srcOpInst' is // represented by modified src loop bounds in 'sliceState', which are // functions of loop IVs in the loop nest surrounding 'srcOpInst'. // *) Computes the cost of unfused src/dst loop nests (currently the cost of a // loop nest is the total number of dynamic operation instances in the loop // nest). // *) Computes the cost of fusing a slice of the src loop nest into the dst // loop nest at various values of dst loop depth, attempting to fuse // the largest computation slice at the maximal dst loop depth (closest to // the load) to minimize reuse distance and potentially enable subsequent // load/store forwarding. // NOTE: 'dstLoopDepth' refers to the loop depth within the destination loop // nest, at which the src computation slice is inserted/fused. // NOTE: We attempt to maximize the dst loop depth, but there are cases // where a particular setting for 'dstLoopNest' might fuse an unsliced // loop (within the src computation slice) at a depth which results in // excessive recomputation (see unit tests for examples). // *) Compares the total cost of the unfused loop nests to the min cost fused // loop nest computed in the previous step, and returns true if the latter // is lower. // TODO: Extend profitability analysis to support scenarios with multiple // stores. static bool isFusionProfitable(Operation *srcOpInst, Operation *srcStoreOpInst, AffineForOp dstForOp, ArrayRef<ComputationSliceState> depthSliceUnions, unsigned maxLegalFusionDepth, unsigned *dstLoopDepth, double computeToleranceThreshold) { … } namespace { // GreedyFusion greedily fuses loop nests which have a producer/consumer or // input-reuse relationship on a memref, with the goal of improving locality. // // The steps of the producer-consumer fusion algorithm are as follows: // // *) A worklist is initialized with node ids from the dependence graph. // *) For each node id in the worklist: // *) Pop an AffineForOp of the worklist. This 'dstAffineForOp' will be a // candidate destination AffineForOp into which fusion will be attempted. // *) Add each LoadOp currently in 'dstAffineForOp' into list 'dstLoadOps'. // *) For each LoadOp in 'dstLoadOps' do: // *) Look up dependent loop nests which have a single store op to the same // memref. // *) Check if dependences would be violated by the fusion. // *) Get a computation slice of 'srcLoopNest', which adjusts its loop // bounds to be functions of 'dstLoopNest' IVs and symbols. // *) Fuse the 'srcLoopNest' computation slice into the 'dstLoopNest', // at a loop depth determined by the cost model in 'isFusionProfitable'. // *) Add the newly fused load/store operations to the state, // and also add newly fused load ops to 'dstLoopOps' to be considered // as fusion dst load ops in another iteration. // *) Remove old src loop nest and its associated state. // // The steps of the input-reuse fusion algorithm are as follows: // // *) Initialize 'worklist' with node ids from the dependence graph. // *) For each 'dstNode' in the worklist: // *) Find a candidate sibling node 'sibNode' to fuse with 'dstNode' which // loads from the same memref, but which has no dependence paths to/from. // *) Get a computation slice of 'sibLoopNest', which adjusts its loop // bounds to be functions of 'dstLoopNest' IVs and symbols. // *) Fuse the 'sibLoopNest' computation slice into the 'dstLoopNest', // at a loop depth determined by the cost model in 'isFusionProfitable'. // This function also checks that the memref write region of 'sibLoopNest', // is preserved in the fused loop nest. // *) Update graph state to reflect the fusion of 'sibNode' into 'dstNode'. // // Given a graph where top-level operations are vertices in the set 'V' and // edges in the set 'E' are dependences between vertices, this algorithm // takes O(V) time for initialization, and has runtime O(V + E). // // This greedy algorithm is not 'maximal' due to the current restriction of // fusing along single producer consumer edges, but there is a TODO: to fix // this. // // TODO: Experiment with other fusion policies. struct GreedyFusion { … }; } // namespace /// Run fusion on `block`. void LoopFusion::runOnBlock(Block *block) { … } void LoopFusion::runOnOperation() { … } std::unique_ptr<Pass> mlir::affine::createLoopFusionPass( unsigned fastMemorySpace, uint64_t localBufSizeThreshold, bool maximalFusion, enum FusionMode affineFusionMode) { … }