//===- LoopUtils.cpp ---- Misc utilities for loop transformation ----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements miscellaneous loop transformation routines.
//
//===----------------------------------------------------------------------===//

#include "mlir/Dialect/Affine/LoopUtils.h"

#include "mlir/Analysis/SliceAnalysis.h"
#include "mlir/Dialect/Affine/Analysis/LoopAnalysis.h"
#include "mlir/Dialect/Affine/Analysis/Utils.h"
#include "mlir/Dialect/Affine/IR/AffineValueMap.h"
#include "mlir/Dialect/Affine/Utils.h"
#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Dialect/SCF/IR/SCF.h"
#include "mlir/IR/IRMapping.h"
#include "mlir/IR/IntegerSet.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include <optional>

#define DEBUG_TYPE …

using namespace mlir;
using namespace affine;
using namespace presburger;
using llvm::SmallMapVector;

/// Computes the cleanup loop lower bound of the loop being unrolled with
/// the specified unroll factor; this bound will also be the upper bound of the
/// main part of the unrolled loop. Computes the bound as an AffineMap with its
/// operands, or a null map when the trip count can't be expressed as an affine
/// expression.
static void
getCleanupLoopLowerBound(AffineForOp forOp, unsigned unrollFactor,
                         AffineMap &cleanupLbMap,
                         SmallVectorImpl<Value> &cleanupLbOperands) { … }

/// Helper to replace uses of loop carried values (iter_args) and loop
/// yield values while promoting single iteration affine.for ops.
static void replaceIterArgsAndYieldResults(AffineForOp forOp) { … }

/// Promotes the loop body of a forOp to its containing block if the forOp
/// is known to have a single iteration.
LogicalResult mlir::affine::promoteIfSingleIteration(AffineForOp forOp) { … }

/// Generates an affine.for op with the specified lower and upper bounds
/// while generating the right IV remappings to realize shifts for operations
/// in its body. The operations that go into the loop body are specified in
/// `opGroupQueue` starting from the specified offset, and in that order. The
/// first element of each pair specifies the shift applied to that group of
/// operations; the shift is multiplied by the loop step before being applied.
/// Returns nullptr if the generated loop simplifies to a single iteration one.
static AffineForOp generateShiftedLoop(
    AffineMap lbMap, AffineMap ubMap,
    const std::vector<std::pair<uint64_t, ArrayRef<Operation *>>> &opGroupQueue,
    unsigned offset, AffineForOp srcForOp, OpBuilder b) { … }
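// As a minimal sketch of the single-iteration promotion performed by
// promoteIfSingleIteration above (the memref names and shapes are made up for
// exposition, not taken from a test case), a loop such as:
//
// ```mlir
// affine.for %i = 0 to 1 {
//   %v = affine.load %A[%i] : memref<16xf32>
//   affine.store %v, %B[%i] : memref<16xf32>
// }
// ```
//
// has all uses of %i replaced with its (constant) lower bound, its body moved
// into the enclosing block, and the loop erased, leaving roughly:
//
// ```mlir
// %c0 = arith.constant 0 : index
// %v = affine.load %A[%c0] : memref<16xf32>
// affine.store %v, %B[%c0] : memref<16xf32>
// ```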
// The skewing of operations with respect to one another can be used for
// example to allow overlap of asynchronous operations (such as DMA
// communication) with computation, or just relative shifting of operations
// for better register reuse, locality or parallelism. As such, the shifts are
// typically expected to be at most of the order of the number of operations.
// This method should not be used as a substitute for loop
// distribution/fission. This method uses an algorithm that runs in time linear
// in the number of operations in the body of the for loop (using the
// 'sweep line' paradigm). This method asserts the preservation of SSA
// dominance; a check for that, as well as for memory-based dependence
// preservation, rests with the users of this method.
LogicalResult mlir::affine::affineForOpBodySkew(AffineForOp forOp,
                                                ArrayRef<uint64_t> shifts,
                                                bool unrollPrologueEpilogue) { … }

/// Checks whether a loop nest is hyper-rectangular or not.
static LogicalResult
checkIfHyperRectangular(MutableArrayRef<AffineForOp> input) { … }

/// Checks if the input nest is supported for tiling and whether tiling would
/// be legal or not.
template <typename t>
static LogicalResult performPreTilingChecks(MutableArrayRef<AffineForOp> input,
                                            ArrayRef<t> tileSizes) { … }

/// Moves the loop body of AffineForOp 'src' from 'src' into the specified
/// location in the destination's body, ignoring the terminator.
static void moveLoopBodyImpl(AffineForOp src, AffineForOp dest,
                             Block::iterator loc) { … }

/// Moves the loop body of AffineForOp 'src' from 'src' to the start of dest's
/// body.
static void moveLoopBody(AffineForOp src, AffineForOp dest) { … }

/// Constructs a tiled loop nest, without setting the loop bounds, and moves
/// the body of the original loop nest to the tiled loop nest.
static void constructTiledLoopNest(MutableArrayRef<AffineForOp> origLoops,
                                   AffineForOp rootAffineForOp, unsigned width,
                                   MutableArrayRef<AffineForOp> tiledLoops) { … }

/// Sets lower and upper bounds of intra-tile loops for parametric tiling.
// TODO: Handle non-constant lower bounds.
static void setIntraTileBoundsParametric(OpBuilder &b, AffineForOp origLoop,
                                         AffineForOp newInterTileLoop,
                                         AffineForOp newIntraTileLoop,
                                         Value tileSize) { … }

/// Sets lower and upper bounds of inter-tile loops for parametric tiling.
// TODO: Handle non-constant lower bounds.
static void setInterTileBoundsParametric(OpBuilder &b, AffineForOp origLoop,
                                         AffineForOp newLoop,
                                         Value tileSize) { … }

/// Constructs and sets new loop bounds after tiling for the case of
/// hyper-rectangular index sets, where the bounds of one dimension do not
/// depend on other dimensions and tiling parameters are captured from SSA
/// values. Bounds of each dimension can thus be treated independently,
/// and deriving the new bounds is much simpler and faster than for the case of
/// tiling arbitrary polyhedral shapes.
static void constructParametricallyTiledIndexSetHyperRect(
    MutableArrayRef<AffineForOp> origLoops,
    MutableArrayRef<AffineForOp> newLoops, ArrayRef<Value> tileSizes) { … }

/// Constructs and sets new loop bounds after tiling for the case of
/// hyper-rectangular index sets, where the bounds of one dimension do not
/// depend on other dimensions. Bounds of each dimension can thus be treated
/// independently, and deriving the new bounds is much simpler and faster
/// than for the case of tiling arbitrary polyhedral shapes.
static void
constructTiledIndexSetHyperRect(MutableArrayRef<AffineForOp> origLoops,
                                MutableArrayRef<AffineForOp> newLoops,
                                ArrayRef<unsigned> tileSizes) { … }

LogicalResult
mlir::affine::tilePerfectlyNested(MutableArrayRef<AffineForOp> input,
                                  ArrayRef<unsigned> tileSizes,
                                  SmallVectorImpl<AffineForOp> *tiledNest) { … }
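// As a rough sketch of what tilePerfectlyNested produces (the bounds, tile
// sizes, and simplified intra-tile bound maps below are illustrative; the
// exact maps come from constructTiledIndexSetHyperRect and may include a min
// for partial tiles), tiling a 2-d nest with tile sizes {32, 32}:
//
// ```mlir
// affine.for %i = 0 to 256 {
//   affine.for %j = 0 to 512 {
//     ...
//   }
// }
// ```
//
// yields two tile-space loops stepping by the tile sizes and two intra-tile
// loops scanning each tile:
//
// ```mlir
// affine.for %ii = 0 to 256 step 32 {
//   affine.for %jj = 0 to 512 step 32 {
//     affine.for %i = affine_map<(d0) -> (d0)>(%ii)
//         to affine_map<(d0) -> (d0 + 32)>(%ii) {
//       affine.for %j = affine_map<(d0) -> (d0)>(%jj)
//           to affine_map<(d0) -> (d0 + 32)>(%jj) {
//         ...
//       }
//     }
//   }
// }
// ```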
/// Tiles the specified band of perfectly nested loops, creating tile-space
/// loops and intra-tile loops, using SSA values as tiling parameters. A band
/// is a contiguous set of loops.
LogicalResult mlir::affine::tilePerfectlyNestedParametric(
    MutableArrayRef<AffineForOp> input, ArrayRef<Value> tileSizes,
    SmallVectorImpl<AffineForOp> *tiledNest) { … }

/// Gets the perfectly nested sequence of loops starting at the root of the
/// loop nest. A loop is perfectly nested iff the first op in the loop's body
/// is another AffineForOp and the second op is a terminator.
void mlir::affine::getPerfectlyNestedLoops(
    SmallVectorImpl<AffineForOp> &nestedLoops, AffineForOp root) { … }

/// Identifies valid and profitable bands of loops to tile. This is currently
/// just a temporary placeholder to test the mechanics of tiled code
/// generation. Returns all maximal outermost perfect loop nests to tile.
void mlir::affine::getTileableBands(
    func::FuncOp f, std::vector<SmallVector<AffineForOp, 6>> *bands) { … }

/// Unrolls this loop completely.
LogicalResult mlir::affine::loopUnrollFull(AffineForOp forOp) { … }

/// Unrolls this loop by the specified factor or by the trip count (if
/// constant), whichever is lower.
LogicalResult mlir::affine::loopUnrollUpToFactor(AffineForOp forOp,
                                                 uint64_t unrollFactor) { … }

/// Generates unrolled copies of the loop body in 'loopBodyBlock', whose
/// induction variable is 'forOpIV', by 'unrollFactor', calling 'ivRemapFn' to
/// remap 'forOpIV' for each unrolled body copy. If specified, annotates the
/// ops in each unrolled iteration using 'annotateFn'.
static void generateUnrolledLoop(
    Block *loopBodyBlock, Value forOpIV, uint64_t unrollFactor,
    function_ref<Value(unsigned, Value, OpBuilder)> ivRemapFn,
    function_ref<void(unsigned, Operation *, OpBuilder)> annotateFn,
    ValueRange iterArgs, ValueRange yieldedValues) { … }

/// Helper to generate a cleanup loop for unroll or unroll-and-jam when the
/// trip count is not a multiple of `unrollFactor`.
static LogicalResult generateCleanupLoopForUnroll(AffineForOp forOp,
                                                  uint64_t unrollFactor) { … }

/// Unrolls this loop by the specified factor. Returns success if the loop
/// is successfully unrolled.
LogicalResult mlir::affine::loopUnrollByFactor(
    AffineForOp forOp, uint64_t unrollFactor,
    function_ref<void(unsigned, Operation *, OpBuilder)> annotateFn,
    bool cleanUpUnroll) { … }

LogicalResult
mlir::affine::loopUnrollJamUpToFactor(AffineForOp forOp,
                                      uint64_t unrollJamFactor) { … }

/// Checks if all control operands of all loops are defined outside of `forOp`
/// and returns false if not.
static bool areInnerBoundsInvariant(AffineForOp forOp) { … }

/// Unrolls and jams this loop by the specified factor.
LogicalResult mlir::affine::loopUnrollJamByFactor(AffineForOp forOp,
                                                  uint64_t unrollJamFactor) { … }

/// Performs loop interchange on 'forOpA' and 'forOpB', where 'forOpB' is
/// nested within 'forOpA' as the only non-terminator operation in its block.
void mlir::affine::interchangeLoops(AffineForOp forOpA, AffineForOp forOpB) { … }

// Checks each dependence component against the permutation to see if the
// desired loop interchange would violate dependences by making the
// dependence component lexicographically negative.
static bool checkLoopInterchangeDependences(
    const std::vector<SmallVector<DependenceComponent, 2>> &depCompsVec,
    ArrayRef<AffineForOp> loops, ArrayRef<unsigned> loopPermMap) { … }
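// As a minimal sketch of the interchange implemented by interchangeLoops
// above (the memref name and shape are illustrative), a two-deep nest has its
// loop headers swapped while the body is kept intact, so
//
// ```mlir
// affine.for %i = 0 to 128 {
//   affine.for %j = 0 to 64 {
//     %v = affine.load %A[%i, %j] : memref<128x64xf32>
//   }
// }
// ```
//
// becomes
//
// ```mlir
// affine.for %j = 0 to 64 {
//   affine.for %i = 0 to 128 {
//     %v = affine.load %A[%i, %j] : memref<128x64xf32>
//   }
// }
// ```
//
// checkLoopInterchangeDependences above is what determines whether such a
// swap keeps every dependence component lexicographically non-negative.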
/// Checks if the loop interchange permutation 'loopPermMap' of the perfectly
/// nested sequence of loops in 'loops' would violate dependences.
bool mlir::affine::isValidLoopInterchangePermutation(
    ArrayRef<AffineForOp> loops, ArrayRef<unsigned> loopPermMap) { … }

/// Returns true if `loops` is a perfectly nested loop nest, where loops appear
/// in it from outermost to innermost.
bool LLVM_ATTRIBUTE_UNUSED
mlir::affine::isPerfectlyNested(ArrayRef<AffineForOp> loops) { … }

// input[i] should move from position i -> permMap[i]. Returns the position in
// `input` that becomes the new outermost loop.
unsigned mlir::affine::permuteLoops(MutableArrayRef<AffineForOp> input,
                                    ArrayRef<unsigned> permMap) { … }

// Sinks all sequential loops to the innermost levels (while preserving
// relative order among them) and moves all parallel loops to the
// outermost (while again preserving relative order among them).
AffineForOp mlir::affine::sinkSequentialLoops(AffineForOp forOp) { … }

// Factors out common behavior to add a new `iv` (resp. `iv` + `offset`) to the
// lower (resp. upper) loop bound. When called for both the lower and upper
// bounds, the resulting IR resembles:
//
// ```mlir
// affine.for %i = max (`iv`, ...) to min (`iv` + `offset`) {
//   ...
// }
// ```
static void augmentMapAndBounds(OpBuilder &b, Value iv, AffineMap *map,
                                SmallVector<Value, 4> *operands,
                                int64_t offset = 0) { … }

// Stripmines `forOp` by `factor` and sinks it under each of the `targets`.
// Stripmine-sink is a primitive building block for generalized tiling of
// imperfectly nested loops.
// This transformation is purely mechanical and does not check legality,
// profitability or even structural correctness. It is the user's
// responsibility to specify `targets` that are dominated by `forOp`.
// Returns the new AffineForOps, one per target, nested immediately under
// each of the `targets`.
static SmallVector<AffineForOp, 8>
stripmineSink(AffineForOp forOp, uint64_t factor,
              ArrayRef<AffineForOp> targets) { … }

// Stripmines a `forOp` by `factor` and sinks it under a single `target`.
// Returns the new AffineForOp, nested immediately under `target`.
template <typename SizeType>
static AffineForOp stripmineSink(AffineForOp forOp, SizeType factor,
                                 AffineForOp target) { … }

SmallVector<SmallVector<AffineForOp, 8>, 8>
mlir::affine::tile(ArrayRef<AffineForOp> forOps, ArrayRef<uint64_t> sizes,
                   ArrayRef<AffineForOp> targets) { … }

SmallVector<AffineForOp, 8> mlir::affine::tile(ArrayRef<AffineForOp> forOps,
                                               ArrayRef<uint64_t> sizes,
                                               AffineForOp target) { … }

LogicalResult mlir::affine::coalesceLoops(MutableArrayRef<AffineForOp> loops) { … }

void mlir::affine::mapLoopToProcessorIds(scf::ForOp forOp,
                                         ArrayRef<Value> processorId,
                                         ArrayRef<Value> numProcessors) { … }

/// Given a memref region, determine the lowest depth at which transfers can be
/// placed for it, and return the corresponding block, start and end positions
/// in the block for placing incoming (read) and outgoing (write) copies
/// respectively. The lowest depth depends on whether the region being accessed
/// is hoistable with respect to one or more immediately surrounding loops.
static void
findHighestBlockForPlacement(const MemRefRegion &region, Block &block,
                             Block::iterator &begin, Block::iterator &end,
                             Block **copyPlacementBlock,
                             Block::iterator *copyInPlacementStart,
                             Block::iterator *copyOutPlacementStart) { … }

// Info comprising stride and number of elements transferred every stride.
struct StrideInfo { … };
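// To illustrate coalesceLoops, defined earlier above, with a minimal sketch
// (the bounds and the "use" op are placeholders chosen for exposition), a
// two-deep nest
//
// ```mlir
// affine.for %i = 0 to 32 {
//   affine.for %j = 0 to 8 {
//     "use"(%i, %j) : (index, index) -> ()
//   }
// }
// ```
//
// is collapsed into a single loop over the product trip count, with the
// original induction variables recovered via floordiv/mod:
//
// ```mlir
// affine.for %iv = 0 to 256 {
//   %i = affine.apply affine_map<(d0) -> (d0 floordiv 8)>(%iv)
//   %j = affine.apply affine_map<(d0) -> (d0 mod 8)>(%iv)
//   "use"(%i, %j) : (index, index) -> ()
// }
// ```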
/// Returns striding information for a copy/transfer of this region with
/// potentially multiple striding levels from outermost to innermost. For an
/// n-dimensional region, there can be at most n-1 levels of striding
/// successively nested.
// TODO: make this work with non-identity layout maps.
static void getMultiLevelStrides(const MemRefRegion &region,
                                 ArrayRef<int64_t> bufferShape,
                                 SmallVectorImpl<StrideInfo> *strideInfos) { … }

/// Generates a point-wise copy from/to `memref' to/from `fastMemRef' and
/// returns the outermost AffineForOp of the copy loop nest. `lbMaps` and
/// `ubMaps` along with `lbOperands` and `ubOperands` hold the lower and upper
/// bound information for the copy loop nest. `fastBufOffsets` contains the
/// expressions to be subtracted out from the respective copy loop iterators in
/// order to index the fast buffer. If `copyOut' is true, generates a copy-out;
/// otherwise, a copy-in. Builder `b` should be set to the point where the copy
/// nest is to be inserted.
//
/// The copy-in nest is generated as follows, shown for a 2-d region as an
/// example:
///     for x = ...
///       for y = ...
///         fast_buf[x - offset_x][y - offset_y] = memref[x][y]
///
static AffineForOp
generatePointWiseCopy(Location loc, Value memref, Value fastMemRef,
                      ArrayRef<AffineMap> lbMaps, ArrayRef<Value> lbOperands,
                      ArrayRef<AffineMap> ubMaps, ArrayRef<Value> ubOperands,
                      ArrayRef<AffineExpr> fastBufOffsets, bool isCopyOut,
                      OpBuilder b) { … }

static InFlightDiagnostic LLVM_ATTRIBUTE_UNUSED
emitRemarkForBlock(Block &block) { … }

/// Creates a buffer in the faster memory space for the specified memref
/// region; generates a copy from the lower memory space to this one, and
/// replaces all loads/stores in the block range [`begin', `end') of `block'
/// with loads/stores from that buffer. Returns failure if copies could not be
/// generated due to yet unimplemented cases. `copyInPlacementStart` and
/// `copyOutPlacementStart` in copyPlacementBlock specify the insertion points
/// where the incoming copies and outgoing copies, respectively, should be
/// inserted (the insertion happens right before the insertion point). Since
/// `begin` can itself be invalidated due to the memref rewriting done from
/// this method, the output argument `nBegin` is set to its replacement (set
/// to `begin` if no invalidation happens). Since outgoing copies could have
/// been inserted at `end`, the output argument `nEnd` is set to the new end.
/// `sizeInBytes` is set to the size of the fast buffer allocated.
static LogicalResult generateCopy(
    const MemRefRegion &region, Block *block, Block::iterator begin,
    Block::iterator end, Block *copyPlacementBlock,
    Block::iterator copyInPlacementStart, Block::iterator copyOutPlacementStart,
    const AffineCopyOptions &copyOptions, DenseMap<Value, Value> &fastBufferMap,
    DenseSet<Operation *> &copyNests, uint64_t *sizeInBytes,
    Block::iterator *nBegin, Block::iterator *nEnd) { … }

/// Constructs the memref region to just include the entire memref. Returns
/// false for dynamically shaped memrefs for now. `numParamLoopIVs` is the
/// number of enclosing loop IVs of `op` (starting from the outermost) that the
/// region is parametric on.
static bool getFullMemRefAsRegion(Operation *op, unsigned numParamLoopIVs,
                                  MemRefRegion *region) { … }

LogicalResult
mlir::affine::affineDataCopyGenerate(Block::iterator begin, Block::iterator end,
                                     const AffineCopyOptions &copyOptions,
                                     std::optional<Value> filterMemRef,
                                     DenseSet<Operation *> &copyNests) { … }
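// As a hedged sketch of the kind of IR affineDataCopyGenerate produces (the
// buffer name, shapes, region offsets, and memory space number below are
// illustrative only), a copy-in for a 64x64 region of %A into a fast buffer
// is the point-wise nest described above generatePointWiseCopy:
//
// ```mlir
// %fast_buf = memref.alloc() : memref<64x64xf32, 1>
// affine.for %i = 0 to 64 {
//   affine.for %j = 0 to 64 {
//     %v = affine.load %A[%i, %j] : memref<1024x1024xf32>
//     affine.store %v, %fast_buf[%i, %j] : memref<64x64xf32, 1>
//   }
// }
// ```
//
// With copyOptions.generateDma set, DMA start/wait operations would be
// emitted instead of such a load/store loop nest.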
// A convenience version of affineDataCopyGenerate for all ops in the body of
// an AffineForOp.
LogicalResult mlir::affine::affineDataCopyGenerate(
    AffineForOp forOp, const AffineCopyOptions &copyOptions,
    std::optional<Value> filterMemRef, DenseSet<Operation *> &copyNests) { … }

LogicalResult mlir::affine::generateCopyForMemRegion(
    const MemRefRegion &memrefRegion, Operation *analyzedOp,
    const AffineCopyOptions &copyOptions, CopyGenerateResult &result) { … }

/// Gathers all AffineForOps in 'block' at 'currLoopDepth' in 'depthToLoops'.
static void gatherLoopsInBlock(
    Block *block, unsigned currLoopDepth,
    std::vector<SmallVector<AffineForOp, 2>> &depthToLoops) { … }

/// Gathers all AffineForOps in 'func.func' grouped by loop depth.
void mlir::affine::gatherLoops(
    func::FuncOp func,
    std::vector<SmallVector<AffineForOp, 2>> &depthToLoops) { … }

AffineForOp mlir::affine::createCanonicalizedAffineForOp(
    OpBuilder b, Location loc, ValueRange lbOperands, AffineMap lbMap,
    ValueRange ubOperands, AffineMap ubMap, int64_t step) { … }

/// Creates an AffineIfOp that encodes the conditional to choose between
/// the constant trip count version and an unknown trip count version of this
/// nest of loops. This is used to separate partial and full tiles when `loops`
/// contains the intra-tile loops. The affine.if op is inserted at the
/// insertion point of builder `b`.
static AffineIfOp createSeparationCondition(MutableArrayRef<AffineForOp> loops,
                                            OpBuilder b) { … }

/// Creates the full tile loop nest (along with its body).
static LogicalResult
createFullTiles(MutableArrayRef<AffineForOp> inputNest,
                SmallVectorImpl<AffineForOp> &fullTileLoops, OpBuilder b) { … }

LogicalResult
mlir::affine::separateFullTiles(MutableArrayRef<AffineForOp> inputNest,
                                SmallVectorImpl<AffineForOp> *fullTileNest) { … }

LogicalResult affine::coalescePerfectlyNestedAffineLoops(AffineForOp op) { … }
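// As a closing, hedged illustration of separateFullTiles (the #is_full_tile
// set name is hypothetical; the actual separation condition is built by
// createSeparationCondition as an integer set over the tile-space IVs and
// symbols), the intra-tile loops of a tiled nest end up guarded roughly as:
//
// ```mlir
// affine.for %ii = 0 to %N step 32 {
//   affine.if #is_full_tile(%ii)[%N] {
//     // Full-tile loop nest with constant trip counts.
//     affine.for %i = ... {
//       ...
//     }
//   } else {
//     // Original (partial-tile) loop nest.
//     affine.for %i = ... {
//       ...
//     }
//   }
// }
// ```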