//===- Tiling.cpp - Implementation of tiling using TilingInterface -------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // // This file implements the tiling using TilingInterface. // //===----------------------------------------------------------------------===// #include "mlir/Dialect/SCF/Transforms/TileUsingInterface.h" #include "mlir/Dialect/Affine/IR/AffineOps.h" #include "mlir/Dialect/Arith/IR/Arith.h" #include "mlir/Dialect/Arith/Utils/Utils.h" #include "mlir/Dialect/Func/IR/FuncOps.h" #include "mlir/Dialect/SCF/Utils/Utils.h" #include "mlir/Dialect/Tensor/IR/Tensor.h" #include "mlir/Dialect/Utils/IndexingUtils.h" #include "mlir/IR/Dominance.h" #include "mlir/IR/Matchers.h" #include "mlir/IR/PatternMatch.h" #include "mlir/Interfaces/DestinationStyleOpInterface.h" #include "mlir/Interfaces/TilingInterface.h" #include "llvm/ADT/TypeSwitch.h" #include "llvm/Support/Debug.h" #include <optional> #define DEBUG_TYPE … usingnamespacemlir; scf::SCFTilingOptions & scf::SCFTilingOptions::setTileSizes(ArrayRef<OpFoldResult> ts) { … } scf::SCFTilingOptions & scf::SCFTilingOptions::setNumThreads(ArrayRef<OpFoldResult> nt) { … } /// Helper method to adjust the interchange vector to match the iteration /// domain. static SmallVector<int64_t> fillInterchangeVector(ArrayRef<int64_t> interchangeVector, size_t iterationDomainSize) { … } //===----------------------------------------------------------------------===// // tileUsingSCF implementation. //===----------------------------------------------------------------------===// /// Verify the tile size options are set in a consistent manner. static LogicalResult verifyTileSizeOptions(RewriterBase &rewriter, Location loc, const scf::SCFTilingOptions &options) { … } /// Method to instantiate the tile sizes and/or number of threads specified /// by the user. static std::tuple<SmallVector<OpFoldResult>, SmallVector<OpFoldResult>> getUserTileSizesAndNumThreads(RewriterBase &rewriter, TilingInterface op, ArrayRef<Range> iterationDomain, const scf::SCFTilingOptions &options) { … } /// Checks if any of the tiled loops are not parallel. static void checkSafeToTileToForall(TilingInterface op, ArrayRef<OpFoldResult> tileSizes, ArrayRef<OpFoldResult> numThreads) { … } /// Check if `stride` evenly divides the trip count `size - offset`. static bool tileDividesIterationDomain(Range loopRange) { … } /// Returns the bounded tile size given the current `offset`, `loopRange` and /// `tileSize`, i.e., `min(tileSize, range.end() - offset)`. static OpFoldResult getBoundedTileSize(OpBuilder &b, Location loc, Range loopRange, OpFoldResult offset, OpFoldResult tileSize) { … } /// Returns true if the maximum tile offset `tileSize * numThreads-1` is less /// than `iterationSize`. static bool canOmitTileOffsetInBoundsCheck(OpFoldResult tileSize, OpFoldResult numThreads, OpFoldResult iterationSize) { … } /// Compute the `OpFoldResult`s that represents the multi-dimensional /// `offset`s and `size`s of the tile of the iteration space that the /// innermost loop body of the generated tiled loops corresponds to. static std::tuple<SmallVector<OpFoldResult>, SmallVector<OpFoldResult>> getTileOffsetAndSizes(RewriterBase &rewriter, Location loc, ValueRange ivs, ArrayRef<Range> iterationDomain, ArrayRef<OpFoldResult> tileSizes, ArrayRef<OpFoldResult> numThreads) { … } /// Function to return the bounds of the loops to be generated. static std::tuple<SmallVector<OpFoldResult>, SmallVector<OpFoldResult>, SmallVector<OpFoldResult>> getLoopBounds(RewriterBase &rewriter, Location loc, ArrayRef<Range> loopRanges, ArrayRef<OpFoldResult> tileSizes) { … } /// A function that allows returning additional yielded values during /// `yieldTiledValuesAndReplace`. /// - `ivs` induction variable for the loop. /// - `newBbArgs` basic block arguments corresponding to newly added iter_args. /// - `tiledValues` the tiled values to return. Must be of same size as /// `newbbArgs`, each element of this array is inserted into the corresponding /// element in `newbbArgs`. /// - `resultOffsets` is of the same size as `tiledValues` and represents /// the offsets to use when inserting corresponding element from `tiledValues` /// into the element from `newBbArgs`. /// - `resultSizes` is of the same size as `tiledValues` and represents /// the size of the corresponding element from `tiledValues` inserted into /// the element from `newBbArgs`. /// In case the method needs to return `failure()` the method is expected /// to clean up any inserted operations. YieldTiledValuesFn; /// Clones the operation and updates the destination if the operation /// implements the `DestinationStyleOpInterface`. static Operation *cloneOpAndUpdateDestinationArgs(RewriterBase &rewriter, Operation *op, ValueRange newDestArgs) { … } /// Generate the tile-loop nest using `scf.for` operation. /// - `loopRanges` specifies the lb, ub and step of the untiled iteration space. /// - `tileSizes` is the tile sizes to use. Zero represent untiled loops. /// - `destinationTensors` are the init values to use for the outer most loop. /// - `yieldTiledValuesFn` is called to generated the loop body of the inner /// most /// loop. /// - `loops` is an in-out parameter into which the generated loops are /// populated. static LogicalResult generateLoopNestUsingForOp( RewriterBase &rewriter, Location loc, ArrayRef<Range> loopRanges, ArrayRef<OpFoldResult> tileSizes, ValueRange destinationTensors, YieldTiledValuesFn yieldTiledValuesFn, SmallVector<LoopLikeOpInterface> &loops) { … } /// Generate the tile-loop nest using `scf.forall` operation. /// - `loopRanges` specifies the lb, ub and step of the untiled iteration space. /// - `tileSizes` is the tile sizes to use. Zero represent untiled loops. /// - `destinationTensors` are the init values to use for the outer most loop. /// - `mappingVector` is the mapping attributes to use for loop construction. /// Can be empty. /// - `yieldTiledValuesFn` is called to generated the loop body of the inner /// most /// loop. /// - `loops` is an in-out parameter into which the generated loops are /// populated. static LogicalResult generateLoopNestUsingForallOp( RewriterBase &rewriter, Location loc, ArrayRef<Range> loopRanges, ArrayRef<OpFoldResult> tileSizes, ArrayRef<OpFoldResult> numThreads, ArrayRef<Attribute> mappingVector, ValueRange destinationTensors, YieldTiledValuesFn tiledBodyFn, SmallVector<LoopLikeOpInterface> &loops) { … } /// Generate the tile-loop nest using the loop construct specifed in `options`. /// - `options`: Tiling options specified. /// - `loopRanges` specifies the lb, ub and step of the untiled iteration space. /// - `tileSizes` is the tile sizes to use. Zero represent untiled loops. /// - `destinationTensors` are the init values to use for the outer most loop. /// - `yieldTiledValuesFn` is called to generated the loop body of the inner /// most /// loop. /// - `loops` is an in-out parameter into which the generated loops are /// populated. static LogicalResult generateLoopNest( RewriterBase &rewriter, Location loc, const scf::SCFTilingOptions &options, ArrayRef<Range> loopRanges, ArrayRef<OpFoldResult> tileSizes, ArrayRef<OpFoldResult> numThreads, ValueRange destinationTensors, YieldTiledValuesFn tiledBodyFn, SmallVector<LoopLikeOpInterface> &loops) { … } /// Append the specified additional `newInitOperands` operands to the /// loops existing `init` operands (or similar), and replace `loopOp` with /// the new loop that has the additional init operands. The loop body of /// this loop is moved over to the new loop. `yieldTiledValuesFn` /// is called to get the new tiled values returned, and the offset /// and sizes at which the tiled value is inserted into the /// new region iter_args that correspond to the newly added init operands. template <typename LoopType> FailureOr<LoopLikeOpInterface> yieldTiledValuesAndReplaceLoop(LoopType loopOp, RewriterBase &rewriter, ValueRange newInitOperands, YieldTiledValuesFn yieldTiledValuesFn) { … } /// Implementation of `yieldTiledValuesAndReplaceLoop` for `scf.for`. template <> FailureOr<LoopLikeOpInterface> yieldTiledValuesAndReplaceLoop<scf::ForOp>( scf::ForOp loopOp, RewriterBase &rewriter, ValueRange newInitOperands, YieldTiledValuesFn yieldTiledValuesFn) { … } /// Implementation of `yieldTiledValuesAndReplaceLoop` for `scf.forall` template <> FailureOr<LoopLikeOpInterface> yieldTiledValuesAndReplaceLoop<scf::ForallOp>( scf::ForallOp loopOp, RewriterBase &rewriter, ValueRange newInitOperands, YieldTiledValuesFn yieldTiledValuesFn) { … } /// Implementation of `yieldTiledValuesAndReplaceLoop` for /// `LoopLikeOpInterface`, that just dispatches to the implementation for each /// supported loop type. FailureOr<LoopLikeOpInterface> yieldTiledValuesAndReplaceLoop( LoopLikeOpInterface loopLikeOp, RewriterBase &rewriter, ValueRange newInitOperands, YieldTiledValuesFn yieldTiledValuesFn) { … } /// Method to add new init values to a loop nest. Updates `loops` in-place with /// new loops that use the `newInitValues`. /// The outer-loops are updated to yield the new result values of the inner /// loop. For the innermost loop, the call back `getNewYields` is invoked to get /// the additional values to yield form the innermost loop. static LogicalResult addInitOperandsToLoopNest( RewriterBase &rewriter, MutableArrayRef<LoopLikeOpInterface> loops, ValueRange newInitValues, YieldTiledValuesFn getNewTiledYieldsFn) { … } /// Implementation of tiling transformation of `op` that implements the /// `TilingInterface` using `scf.for` to iterate over the tiles. FailureOr<scf::SCFTilingResult> mlir::scf::tileUsingSCF(RewriterBase &rewriter, TilingInterface op, const scf::SCFTilingOptions &options) { … } FailureOr<scf::SCFReductionTilingResult> mlir::scf::tileReductionUsingScf(RewriterBase &b, PartialReductionOpInterface op, ArrayRef<OpFoldResult> tileSizes) { … } //===----------------------------------------------------------------------===// // tileConsumerAndFuseProducersUsingSCF implementation. //===----------------------------------------------------------------------===// /// Return the untiled producer whose slice is used in a tiled consumer. The /// method traverses the tile loop nest (`loops`) if needed, and returns the /// `iter_args` of the outer most that is encountered. Traversing the iter_args /// indicates that this is a destination operand of the consumer. If there was /// no loop traversal needed, the second value of the returned tuple is empty. static std::tuple<OpResult, std::optional<OpOperand *>> getUntiledProducerFromSliceSource(OpOperand *source, ArrayRef<LoopLikeOpInterface> loops) { … } /// Implementation of fusing producer of a single slice by computing the /// slice of the producer in-place. std::optional<scf::SCFFuseProducerOfSliceResult> mlir::scf::tileAndFuseProducerOfSlice( RewriterBase &rewriter, tensor::ExtractSliceOp candidateSliceOp, MutableArrayRef<LoopLikeOpInterface> loops) { … } /// Reconstruct the fused producer from within the tiled-and-fused code. FailureOr<SmallVector<Operation *>> mlir::scf::yieldReplacementForFusedProducer( RewriterBase &rewriter, tensor::ExtractSliceOp sliceOp, scf::SCFFuseProducerOfSliceResult fusedProducerInfo, MutableArrayRef<LoopLikeOpInterface> loops, ArrayRef<unsigned> yieldResultNumber) { … } /// Implementation of tile consumer and fuse producer greedily. FailureOr<scf::SCFTileAndFuseResult> mlir::scf::tileConsumerAndFuseProducersUsingSCF( RewriterBase &rewriter, TilingInterface consumer, const scf::SCFTileAndFuseOptions &options) { … } //===----------------------------------------------------------------------===// // tileAndFuseConsumerUsingSCF implementation. //===----------------------------------------------------------------------===// /// A utility function that checks whether the only use of the result of a /// tensor.insert_slice op is in a scf.yield op. static LogicalResult checkAssumptionForFusingConsumer(tensor::InsertSliceOp candidateSliceOp) { … } /// Fetches the OpOperand of the only user (and use) of the value `val` which /// implements `TilingInterface` and `DestinationStyleOpInterface`. Returns /// failure otherwise. static FailureOr<OpOperand *> getConsumerFromUses(Value val, Block *containingOpBlock) { … } /// Find the perfectly nested loops outside of given loop(included) sorted from /// outer to inner. /// /// E.g. /// /// ``` /// %0 = scf.for() /// %1 = scf.for() /// %2 = scf.for() /// %3 = ... /// yield %3 /// yield %2 /// yield %1 /// ``` /// /// This function will return three perfectly nested loops: %0 + %1 + %2, when /// target inner loop is %2. static SmallVector<scf::ForOp> getPerfectlyNestedLoopsOutsideOf(scf::ForOp loop) { … } /// Fetch the untiled consumer of a scf.for's result which is yielded by a /// tensor.insert_slice. This function makes the following assumptions : /// 1. tensor.insert_slice has scf.yield as its only user. /// 2. scf.for's corresponding result has only one use. static FailureOr<OpOperand *> getUntiledConsumerFromSlice(tensor::InsertSliceOp candidateSliceOp) { … } /// Fetch the first untiled consumer of a scf.forall's result which is yielded /// by a tensor.parallel_insert_slice. static FailureOr<OpOperand *> getUntiledConsumerFromSlice(tensor::ParallelInsertSliceOp candidateSliceOp) { … } /// This utility currently checks whether the loop either :- /// 1. Yields exactly one result. /// 2. Has consumer op as its first user and other users to be in the same /// containing block as that of consumer op's. Currently we clone the loop op /// right before the consumer op in order to maintain a valid def-use chain. /// This utility thus helps ensuring that no invalid IR is formed due to the /// same. static LogicalResult checkAssumptionForLoop(Operation *loopOp, Operation *consumerOp) { … } /// A utility to fetch an untiled consumer of /// tensor.insert_slice/tensor.parallel_insert_slice. static FailureOr<OpOperand *> getUntiledConsumerFromSlice(Operation *sliceOp) { … } /// Implementation of fusing consumer of a single slice by computing the /// slice of the consumer in-place for scf loop. FailureOr<scf::SCFFuseConsumerOfSliceResult> mlir::scf::tileAndFuseConsumerOfSlice(RewriterBase &rewriter, Operation *candidateSliceOp) { … } //===----------------------------------------------------------------------===// // lowerToLoopsUsingSCFForOp implementation. //===----------------------------------------------------------------------===// FailureOr<SmallVector<scf::ForOp>> mlir::scf::lowerToLoopsUsingSCFForOp(RewriterBase &rewriter, TilingInterface op) { … }