//===- LoopUtils.cpp ---- Misc utilities for loop transformation ----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements miscellaneous loop transformation routines.
//
//===----------------------------------------------------------------------===//

#include "mlir/Dialect/Affine/LoopUtils.h"

#include "mlir/Analysis/SliceAnalysis.h"
#include "mlir/Dialect/Affine/Analysis/LoopAnalysis.h"
#include "mlir/Dialect/Affine/Analysis/Utils.h"
#include "mlir/Dialect/Affine/IR/AffineValueMap.h"
#include "mlir/Dialect/Affine/Utils.h"
#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Dialect/SCF/IR/SCF.h"
#include "mlir/IR/IRMapping.h"
#include "mlir/IR/IntegerSet.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include <optional>

#define DEBUG_TYPE …

using namespace mlir;
using namespace affine;
using namespace presburger;
using llvm::SmallMapVector;

/// Computes the cleanup loop lower bound of the loop being unrolled with
/// the specified unroll factor; this bound will also be the upper bound of the
/// main part of the unrolled loop. Computes the bound as an AffineMap with its
/// operands, or a null map when the trip count can't be expressed as an affine
/// expression.
static void
getCleanupLoopLowerBound(AffineForOp forOp, unsigned unrollFactor,
                         AffineMap &cleanupLbMap,
                         SmallVectorImpl<Value> &cleanupLbOperands) { … }

/// Helper to replace uses of loop carried values (iter_args) and loop
/// yield values while promoting single iteration affine.for ops.
static void replaceIterArgsAndYieldResults(AffineForOp forOp) { … }

/// Promotes the loop body of a forOp to its containing block if the forOp
/// is known to have a single iteration.
LogicalResult mlir::affine::promoteIfSingleIteration(AffineForOp forOp) { … }

/// Generates an affine.for op with the specified lower and upper bounds
/// while generating the right IV remappings to realize shifts for operations
/// in its body. The operations that go into the loop body are specified in
/// `opGroupQueue` starting from the specified offset, and in that order. The
/// first element of each pair specifies the shift applied to that group of
/// operations; the shift is multiplied by the loop step before being applied.
/// Returns nullptr if the generated loop simplifies to a single iteration one.
static AffineForOp generateShiftedLoop(
    AffineMap lbMap, AffineMap ubMap,
    const std::vector<std::pair<uint64_t, ArrayRef<Operation *>>> &opGroupQueue,
    unsigned offset, AffineForOp srcForOp, OpBuilder b) { … }
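// As a minimal sketch of the single-iteration promotion performed by
// promoteIfSingleIteration above (the memref names and shapes are made up for
// exposition, not taken from a test case), a loop such as:
//
// ```mlir
// affine.for %i = 0 to 1 {
//   %v = affine.load %A[%i] : memref<16xf32>
//   affine.store %v, %B[%i] : memref<16xf32>
// }
// ```
//
// has all uses of %i replaced with its (constant) lower bound, its body moved
// into the enclosing block, and the loop erased, leaving roughly:
//
// ```mlir
// %c0 = arith.constant 0 : index
// %v = affine.load %A[%c0] : memref<16xf32>
// affine.store %v, %B[%c0] : memref<16xf32>
// ```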
// The skewing of operations with respect to one another can be used for
// example to allow overlap of asynchronous operations (such as DMA
// communication) with computation, or just relative shifting of operations
// for better register reuse, locality or parallelism. As such, the shifts are
// typically expected to be at most of the order of the number of operations.
// This method should not be used as a substitute for loop
// distribution/fission. This method uses an algorithm that runs in time linear
// in the number of operations in the body of the for loop (using the
// 'sweep line' paradigm). This method asserts the preservation of SSA
// dominance; a check for that, as well as for memory-based dependence
// preservation, rests with the users of this method.
LogicalResult mlir::affine::affineForOpBodySkew(AffineForOp forOp,
                                                ArrayRef<uint64_t> shifts,
                                                bool unrollPrologueEpilogue) { … }

/// Checks whether a loop nest is hyper-rectangular or not.
static LogicalResult
checkIfHyperRectangular(MutableArrayRef<AffineForOp> input) { … }

/// Checks if the input nest is supported for tiling and whether tiling would
/// be legal or not.
template <typename t>
static LogicalResult performPreTilingChecks(MutableArrayRef<AffineForOp> input,
                                            ArrayRef<t> tileSizes) { … }

/// Moves the loop body of AffineForOp 'src' from 'src' into the specified
/// location in the destination's body, ignoring the terminator.
static void moveLoopBodyImpl(AffineForOp src, AffineForOp dest,
                             Block::iterator loc) { … }

/// Moves the loop body of AffineForOp 'src' from 'src' to the start of dest's
/// body.
static void moveLoopBody(AffineForOp src, AffineForOp dest) { … }

/// Constructs a tiled loop nest, without setting the loop bounds, and moves
/// the body of the original loop nest to the tiled loop nest.
static void constructTiledLoopNest(MutableArrayRef<AffineForOp> origLoops,
                                   AffineForOp rootAffineForOp, unsigned width,
                                   MutableArrayRef<AffineForOp> tiledLoops) { … }

/// Sets lower and upper bounds of intra-tile loops for parametric tiling.
// TODO: Handle non-constant lower bounds.
static void setIntraTileBoundsParametric(OpBuilder &b, AffineForOp origLoop,
                                         AffineForOp newInterTileLoop,
                                         AffineForOp newIntraTileLoop,
                                         Value tileSize) { … }

/// Sets lower and upper bounds of inter-tile loops for parametric tiling.
// TODO: Handle non-constant lower bounds.
static void setInterTileBoundsParametric(OpBuilder &b, AffineForOp origLoop,
                                         AffineForOp newLoop,
                                         Value tileSize) { … }

/// Constructs and sets new loop bounds after tiling for the case of
/// hyper-rectangular index sets, where the bounds of one dimension do not
/// depend on other dimensions and tiling parameters are captured from SSA
/// values. Bounds of each dimension can thus be treated independently,
/// and deriving the new bounds is much simpler and faster than for the case of
/// tiling arbitrary polyhedral shapes.
static void constructParametricallyTiledIndexSetHyperRect(
    MutableArrayRef<AffineForOp> origLoops,
    MutableArrayRef<AffineForOp> newLoops, ArrayRef<Value> tileSizes) { … }

/// Constructs and sets new loop bounds after tiling for the case of
/// hyper-rectangular index sets, where the bounds of one dimension do not
/// depend on other dimensions. Bounds of each dimension can thus be treated
/// independently, and deriving the new bounds is much simpler and faster
/// than for the case of tiling arbitrary polyhedral shapes.
static void
constructTiledIndexSetHyperRect(MutableArrayRef<AffineForOp> origLoops,
                                MutableArrayRef<AffineForOp> newLoops,
                                ArrayRef<unsigned> tileSizes) { … }

LogicalResult
mlir::affine::tilePerfectlyNested(MutableArrayRef<AffineForOp> input,
                                  ArrayRef<unsigned> tileSizes,
                                  SmallVectorImpl<AffineForOp> *tiledNest) { … }
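// As a rough sketch of what tilePerfectlyNested produces (the bounds, tile
// sizes, and simplified intra-tile bound maps below are illustrative; the
// exact maps come from constructTiledIndexSetHyperRect and may include a min
// for partial tiles), tiling a 2-d nest with tile sizes {32, 32}:
//
// ```mlir
// affine.for %i = 0 to 256 {
//   affine.for %j = 0 to 512 {
//     ...
//   }
// }
// ```
//
// yields two tile-space loops stepping by the tile sizes and two intra-tile
// loops scanning each tile:
//
// ```mlir
// affine.for %ii = 0 to 256 step 32 {
//   affine.for %jj = 0 to 512 step 32 {
//     affine.for %i = affine_map<(d0) -> (d0)>(%ii)
//         to affine_map<(d0) -> (d0 + 32)>(%ii) {
//       affine.for %j = affine_map<(d0) -> (d0)>(%jj)
//           to affine_map<(d0) -> (d0 + 32)>(%jj) {
//         ...
//       }
//     }
//   }
// }
// ```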
/// Tiles the specified band of perfectly nested loops, creating tile-space
/// loops and intra-tile loops, using SSA values as tiling parameters. A band
/// is a contiguous set of loops.
LogicalResult mlir::affine::tilePerfectlyNestedParametric(
    MutableArrayRef<AffineForOp> input, ArrayRef<Value> tileSizes,
    SmallVectorImpl<AffineForOp> *tiledNest) { … }

/// Gets the perfectly nested sequence of loops starting at the root of the
/// loop nest. A loop is perfectly nested iff the first op in the loop's body
/// is another AffineForOp and the second op is a terminator.
void mlir::affine::getPerfectlyNestedLoops(
    SmallVectorImpl<AffineForOp> &nestedLoops, AffineForOp root) { … }

/// Identifies valid and profitable bands of loops to tile. This is currently
/// just a temporary placeholder to test the mechanics of tiled code
/// generation. Returns all maximal outermost perfect loop nests to tile.
void mlir::affine::getTileableBands(
    func::FuncOp f, std::vector<SmallVector<AffineForOp, 6>> *bands) { … }

/// Unrolls this loop completely.
LogicalResult mlir::affine::loopUnrollFull(AffineForOp forOp) { … }

/// Unrolls this loop by the specified factor or by the trip count (if
/// constant), whichever is lower.
LogicalResult mlir::affine::loopUnrollUpToFactor(AffineForOp forOp,
                                                 uint64_t unrollFactor) { … }

/// Generates unrolled copies of the loop body in 'loopBodyBlock', whose
/// induction variable is 'forOpIV', by 'unrollFactor', calling 'ivRemapFn' to
/// remap 'forOpIV' for each unrolled body copy. If specified, annotates the
/// ops in each unrolled iteration using 'annotateFn'.
static void generateUnrolledLoop(
    Block *loopBodyBlock, Value forOpIV, uint64_t unrollFactor,
    function_ref<Value(unsigned, Value, OpBuilder)> ivRemapFn,
    function_ref<void(unsigned, Operation *, OpBuilder)> annotateFn,
    ValueRange iterArgs, ValueRange yieldedValues) { … }

/// Helper to generate a cleanup loop for unroll or unroll-and-jam when the
/// trip count is not a multiple of `unrollFactor`.
static LogicalResult generateCleanupLoopForUnroll(AffineForOp forOp,
                                                  uint64_t unrollFactor) { … }

/// Unrolls this loop by the specified factor. Returns success if the loop
/// is successfully unrolled.
LogicalResult mlir::affine::loopUnrollByFactor(
    AffineForOp forOp, uint64_t unrollFactor,
    function_ref<void(unsigned, Operation *, OpBuilder)> annotateFn,
    bool cleanUpUnroll) { … }

LogicalResult
mlir::affine::loopUnrollJamUpToFactor(AffineForOp forOp,
                                      uint64_t unrollJamFactor) { … }

/// Checks if all control operands of all loops are defined outside of `forOp`
/// and returns false if not.
static bool areInnerBoundsInvariant(AffineForOp forOp) { … }

/// Unrolls and jams this loop by the specified factor.
LogicalResult mlir::affine::loopUnrollJamByFactor(AffineForOp forOp,
                                                  uint64_t unrollJamFactor) { … }

/// Performs loop interchange on 'forOpA' and 'forOpB', where 'forOpB' is
/// nested within 'forOpA' as the only non-terminator operation in its block.
void mlir::affine::interchangeLoops(AffineForOp forOpA, AffineForOp forOpB) { … }

// Checks each dependence component against the permutation to see if the
// desired loop interchange would violate dependences by making the
// dependence component lexicographically negative.
static bool checkLoopInterchangeDependences(
    const std::vector<SmallVector<DependenceComponent, 2>> &depCompsVec,
    ArrayRef<AffineForOp> loops, ArrayRef<unsigned> loopPermMap) { … }
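// As a minimal sketch of the interchange implemented by interchangeLoops
// above (the memref name and shape are illustrative), a two-deep nest has its
// loop headers swapped while the body is kept intact, so
//
// ```mlir
// affine.for %i = 0 to 128 {
//   affine.for %j = 0 to 64 {
//     %v = affine.load %A[%i, %j] : memref<128x64xf32>
//   }
// }
// ```
//
// becomes
//
// ```mlir
// affine.for %j = 0 to 64 {
//   affine.for %i = 0 to 128 {
//     %v = affine.load %A[%i, %j] : memref<128x64xf32>
//   }
// }
// ```
//
// checkLoopInterchangeDependences above is what determines whether such a
// swap keeps every dependence component lexicographically non-negative.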
/// Checks if the loop interchange permutation 'loopPermMap' of the perfectly
/// nested sequence of loops in 'loops' would violate dependences.
bool mlir::affine::isValidLoopInterchangePermutation(
    ArrayRef<AffineForOp> loops, ArrayRef<unsigned> loopPermMap) { … }

/// Returns true if `loops` is a perfectly nested loop nest, where loops appear
/// in it from outermost to innermost.
bool LLVM_ATTRIBUTE_UNUSED
mlir::affine::isPerfectlyNested(ArrayRef<AffineForOp> loops) { … }

// input[i] should move from position i -> permMap[i]. Returns the position in
// `input` that becomes the new outermost loop.
unsigned mlir::affine::permuteLoops(MutableArrayRef<AffineForOp> input,
                                    ArrayRef<unsigned> permMap) { … }

// Sinks all sequential loops to the innermost levels (while preserving
// relative order among them) and moves all parallel loops to the
// outermost (while again preserving relative order among them).
AffineForOp mlir::affine::sinkSequentialLoops(AffineForOp forOp) { … }

// Factors out common behavior to add a new `iv` (resp. `iv` + `offset`) to the
// lower (resp. upper) loop bound. When called for both the lower and upper
// bounds, the resulting IR resembles:
//
// ```mlir
// affine.for %i = max (`iv`, ...) to min (`iv` + `offset`) {
//   ...
// }
// ```
static void augmentMapAndBounds(OpBuilder &b, Value iv, AffineMap *map,
                                SmallVector<Value, 4> *operands,
                                int64_t offset = 0) { … }

// Stripmines `forOp` by `factor` and sinks it under each of the `targets`.
// Stripmine-sink is a primitive building block for generalized tiling of
// imperfectly nested loops.
// This transformation is purely mechanical and does not check legality,
// profitability or even structural correctness. It is the user's
// responsibility to specify `targets` that are dominated by `forOp`.
// Returns the new AffineForOps, one per target, nested immediately under
// each of the `targets`.
static SmallVector<AffineForOp, 8>
stripmineSink(AffineForOp forOp, uint64_t factor,
              ArrayRef<AffineForOp> targets) { … }

// Stripmines a `forOp` by `factor` and sinks it under a single `target`.
// Returns the new AffineForOp, nested immediately under `target`.
template <typename SizeType>
static AffineForOp stripmineSink(AffineForOp forOp, SizeType factor,
                                 AffineForOp target) { … }

SmallVector<SmallVector<AffineForOp, 8>, 8>
mlir::affine::tile(ArrayRef<AffineForOp> forOps, ArrayRef<uint64_t> sizes,
                   ArrayRef<AffineForOp> targets) { … }

SmallVector<AffineForOp, 8> mlir::affine::tile(ArrayRef<AffineForOp> forOps,
                                               ArrayRef<uint64_t> sizes,
                                               AffineForOp target) { … }

LogicalResult mlir::affine::coalesceLoops(MutableArrayRef<AffineForOp> loops) { … }

void mlir::affine::mapLoopToProcessorIds(scf::ForOp forOp,
                                         ArrayRef<Value> processorId,
                                         ArrayRef<Value> numProcessors) { … }

/// Given a memref region, determine the lowest depth at which transfers can be
/// placed for it, and return the corresponding block, start and end positions
/// in the block for placing incoming (read) and outgoing (write) copies
/// respectively. The lowest depth depends on whether the region being accessed
/// is hoistable with respect to one or more immediately surrounding loops.
static void
findHighestBlockForPlacement(const MemRefRegion &region, Block &block,
                             Block::iterator &begin, Block::iterator &end,
                             Block **copyPlacementBlock,
                             Block::iterator *copyInPlacementStart,
                             Block::iterator *copyOutPlacementStart) { … }

// Info comprising stride and number of elements transferred every stride.
struct StrideInfo { … };
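// To illustrate coalesceLoops, defined earlier above, with a minimal sketch
// (the bounds and the "use" op are placeholders chosen for exposition), a
// two-deep nest
//
// ```mlir
// affine.for %i = 0 to 32 {
//   affine.for %j = 0 to 8 {
//     "use"(%i, %j) : (index, index) -> ()
//   }
// }
// ```
//
// is collapsed into a single loop over the product trip count, with the
// original induction variables recovered via floordiv/mod:
//
// ```mlir
// affine.for %iv = 0 to 256 {
//   %i = affine.apply affine_map<(d0) -> (d0 floordiv 8)>(%iv)
//   %j = affine.apply affine_map<(d0) -> (d0 mod 8)>(%iv)
//   "use"(%i, %j) : (index, index) -> ()
// }
// ```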
/// Returns striding information for a copy/transfer of this region with
/// potentially multiple striding levels from outermost to innermost. For an
/// n-dimensional region, there can be at most n-1 levels of striding
/// successively nested.
// TODO: make this work with non-identity layout maps.
static void getMultiLevelStrides(const MemRefRegion &region,
                                 ArrayRef<int64_t> bufferShape,
                                 SmallVectorImpl<StrideInfo> *strideInfos) { … }

/// Generates a point-wise copy from/to `memref' to/from `fastMemRef' and
/// returns the outermost AffineForOp of the copy loop nest. `lbMaps` and
/// `ubMaps` along with `lbOperands` and `ubOperands` hold the lower and upper
/// bound information for the copy loop nest. `fastBufOffsets` contains the
/// expressions to be subtracted out from the respective copy loop iterators in
/// order to index the fast buffer. If `copyOut' is true, generates a copy-out;
/// otherwise, a copy-in. Builder `b` should be set to the point where the copy
/// nest is to be inserted.
//
/// The copy-in nest is generated as follows, shown for a 2-d region as an
/// example:
///     for x = ...
///       for y = ...
///         fast_buf[x - offset_x][y - offset_y] = memref[x][y]
///
static AffineForOp
generatePointWiseCopy(Location loc, Value memref, Value fastMemRef,
                      ArrayRef<AffineMap> lbMaps, ArrayRef<Value> lbOperands,
                      ArrayRef<AffineMap> ubMaps, ArrayRef<Value> ubOperands,
                      ArrayRef<AffineExpr> fastBufOffsets, bool isCopyOut,
                      OpBuilder b) { … }

static InFlightDiagnostic LLVM_ATTRIBUTE_UNUSED
emitRemarkForBlock(Block &block) { … }

/// Creates a buffer in the faster memory space for the specified memref
/// region; generates a copy from the lower memory space to this one, and
/// replaces all loads/stores in the block range [`begin', `end') of `block'
/// with loads/stores from that buffer. Returns failure if copies could not be
/// generated due to yet unimplemented cases. `copyInPlacementStart` and
/// `copyOutPlacementStart` in copyPlacementBlock specify the insertion points
/// where the incoming copies and outgoing copies, respectively, should be
/// inserted (the insertion happens right before the insertion point). Since
/// `begin` can itself be invalidated due to the memref rewriting done from
/// this method, the output argument `nBegin` is set to its replacement (set
/// to `begin` if no invalidation happens). Since outgoing copies could have
/// been inserted at `end`, the output argument `nEnd` is set to the new end.
/// `sizeInBytes` is set to the size of the fast buffer allocated.
static LogicalResult generateCopy(
    const MemRefRegion &region, Block *block, Block::iterator begin,
    Block::iterator end, Block *copyPlacementBlock,
    Block::iterator copyInPlacementStart, Block::iterator copyOutPlacementStart,
    const AffineCopyOptions &copyOptions, DenseMap<Value, Value> &fastBufferMap,
    DenseSet<Operation *> &copyNests, uint64_t *sizeInBytes,
    Block::iterator *nBegin, Block::iterator *nEnd) { … }

/// Constructs the memref region to just include the entire memref. Returns
/// false for dynamically shaped memrefs for now. `numParamLoopIVs` is the
/// number of enclosing loop IVs of `op` (starting from the outermost) that the
/// region is parametric on.
static bool getFullMemRefAsRegion(Operation *op, unsigned numParamLoopIVs,
                                  MemRefRegion *region) { … }

LogicalResult
mlir::affine::affineDataCopyGenerate(Block::iterator begin, Block::iterator end,
                                     const AffineCopyOptions &copyOptions,
                                     std::optional<Value> filterMemRef,
                                     DenseSet<Operation *> &copyNests) { … }
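// As a hedged sketch of the kind of IR affineDataCopyGenerate produces (the
// buffer name, shapes, region offsets, and memory space number below are
// illustrative only), a copy-in for a 64x64 region of %A into a fast buffer
// is the point-wise nest described above generatePointWiseCopy:
//
// ```mlir
// %fast_buf = memref.alloc() : memref<64x64xf32, 1>
// affine.for %i = 0 to 64 {
//   affine.for %j = 0 to 64 {
//     %v = affine.load %A[%i, %j] : memref<1024x1024xf32>
//     affine.store %v, %fast_buf[%i, %j] : memref<64x64xf32, 1>
//   }
// }
// ```
//
// With copyOptions.generateDma set, DMA start/wait operations would be
// emitted instead of such a load/store loop nest.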
// A convenience version of affineDataCopyGenerate for all ops in the body of
// an AffineForOp.
LogicalResult mlir::affine::affineDataCopyGenerate(
    AffineForOp forOp, const AffineCopyOptions &copyOptions,
    std::optional<Value> filterMemRef, DenseSet<Operation *> &copyNests) { … }

LogicalResult mlir::affine::generateCopyForMemRegion(
    const MemRefRegion &memrefRegion, Operation *analyzedOp,
    const AffineCopyOptions &copyOptions, CopyGenerateResult &result) { … }

/// Gathers all AffineForOps in 'block' at 'currLoopDepth' in 'depthToLoops'.
static void gatherLoopsInBlock(
    Block *block, unsigned currLoopDepth,
    std::vector<SmallVector<AffineForOp, 2>> &depthToLoops) { … }

/// Gathers all AffineForOps in 'func.func' grouped by loop depth.
void mlir::affine::gatherLoops(
    func::FuncOp func,
    std::vector<SmallVector<AffineForOp, 2>> &depthToLoops) { … }

AffineForOp mlir::affine::createCanonicalizedAffineForOp(
    OpBuilder b, Location loc, ValueRange lbOperands, AffineMap lbMap,
    ValueRange ubOperands, AffineMap ubMap, int64_t step) { … }

/// Creates an AffineIfOp that encodes the conditional to choose between
/// the constant trip count version and an unknown trip count version of this
/// nest of loops. This is used to separate partial and full tiles when `loops`
/// contains the intra-tile loops. The affine.if op is inserted at the
/// insertion point of builder `b`.
static AffineIfOp createSeparationCondition(MutableArrayRef<AffineForOp> loops,
                                            OpBuilder b) { … }

/// Creates the full tile loop nest (along with its body).
static LogicalResult
createFullTiles(MutableArrayRef<AffineForOp> inputNest,
                SmallVectorImpl<AffineForOp> &fullTileLoops, OpBuilder b) { … }

LogicalResult
mlir::affine::separateFullTiles(MutableArrayRef<AffineForOp> inputNest,
                                SmallVectorImpl<AffineForOp> *fullTileNest) { … }

LogicalResult affine::coalescePerfectlyNestedAffineLoops(AffineForOp op) { … }
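// As a closing, hedged illustration of separateFullTiles (the #is_full_tile
// set name is hypothetical; the actual separation condition is built by
// createSeparationCondition as an integer set over the tile-space IVs and
// symbols), the intra-tile loops of a tiled nest end up guarded roughly as:
//
// ```mlir
// affine.for %ii = 0 to %N step 32 {
//   affine.if #is_full_tile(%ii)[%N] {
//     // Full-tile loop nest with constant trip counts.
//     affine.for %i = ... {
//       ...
//     }
//   } else {
//     // Original (partial-tile) loop nest.
//     affine.for %i = ... {
//       ...
//     }
//   }
// }
// ```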