llvm/mlir/include/mlir/Dialect/Tensor/Transforms/TransformUtils.h

//===- TransformsUtils.h - Tensor Transformation Utilities-------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#ifndef MLIR_DIALECT_TENSOR_TRANSFORMS_TRANSFORMUTILS_H
#define MLIR_DIALECT_TENSOR_TRANSFORMS_TRANSFORMUTILS_H

#include "mlir/Dialect/Tensor/IR/Tensor.h"
#include "mlir/IR/PatternMatch.h"

namespace mlir {
namespace tensor {

//===----------------------------------------------------------------------===//
// Extract slice from `tensor.collapse_shape`
//===----------------------------------------------------------------------===//

/// This class assists with generating IR required to materialize an
/// arbitrary-sized slice from the result of a CollapseShapeOp. In order to
/// accomplish this, a loop nest or similar operation must be created by the
/// caller. The purpose of the loop nest is to generate a "tiling by 1" of all
/// sliced dimensions. The "tiling by 1" assembles all elements of the result
/// tile over dimensions that would have been impossible to directly slice.
///
/// The class provides three methods:
/// 1. `ExtractSliceFromCollapseHelper::create`: emits IR that should
/// appear before the loop nest and populates the internal state.
/// 2. `ExtractSliceFromCollapseHelper::getIterationSpaceSizes`: returns
/// parameters used by the caller to construct the loop nest.
/// 3. `ExtractSliceFromCollapseHelper::emitLoopNestBody`:
/// emits IR to construct a "size-1 tile" of the desired result and returns a
/// set of ranges where the tile should be inserted into the destination
/// tensor.
///
/// ### Intended usage:
///
/// The caller should first call `ExtractSliceFromCollapseHelper::create` and
/// then create a destination tensor that is the same size as the desired slice.
/// The caller then creates a loop nest that iterates over the multi-dimensional
/// iteration space defined by `[0, ub[0]) x [0, ub[1]] x ... x [0, ub[N-1]]`
/// where `ub` is the upper bound given by
/// `ExtractSliceFromCollapseHelper::getIterationSpaceSizes`. Inside the body of
/// the loop nest, the caller should call
/// `ExtractSliceFromCollapseHelper::emitLoopNestBody` and provide the induction
/// variables. This returns a sub-tile and a set of ranges that describe where
/// this tile should be inserted into the result by the caller. For a complete
/// example of usage, see the examples in the TestTensorTransforms pass.
///
/// ### Example:
/// Consider the following IR:
/// ```
/// %0 = linalg.generic ... -> tensor<3x?x?x11x?xf32>
/// %1 = tensor.collapse_shape %0 [[0, 1, 2], [3, 4]]
///        : tensor<3x?x?x11x?xf32> into tensor<?x?xf32>
/// %2 = tensor.extract_slice %1 [%offt0, %offt1][%size0, %size1][1, 1]
///        : tensor<?x?xf32> to tensor<?x?xf32>
/// ```
///
/// We can construct %2 by generating the following, which only uses `%0`:
///
/// ```
/// %dest = tensor.empty(%size0, %size1) : tensor<?x?xf32>
/// %1 = tensor.dim %0, %c1 : tensor<3x?x?x11x?xf32>
/// %2 = tensor.dim %0, %c2 : tensor<3x?x?x11x?xf32>
/// %3 = tensor.dim %0, %c4 : tensor<3x?x?x11x?xf32>
///
/// %result = scf.for %iv0 = %c0 to %arg2 step %c1 iter_args(%arg6 = %dest) ->
///                                                  (tensor<?x?xf32>) {
///   %5 = scf.for %iv1 = %c0 to %arg4 step %c1 iter_args(%arg8 = %arg6)
///                                                  -> (tensor<?x?xf32>) {
///     %lin0 = (affine.apply) %iv0 + %offt0
///     %lin1 = (affine.apply) %iv1 + %offt1
///
///     %mi0:3 = affine.delinearize_index %lin0 into (%c3, %1, %2)
///     %mi1:2 = affine.delinearize_index %lin1 into (%c11, %3)
///
///     %sub_tile = tensor.extract_slice %0
///                    [%mi0#0, %mi0#1, %mi0#2, %mi1#0, %mi1#1]
///                    [1, 1, 1, 1, 1]
///                    [1, 1, 1, 1, 1]
///            : tensor<3x?x?x11x?xf32> to tensor<1x1x1x1x1xf32>
///     %sub_tile_collapsed = tensor.collapse_shape %sub_tile
///             [[0, 1, 2], [3, 4]]
///            : tensor<1x1x1x1x1xf32> into tensor<1x1xf3
///
///     %12 = tensor.insert_slice %sub_tile_collapsed into
///             %arg8[%iv0, %iv1] [1, 1] [1, 1]
///             : tensor<1x1xf32> into tensor<?x?xf32>
///     scf.yield %12 : tensor<?x?xf32>
///   }
///   scf.yield %5 : tensor<?x?xf32>
/// }
/// ```
///
/// ### Explanation of example:
///
/// Each step above is explained below.
///
/// #### Step 0: Create %dest and materialization of shapes.
/// This step is self-explanatory and performed by the caller. It can be
/// done before or after calling `ExtractSliceFromCollapseHelper::create`,
/// which materializes the source shape (`%0, %1, %2`).
///
/// #### Step 1: Create loop nest.
///
/// The caller creates the loop nest (depicted here is `scf.for`, but any other
/// similar op can be used). The iteration should start at zero and proceed with
/// step size 1 to the upper bounds given by
/// `ExtractSliceFromCollapseHelper::getIterationSpaceSizes`. This forms the
/// basis for the "tiling by 1".
///
/// #### Step 2: Transform (%iv0, %iv1) from the index space of %3 to the index
/// space of %0.
///
/// This step is performed by
/// `ExtractSliceFromCollapseHelper::emitLoopNestBody`.
///
/// The induction variables `%iv0` and `%iv1` live in the
/// index space of %2 (for dimensions 0 and 1, respectively). `%lin0` and
/// `%lin1` are the result of inverting or resolve the index space
/// transformation represented by the slice operation, accounting for offset and
/// stride. Subsequently, `%mi0` and `%mi1` are the result of applying the
/// inverse index space transformation represented by `tensor.collapse_shape`.
/// This is accomplished using `affine.delinearize_index`. Note that %iv0
/// and %iv1 now correspond to multi-indices `%mi0:3` and `%mi1:2`.
///
/// #### Step 3: Extract a sub-tile slice from the source.
///
/// This step is also performed by
/// `ExtractSliceFromCollapseHelper::emitLoopNestBody`.
///
/// The indices `%mi0` and `%mi1` are used to extract a slice from %0.  This
/// slice is then collapsed down to match the result rank.
///
/// #### Step 4: Insert sub-tile into the destination
///
/// This step is performed by the caller using the results of
/// `ExtractSliceFromCollapseHelper::emitLoopNestBody`.
///
/// In the above example, the slice insertion parameters are straightforward,
/// but in other possible situations, the slice parameters are more complicated,
/// which is why this helper calculates them for the caller. These other
/// situations correspond to:
/// 1. The presence of linearized dimensions that are not sliced
/// 2. The presence of non-linearized dimensions that are sliced.
class ExtractSliceFromCollapseHelper {};

/// Tries to simplify a `tensor.collapse_shape` operation by inserting a single
/// rank-reducing `tensor.extract_slice` operation. The `extract_slice` op will
/// either take the place of the source, allowing for a new, simpler
/// `collapse_shape` op to replace `op`, or the `collapse_shape` op will be
/// completely replaced by the `extract_slice` result. Either way, `op` is
/// replaced and the new op is returned.
///
/// ### Example:
/// ```
/// %result = tensor.collapse_shape %0 [[0, 1], [2, 3]]
///    : tensor<?x1x30x10xf32> to tensor<?x300xf32>
/// ```
/// can be transformed to
///
/// ```
/// %tmp = tensor.extract_slice %0 [0, 0, 0, 0]
///                         [0, %dim1, 30, 30]
///                         [1, 1, 1 1]
///   : tensor<?x1x30x10xf32> to tensor<?x30x10xf32>
/// %result = tensor.collapse_shape %tmp [[0], [1, 2]]
///   : tensor<?x30x10xf32> to tensor<?x300xf32>
/// ```
///
/// ### Example:
///
/// ```
/// %result = tensor.collapse_shape %1 [[0, 1], [2]]
///    : tensor<?x1x30xf32> to tensor<?x30xf32>
/// ```
/// can be transformed to
/// ```
/// %result = tensor.extract_slice %1 [0, 0, 0]
///                                   [%dim2, 1, 30]
///                                   [1, 1, 1]
///    : tensor<?x1x30xf32> to tensor<?x30xf32>
/// ```
///
/// ### Unsupported cases:
///
/// This transform doesn't yet support reducing the rank of the reassociation
/// indices, which would require inserting a `tensor.expand_shape` op similar to
/// the following example:
/// ```
/// %result = tensor.collapse_shape %0 [[0, 1], [2, 3]]
///    : tensor<1x1x30x10xf32> to tensor<1x300xf32>
/// ```
/// can be transformed to
/// ```
/// %tmp = tensor.extract_slice %0 [0, 0, 0, 0]
///                         [0, 1, 30, 30]
///                         [1, 1, 1 1]
///   : tensor<1x1x30x10xf32> to tensor<30x10xf32>
/// %result0 = tensor.collapse_shape %tmp [[0, 1]]
///   : tensor<30x10xf32> to tensor<300xf32>
/// %result1 = tensor.expand_shape %tmp [[0, 1], [2]] :... tensor<1x300xf32>
/// ```
///
FailureOr<Operation *>
simplifyCollapseShapeWithRankReducingExtractSlice(tensor::CollapseShapeOp op,
                                                  RewriterBase &rewriter);
} // namespace tensor
} // namespace mlir

#endif // MLIR_DIALECT_TENSOR_TRANSFORMS_TRANSFORMUTILS_H