
//===- VectorToSCF.cpp - Convert vector to SCF dialect ----------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements lowering of vector transfer operations to SCF.
//
//===----------------------------------------------------------------------===//

#include <numeric>
#include <optional>
#include <type_traits>

#include "mlir/Conversion/VectorToSCF/VectorToSCF.h"

#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Dialect/SCF/IR/SCF.h"
#include "mlir/Dialect/Tensor/IR/Tensor.h"
#include "mlir/Dialect/Vector/IR/VectorOps.h"
#include "mlir/Dialect/Vector/Transforms/LoweringPatterns.h"
#include "mlir/Dialect/Vector/Transforms/VectorTransforms.h"
#include "mlir/Dialect/Vector/Utils/VectorUtils.h"
#include "mlir/IR/Builders.h"
#include "mlir/IR/ImplicitLocOpBuilder.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
#include "mlir/Transforms/Passes.h"

namespace mlir {
#define GEN_PASS_DEF_CONVERTVECTORTOSCF
#include "mlir/Conversion/Passes.h.inc"
} // namespace mlir

using namespace mlir;
using vector::TransferReadOp;
using vector::TransferWriteOp;

namespace {

/// Attribute name used for labeling transfer ops during progressive lowering.
static const char kPassLabel[] = "__vector_to_scf_lowering__";

/// Return true if this transfer op operates on a source tensor.
static bool isTensorOp(VectorTransferOpInterface xferOp) {}
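// A hedged sketch of one plausible body, assuming the interface exposes
// getShapedType() (the upstream implementation may differ):
//
//   return isa<RankedTensorType>(xferOp.getShapedType());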

/// Patterns that inherit from this struct have access to
/// VectorTransferToSCFOptions.
template <typename OpTy>
struct VectorToSCFPattern : public OpRewritePattern<OpTy> {};
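// Hedged sketch of the members this base likely carries (the constructor
// signature is an assumption):
//
//   explicit VectorToSCFPattern(MLIRContext *ctx,
//                               VectorTransferToSCFOptions opt)
//       : OpRewritePattern<OpTy>(ctx), options(opt) {}
//   VectorTransferToSCFOptions options;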

/// Given a vector transfer op, calculate which dimension of the `source`
/// memref should be unpacked in the next application of TransferOpConversion.
/// A return value of std::nullopt indicates a broadcast.
template <typename OpTy>
static std::optional<int64_t> unpackedDim(OpTy xferOp) {}
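// Hedged sketch: the unpacked dim is the memref dim referenced by the first
// result of the permutation map; any other expression marks a broadcast.
//
//   auto map = xferOp.getPermutationMap();
//   if (auto expr = dyn_cast<AffineDimExpr>(map.getResult(0)))
//     return expr.getPosition();
//   return std::nullopt; // First result is not a dim expr: broadcast.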

/// Compute the permutation map for the new (N-1)-D vector transfer op. This
/// map is identical to the current permutation map, but the first result is
/// omitted.
template <typename OpTy>
static AffineMap unpackedPermutationMap(OpBuilder &b, OpTy xferOp) {}
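// Hedged sketch (upstream may differ): rebuild the map without its first
// result.
//
//   auto map = xferOp.getPermutationMap();
//   return AffineMap::get(map.getNumDims(), /*symbolCount=*/0,
//                         map.getResults().drop_front(), b.getContext());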

/// Calculate the indices for the new vector transfer op.
///
/// E.g.: transfer_read %A[%a, %b, %c, %d] ... : vector<5x4x3xf32> ...
///       --> transfer_read %A[%a, %b + iv, %c, %d] ... vector<4x3xf32>
///                                 ^^^^^^
///              `iv` is the iteration variable of the (new) surrounding loop.
template <typename OpTy>
static void getXferIndices(OpBuilder &b, OpTy xferOp, Value iv,
                           SmallVector<Value, 8> &indices) {}
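// Hedged sketch: copy the old indices, then add `iv` to the index of the
// unpacked dim via an affine.apply (broadcast dims need no adjustment).
//
//   indices.append(xferOp.getIndices().begin(), xferOp.getIndices().end());
//   if (auto dim = unpackedDim(xferOp)) {
//     AffineExpr d0, d1;
//     bindDims(xferOp.getContext(), d0, d1);
//     indices[*dim] = affine::makeComposedAffineApply(
//         b, xferOp.getLoc(), d0 + d1, {indices[*dim], iv});
//   }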

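/// Generate an scf.yield that forwards `value` when the surrounding region
/// has a result (`hasRetVal` is true) and yields nothing otherwise.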
static void maybeYieldValue(OpBuilder &b, Location loc, bool hasRetVal,
                            Value value) {}

/// Generates a boolean Value that is true if the iv-th bit in xferOp's mask
/// is set to true. No such check is generated under the following
/// circumstances:
/// * xferOp does not have a mask.
/// * xferOp's mask is not 1D. (In case of (N>1)-D, a subvector of the mask is
///   computed and attached to the new transfer op in the pattern.)
/// * The to-be-unpacked dim of xferOp is a broadcast.
template <typename OpTy>
static Value generateMaskCheck(OpBuilder &b, OpTy xferOp, Value iv) {}
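// Hedged sketch following the doc comment above (upstream may differ):
//
//   if (!xferOp.getMask() || xferOp.getMaskType().getRank() != 1 ||
//       xferOp.isBroadcastDim(0))
//     return Value(); // No check needed.
//   return b.create<vector::ExtractElementOp>(xferOp.getLoc(),
//                                             xferOp.getMask(), iv);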

/// Helper function for TransferOpConversion and TransferOp1dConversion.
/// Generate an in-bounds check if the transfer op may go out-of-bounds on the
/// specified dimension `dim` with the loop iteration variable `iv`.
/// E.g., when unpacking dimension 0 from:
/// ```
/// %vec = vector.transfer_read %A[%a, %b] %cst
///     : vector<5x4xf32>, memref<?x?xf32>
/// ```
/// An if check similar to this will be generated inside the loop:
/// ```
/// %d = memref.dim %A, %c0 : memref<?x?xf32>
/// if (%a + iv < %d) {
///   (in-bounds case)
/// } else {
///   (out-of-bounds case)
/// }
/// ```
///
/// If the transfer is 1D and has a mask, this function generates a more
/// complex check that also accounts for potentially masked-out elements.
///
/// This function variant returns the value returned by `inBoundsCase` or
/// `outOfBoundsCase`. The MLIR type of the return value must be specified in
/// `resultTypes`.
template <typename OpTy>
static Value generateInBoundsCheck(
    OpBuilder &b, OpTy xferOp, Value iv, std::optional<int64_t> dim,
    TypeRange resultTypes,
    function_ref<Value(OpBuilder &, Location)> inBoundsCase,
    function_ref<Value(OpBuilder &, Location)> outOfBoundsCase = nullptr) {}
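// Hedged sketch of the scf.if scaffolding (condition construction and mask
// folding are elided; treat the names as illustrative):
//
//   Value cond = ...; // %a + iv < memref.dim, combined with the mask bit.
//   auto ifOp = b.create<scf::IfOp>(xferOp.getLoc(), resultTypes, cond,
//                                   /*withElseRegion=*/true);
//   // Emit inBoundsCase into the then block and outOfBoundsCase into the
//   // else block, terminating each with maybeYieldValue().
//   return resultTypes.empty() ? Value() : ifOp.getResult(0);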

/// In this function variant, `inBoundsCase` and `outOfBoundsCase` do not have
/// a return value. Consequently, this function does not have a return value.
template <typename OpTy>
static void generateInBoundsCheck(
    OpBuilder &b, OpTy xferOp, Value iv, std::optional<int64_t> dim,
    function_ref<void(OpBuilder &, Location)> inBoundsCase,
    function_ref<void(OpBuilder &, Location)> outOfBoundsCase = nullptr) {}

/// Given an ArrayAttr, return a copy where the first element is dropped.
static ArrayAttr dropFirstElem(OpBuilder &b, ArrayAttr attr) {}
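// Hedged sketch:
//
//   return ArrayAttr::get(b.getContext(), attr.getValue().drop_front());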

/// Add the pass label to a vector transfer op if its rank is not the target
/// rank.
template <typename OpTy>
static void maybeApplyPassLabel(OpBuilder &b, OpTy newXferOp,
                                unsigned targetRank) {}
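// Hedged sketch: label the op only while it still needs further unpacking.
//
//   if (newXferOp.getVectorType().getRank() > targetRank)
//     newXferOp->setAttr(kPassLabel, b.getUnitAttr());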

namespace lowering_n_d {

/// Helper data structure for data and mask buffers.
struct BufferAllocs {};
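// Presumably two members, matching allocBuffers() below (an assumption):
//
//   Value dataBuffer;
//   Value maskBuffer;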

// TODO: Parallelism and threadlocal considerations with a ParallelScope trait.
static Operation *getAutomaticAllocationScope(Operation *op) {}
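// Hedged sketch: walk up to the closest op carrying the
// AutomaticAllocationScope trait, under which alloca'd buffers are hoisted.
//
//   Operation *scope =
//       op->getParentWithTrait<OpTrait::AutomaticAllocationScope>();
//   assert(scope && "expected op to be inside an allocation scope");
//   return scope;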

/// Allocate temporary buffers for data (vector) and mask (if present).
template <typename OpTy>
static BufferAllocs allocBuffers(OpBuilder &b, OpTy xferOp) {}
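// Hedged sketch (insertion-point bookkeeping elided): alloca a 0-D buffer of
// the vector type at the top of the allocation scope, plus one for the mask.
//
//   auto bufferType = MemRefType::get({}, xferOp.getVectorType());
//   result.dataBuffer = b.create<memref::AllocaOp>(loc, bufferType);
//   if (xferOp.getMask()) {
//     auto maskType = MemRefType::get({}, xferOp.getMask().getType());
//     ... // Alloca, store the mask, and load it back near the xferOp.
//   }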

/// Given a MemRefType with VectorType element type, unpack one dimension from
/// the VectorType into the MemRefType.
///
/// E.g.: memref<9xvector<5x6xf32>> --> memref<9x5xvector<6xf32>>
static FailureOr<MemRefType> unpackOneDim(MemRefType type) {}
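// Hedged sketch: append the leading vector dim to the memref shape and drop
// it from the element type (a scalable leading dim would require a dynamic
// memref dim and is rejected).
//
//   auto vectorType = dyn_cast<VectorType>(type.getElementType());
//   if (vectorType.getScalableDims().front())
//     return failure();
//   SmallVector<int64_t> newShape(type.getShape().begin(),
//                                 type.getShape().end());
//   newShape.push_back(vectorType.getDimSize(0));
//   return MemRefType::get(newShape,
//                          VectorType::Builder(vectorType).dropDim(0));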

/// Given a transfer op, find the memref from which the mask is loaded. This
/// is similar to Strategy<TransferWriteOp>::getBuffer.
template <typename OpTy>
static Value getMaskBuffer(OpTy xferOp) {}

/// Codegen strategy, depending on the operation.
template <typename OpTy>
struct Strategy;

/// Codegen strategy for vector TransferReadOp.
template <>
struct Strategy<TransferReadOp> {};

/// Codegen strategy for vector TransferWriteOp.
template <>
struct Strategy<TransferWriteOp> {};

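/// Check whether the transfer op qualifies for preparation, e.g., that it
/// does not already carry the pass label and that its vector rank exceeds
/// options.targetRank.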
template <typename OpTy>
LogicalResult checkPrepareXferOp(OpTy xferOp,
                                 VectorTransferToSCFOptions options) {}

/// Prepare a TransferReadOp for progressive lowering.
///
/// 1. Allocate a temporary buffer.
/// 2. Label the TransferReadOp, marking it eligible for progressive lowering.
/// 3. Store the result of the TransferReadOp into the temporary buffer.
/// 4. Load the result from the temporary buffer and replace all uses of the
///    original TransferReadOp with this load.
///
/// E.g.:
/// ```
/// %vec = vector.transfer_read %A[%a, %b, %c], %cst
///     : vector<5x4xf32>, memref<?x?x?xf32>
/// ```
/// is rewritten to:
/// ```
/// %0 = memref.alloca() : memref<vector<5x4xf32>>
/// %1 = vector.transfer_read %A[%a, %b, %c], %cst
///     { __vector_to_scf_lowering__ } : vector<5x4xf32>, memref<?x?x?xf32>
/// memref.store %1, %0[] : memref<vector<5x4xf32>>
/// %vec = memref.load %0[] : memref<vector<5x4xf32>>
/// ```
///
/// Note: A second temporary buffer may be allocated for the `mask` operand.
struct PrepareTransferReadConversion
    : public VectorToSCFPattern<TransferReadOp> {};

/// Prepare a TransferWriteOp for progressive lowering.
///
/// 1. Allocate a temporary buffer.
/// 2. Store the vector into the buffer.
/// 3. Load the vector from the buffer again.
/// 4. Use the loaded vector as a TransferWriteOp operand and label the op,
///    marking it eligible for progressive lowering via TransferOpConversion.
///
/// E.g.:
/// ```
/// vector.transfer_write %vec, %A[%a, %b, %c]
///     : vector<5x4xf32>, memref<?x?x?xf32>
/// ```
/// is rewritten to:
/// ```
/// %0 = memref.alloca() : memref<vector<5x4xf32>>
/// memref.store %vec, %0[] : memref<vector<5x4xf32>>
/// %1 = memref.load %0[] : memref<vector<5x4xf32>>
/// vector.transfer_write %1, %A[%a, %b, %c] { __vector_to_scf_lowering__ }
///     : vector<5x4xf32>, memref<?x?x?xf32>
/// ```
///
/// Note: A second temporary buffer may be allocated for the `mask` operand.
struct PrepareTransferWriteConversion
    : public VectorToSCFPattern<TransferWriteOp> {};

/// Decompose an n-D PrintOp into a loop of elementary/scalar prints. This
/// allows printing both 1D scalable vectors and n-D fixed-size vectors.
///
/// E.g.:
/// ```
/// vector.print %v : vector<[4]xi32>
/// ```
/// is rewritten to:
/// ```
/// %c0 = arith.constant 0 : index
/// %c4 = arith.constant 4 : index
/// %c1 = arith.constant 1 : index
/// %vscale = vector.vscale
/// %length = arith.muli %vscale, %c4 : index
/// %lastIndex = arith.subi %length, %c1 : index
/// vector.print punctuation <open>
/// scf.for %i = %c0 to %length step %c1 {
///   %el = vector.extractelement %v[%i : index] : vector<[4]xi32>
///   vector.print %el : i32 punctuation <no_punctuation>
///   %notLastIndex = arith.cmpi ult, %i, %lastIndex : index
///   scf.if %notLastIndex {
///     vector.print punctuation <comma>
///   }
/// }
/// vector.print punctuation <close>
/// vector.print
/// ```
struct DecomposePrintOpConversion
    : public VectorToSCFPattern<vector::PrintOp> {};

/// Progressive lowering of vector transfer ops: Unpack one dimension.
///
/// 1. Unpack one dimension from the current buffer type and cast the buffer
///    to that new type. E.g.:
///    ```
///    %vec = memref.load %0[%1] : memref<5xvector<4x3xf32>>
///    vector.transfer_write %vec ...
///    ```
///    The following cast is generated:
///    ```
///    %casted = vector.type_cast %0
///        : memref<5xvector<4x3xf32>> to memref<5x4xvector<3xf32>>
///    ```
/// 2. Generate a for loop and rewrite the transfer op according to the
///    corresponding Strategy<OpTy>. If the to-be-unpacked dimension can be
///    out-of-bounds, generate an if-check and handle both cases separately.
/// 3. Clean up according to the corresponding Strategy<OpTy>.
///
/// Note: If the transfer op is a TransferWriteOp and operates on a tensor
/// source (as opposed to a memref source), then each iteration of the generated
/// scf.for loop yields the new tensor value. E.g.:
/// ```
/// %result = scf.for i = 0 to 5 {
///   %0 = memref.load %buffer[i] : memref<5xvector<4x3xf32>>
///   %1 = vector.transfer_write %0, %source[...]
///       : vector<4x3xf32>, tensor<5x4x3xf32>
///   scf.yield %1 : tensor<5x4x3xf32>
/// }
/// ```
template <typename OpTy>
struct TransferOpConversion : public VectorToSCFPattern<OpTy> {};

/// Retrieves the dimension sizes of a mask. Currently supports CreateMaskOp
/// and ConstantMaskOp.
template <typename VscaleConstantBuilder>
static FailureOr<SmallVector<OpFoldResult>>
getMaskDimSizes(Value mask, VscaleConstantBuilder &createVscaleMultiple) {}
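// Hedged sketch for the two supported cases (scalable-dim scaling via
// createVscaleMultiple is elided):
//
//   if (auto createMask = mask.getDefiningOp<vector::CreateMaskOp>())
//     return llvm::map_to_vector(createMask.getOperands(),
//                                [](Value v) { return OpFoldResult(v); });
//   if (auto constantMask = mask.getDefiningOp<vector::ConstantMaskOp>())
//     ... // Wrap the static mask dim sizes as OpFoldResults.
//   return failure();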

/// Scalable vector lowering of transfer_write(transpose). This lowering only
/// supports rank 2 (scalable) vectors, but can be used in conjunction with
/// `UnrollTransferWriteConversion` to support n-D cases. The unroll conversion
/// unrolls until the first scalable dimension.
///
/// Example:
///
/// BEFORE:
/// ```mlir
/// %transpose = vector.transpose %vec, [1, 0]
///    : vector<4x[4]xf32> to vector<[4]x4xf32>
/// vector.transfer_write %transpose, %dest[%i, %j] {in_bounds = [true, true]}
///    : vector<[4]x4xf32>, memref<?x?xf32>
/// ```
///
/// AFTER:
/// ```mlir
/// %c1 = arith.constant 1 : index
/// %c4 = arith.constant 4 : index
/// %c0 = arith.constant 0 : index
/// %0 = vector.extract %arg0[0] : vector<[4]xf32> from vector<4x[4]xf32>
/// %1 = vector.extract %arg0[1] : vector<[4]xf32> from vector<4x[4]xf32>
/// %2 = vector.extract %arg0[2] : vector<[4]xf32> from vector<4x[4]xf32>
/// %3 = vector.extract %arg0[3] : vector<[4]xf32> from vector<4x[4]xf32>
/// %vscale = vector.vscale
/// %c4_vscale = arith.muli %vscale, %c4 : index
/// scf.for %idx = %c0 to %c4_vscale step %c1 {
///   %4 = vector.extract %0[%idx] : f32 from vector<[4]xf32>
///   %5 = vector.extract %1[%idx] : f32 from vector<[4]xf32>
///   %6 = vector.extract %2[%idx] : f32 from vector<[4]xf32>
///   %7 = vector.extract %3[%idx] : f32 from vector<[4]xf32>
///   %slice_i = affine.apply #map(%idx)[%i]
///   %slice = vector.from_elements %4, %5, %6, %7 : vector<4xf32>
///   vector.transfer_write %slice, %arg1[%slice_i, %j] {in_bounds = [true]}
///     : vector<4xf32>, memref<?x?xf32>
/// }
/// ```
struct ScalableTransposeTransferWriteConversion
    : VectorToSCFPattern<vector::TransferWriteOp> {};

} // namespace lowering_n_d

namespace lowering_n_d_unrolled {

/// If the original transfer op has a mask, compute the mask of the new transfer
/// op (for the current iteration `i`) and assign it.
template <typename OpTy>
static void maybeAssignMask(OpBuilder &b, OpTy xferOp, OpTy newXferOp,
                            int64_t i) {}
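// Hedged sketch: forward the mask unchanged for broadcast dims; otherwise,
// extract the i-th mask subvector for the new transfer op (rank-1 masks are
// handled via generateMaskCheck instead).
//
//   if (!xferOp.getMask())
//     return;
//   if (xferOp.isBroadcastDim(0)) {
//     newXferOp.getMaskMutable().assign(xferOp.getMask());
//     return;
//   }
//   if (xferOp.getMaskType().getRank() > 1) {
//     auto newMask = b.create<vector::ExtractOp>(
//         xferOp.getLoc(), xferOp.getMask(), ArrayRef<int64_t>{i});
//     newXferOp.getMaskMutable().assign(newMask);
//   }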

/// Progressive lowering of vector TransferReadOp with unrolling: Unpack one
/// dimension. This is similar to TransferOpConversion<TransferReadOp>, but no
/// memref buffer is allocated and the SCF loop is fully unrolled.
///
/// E.g.:
/// ```
/// %vec = vector.transfer_read %A[%a, %b, %c], %padding
///     : memref<?x?x?xf32>, vector<5x4xf32>
/// ```
/// is rewritten to IR such as (simplified):
/// ```
/// %v_init = splat %padding : vector<5x4xf32>
/// %tmp0 = vector.transfer_read %A[%a, %b, %c], %padding
///     : memref<?x?x?xf32>, vector<4xf32>
/// %v0 = vector.insert %tmp0, %v_init[0] : vector<4xf32> into vector<5x4xf32>
/// %tmp1 = vector.transfer_read %A[%a, %b + 1, %c], %padding
///     : memref<?x?x?xf32>, vector<4xf32>
/// %v1 = vector.insert %tmp1, %v0[1] : vector<4xf32> into vector<5x4xf32>
/// ...
/// %tmp4 = vector.transfer_read %A[%a, %b + 4, %c], %padding
///     : memref<?x?x?xf32>, vector<4xf32>
/// %vec = vector.insert %tmp4, %v3[4] : vector<4xf32> into vector<5x4xf32>
/// ```
///
/// Note: As an optimization, if the result of the original TransferReadOp
/// was directly inserted into another vector, no new %v_init vector is created.
/// Instead, the new TransferReadOp results are inserted into that vector.
struct UnrollTransferReadConversion
    : public VectorToSCFPattern<TransferReadOp> {};

/// Progressive lowering of vector TransferWriteOp with unrolling: Unpack one
/// dimension. This is similar to TransferOpConversion<TransferWriteOp>, but no
/// memref buffer is allocated and the SCF loop is fully unrolled.
///
/// E.g.:
/// ```
/// vector.transfer_write %vec, %A[%a, %b, %c]
///     : vector<5x4xf32>, memref<?x?x?xf32>
/// ```
/// is rewritten to IR such as (simplified):
/// ```
/// %v0 = vector.extract %vec[0] : vector<4xf32> from vector<5x4xf32>
/// vector.transfer_write %v0, %A[%a, %b, %c] : vector<4xf32>, memref<...>
/// %v1 = vector.extract %vec[1] : vector<4xf32> from vector<5x4xf32>
/// vector.transfer_write %v1, %A[%a, %b + 1, %c] : vector<4xf32>, memref<...>
/// ...
/// %v4 = vector.extract %vec[4] : vector<4xf32> from vector<5x4xf32>
/// vector.transfer_write %v4, %A[%a, %b + 4, %c] : vector<4xf32>, memref<...>
/// ```
///
/// Note: As an optimization, if the vector of the original TransferWriteOp
/// was directly extracted from another vector via an ExtractOp `a`, extract
/// the vectors for the newly generated TransferWriteOps from `a`'s input. By
/// doing so, `a` may become dead, and the number of ExtractOps generated during
/// recursive application of this pattern will be minimal.
struct UnrollTransferWriteConversion
    : public VectorToSCFPattern<TransferWriteOp> {};

} // namespace lowering_n_d_unrolled

namespace lowering_1_d {

/// Compute the indices into the memref for the LoadOp/StoreOp generated as
/// part of TransferOp1dConversion. Return the memref dimension on which
/// the transfer is operating. A return value of std::nullopt indicates a
/// broadcast.
template <typename OpTy>
static std::optional<int64_t>
get1dMemrefIndices(OpBuilder &b, OpTy xferOp, Value iv,
                   SmallVector<Value, 8> &memrefIndices) {}
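// Hedged sketch: like getXferIndices(), but operating on the memref indices
// and returning the transfer dimension (std::nullopt for broadcasts).
//
//   auto map = xferOp.getPermutationMap();
//   memrefIndices.append(xferOp.getIndices().begin(),
//                        xferOp.getIndices().end());
//   if (auto expr = dyn_cast<AffineDimExpr>(map.getResult(0))) {
//     int64_t dim = expr.getPosition();
//     AffineExpr d0, d1;
//     bindDims(xferOp.getContext(), d0, d1);
//     memrefIndices[dim] = affine::makeComposedAffineApply(
//         b, xferOp.getLoc(), d0 + d1, {memrefIndices[dim], iv});
//     return dim;
//   }
//   return std::nullopt;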

/// Codegen strategy for TransferOp1dConversion, depending on the
/// operation.
template <typename OpTy>
struct Strategy1d;

/// Codegen strategy for TransferReadOp.
template <>
struct Strategy1d<TransferReadOp> {};

/// Codegen strategy for TransferWriteOp.
template <>
struct Strategy1d<TransferWriteOp> {};

/// Lower a 1D vector transfer op to SCF using scalar loads/stores. This is
/// necessary in cases where a 1D vector transfer op cannot be lowered into
/// vector loads/stores due to non-unit strides or broadcasts:
///
/// * Transfer dimension is not the last memref dimension
/// * Transfer dimension is a broadcast (i.e., scalar load + broadcast)
/// * Memref has a layout map with non-unit stride on the last dimension
///
/// This pattern generates IR as follows:
///
/// 1. Generate a for loop iterating over each vector element.
/// 2. Inside the loop, generate an InsertElementOp or ExtractElementOp,
///    depending on OpTy.
///
/// TODO: In some cases (no masking, etc.), LLVM::MatrixColumnMajorLoadOp
///       can be generated instead of TransferOp1dConversion. Add such a pattern
///       to ConvertVectorToLLVM.
///
/// E.g.:
/// ```
/// vector.transfer_write %vec, %A[%a, %b]
///    {permutation_map = affine_map<(d0, d1) -> (d0)>, in_bounds = [true]}
///    : vector<9xf32>, memref<?x?xf32>
/// ```
/// Is rewritten to approximately the following pseudo-IR:
/// ```
/// for i = 0 to 9 {
///   %t = vector.extractelement %vec[i] : vector<9xf32>
///   memref.store %t, %arg0[%a + i, %b] : memref<?x?xf32>
/// }
/// ```
template <typename OpTy>
struct TransferOp1dConversion : public VectorToSCFPattern<OpTy> {};

} // namespace lowering_1_d
} // namespace

void mlir::populateVectorToSCFConversionPatterns(
    RewritePatternSet &patterns, const VectorTransferToSCFOptions &options) {}
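// Hedged sketch of the registration logic (pattern constructor signatures and
// option names such as `unroll`/`targetRank` are assumptions):
//
//   if (options.unroll) {
//     patterns.add<lowering_n_d_unrolled::UnrollTransferReadConversion,
//                  lowering_n_d_unrolled::UnrollTransferWriteConversion>(
//         patterns.getContext(), options);
//   } else {
//     patterns.add<lowering_n_d::PrepareTransferReadConversion,
//                  lowering_n_d::PrepareTransferWriteConversion,
//                  lowering_n_d::TransferOpConversion<TransferReadOp>,
//                  lowering_n_d::TransferOpConversion<TransferWriteOp>>(
//         patterns.getContext(), options);
//   }
//   if (options.targetRank == 1)
//     patterns.add<lowering_1_d::TransferOp1dConversion<TransferReadOp>,
//                  lowering_1_d::TransferOp1dConversion<TransferWriteOp>>(
//         patterns.getContext(), options);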

namespace {

struct ConvertVectorToSCFPass
    : public impl::ConvertVectorToSCFBase<ConvertVectorToSCFPass> {};

} // namespace

std::unique_ptr<Pass>
mlir::createConvertVectorToSCFPass(const VectorTransferToSCFOptions &options) {}
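// Hedged sketch (assuming the pass has an options constructor):
//
//   return std::make_unique<ConvertVectorToSCFPass>(options);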