//===- VectorToSCF.cpp - Convert vector to SCF dialect ----------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements lowering of vector transfer operations to SCF.
//
//===----------------------------------------------------------------------===//

#include <numeric>
#include <optional>
#include <type_traits>

#include "mlir/Conversion/VectorToSCF/VectorToSCF.h"

#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Dialect/SCF/IR/SCF.h"
#include "mlir/Dialect/Tensor/IR/Tensor.h"
#include "mlir/Dialect/Vector/IR/VectorOps.h"
#include "mlir/Dialect/Vector/Transforms/LoweringPatterns.h"
#include "mlir/Dialect/Vector/Transforms/VectorTransforms.h"
#include "mlir/Dialect/Vector/Utils/VectorUtils.h"
#include "mlir/IR/Builders.h"
#include "mlir/IR/ImplicitLocOpBuilder.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
#include "mlir/Transforms/Passes.h"

namespace mlir {
#define GEN_PASS_DEF_CONVERTVECTORTOSCF
#include "mlir/Conversion/Passes.h.inc"
} // namespace mlir

using namespace mlir;
using vector::TransferReadOp;
using vector::TransferWriteOp;

namespace {

/// Attribute name used for labeling transfer ops during progressive lowering.
static const char kPassLabel[] = …;

/// Return true if this transfer op operates on a source tensor.
static bool isTensorOp(VectorTransferOpInterface xferOp) { … }

/// Patterns that inherit from this struct have access to
/// VectorTransferToSCFOptions.
template <typename OpTy>
struct VectorToSCFPattern : public OpRewritePattern<OpTy> { … };

/// Given a vector transfer op, calculate which dimension of the `source`
/// memref should be unpacked in the next application of TransferOpConversion.
/// A return value of std::nullopt indicates a broadcast.
template <typename OpTy>
static std::optional<int64_t> unpackedDim(OpTy xferOp) { … }

/// Compute the permutation map for the new (N-1)-D vector transfer op. This
/// map is identical to the current permutation map, but the first result is
/// omitted.
template <typename OpTy>
static AffineMap unpackedPermutationMap(OpBuilder &b, OpTy xferOp) { … }

/// Calculate the indices for the new vector transfer op.
///
/// E.g.: transfer_read %A[%a, %b, %c, %d] ... : vector<5x4x3xf32> ...
///       --> transfer_read %A[%a, %b + iv, %c, %d] ... vector<4x3xf32>
///                             ^^^^^^
///             `iv` is the iteration variable of the (new) surrounding loop.
template <typename OpTy>
static void getXferIndices(OpBuilder &b, OpTy xferOp, Value iv,
                           SmallVector<Value, 8> &indices) { … }

static void maybeYieldValue(OpBuilder &b, Location loc, bool hasRetVal,
                            Value value) { … }

/// Generates a boolean Value that is true if the iv-th bit in xferOp's mask
/// is set to true. No such check is generated under the following
/// circumstances:
/// * xferOp does not have a mask.
/// * xferOp's mask is not 1D. (In case of (N>1)-D, a subvector of the mask is
///   computed and attached to the new transfer op in the pattern.)
/// * The to-be-unpacked dim of xferOp is a broadcast.
template <typename OpTy>
static Value generateMaskCheck(OpBuilder &b, OpTy xferOp, Value iv) { … }
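
// Illustrative sketch only (hypothetical helper, not the elided
// unpackedPermutationMap above): omitting the first result of a permutation
// map amounts to rebuilding the map from its remaining results.
static AffineMap dropLeadingMapResultSketch(OpBuilder &b, AffineMap map) {
  // Keep the dim/symbol counts; drop only the leading result expression.
  return AffineMap::get(map.getNumDims(), map.getNumSymbols(),
                        map.getResults().drop_front(), b.getContext());
}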
/// Helper function for TransferOpConversion and TransferOp1dConversion.
/// Generate an in-bounds check if the transfer op may go out-of-bounds on the
/// specified dimension `dim` with the loop iteration variable `iv`.
/// E.g., when unpacking dimension 0 from:
/// ```
/// %vec = vector.transfer_read %A[%a, %b] %cst
///     : vector<5x4xf32>, memref<?x?xf32>
/// ```
/// An if check similar to this will be generated inside the loop:
/// ```
/// %d = memref.dim %A, %c0 : memref<?x?xf32>
/// if (%a + iv < %d) {
///   (in-bounds case)
/// } else {
///   (out-of-bounds case)
/// }
/// ```
///
/// If the transfer is 1D and has a mask, this function generates a more
/// complex check that also accounts for potentially masked-out elements.
///
/// This function variant returns the value returned by `inBoundsCase` or
/// `outOfBoundsCase`. The MLIR type of the return value must be specified in
/// `resultTypes`.
template <typename OpTy>
static Value generateInBoundsCheck(
    OpBuilder &b, OpTy xferOp, Value iv, std::optional<int64_t> dim,
    TypeRange resultTypes,
    function_ref<Value(OpBuilder &, Location)> inBoundsCase,
    function_ref<Value(OpBuilder &, Location)> outOfBoundsCase = nullptr) { … }

/// In this function variant, `inBoundsCase` and `outOfBoundsCase` do not have
/// a return value. Consequently, this function does not have a return value.
template <typename OpTy>
static void generateInBoundsCheck(
    OpBuilder &b, OpTy xferOp, Value iv, std::optional<int64_t> dim,
    function_ref<void(OpBuilder &, Location)> inBoundsCase,
    function_ref<void(OpBuilder &, Location)> outOfBoundsCase = nullptr) { … }

/// Given an ArrayAttr, return a copy where the first element is dropped.
static ArrayAttr dropFirstElem(OpBuilder &b, ArrayAttr attr) { … }

/// Add the pass label to a vector transfer op if its rank is not the target
/// rank.
template <typename OpTy>
static void maybeApplyPassLabel(OpBuilder &b, OpTy newXferOp,
                                unsigned targetRank) { … }

namespace lowering_n_d {

/// Helper data structure for data and mask buffers.
struct BufferAllocs { … };

// TODO: Parallelism and threadlocal considerations with a ParallelScope trait.
static Operation *getAutomaticAllocationScope(Operation *op) { … }

/// Allocate temporary buffers for data (vector) and mask (if present).
template <typename OpTy>
static BufferAllocs allocBuffers(OpBuilder &b, OpTy xferOp) { … }

/// Given a MemRefType with VectorType element type, unpack one dimension from
/// the VectorType into the MemRefType.
///
/// E.g.: memref<9xvector<5x6xf32>> --> memref<9x5xvector<6xf32>>
static FailureOr<MemRefType> unpackOneDim(MemRefType type) { … }

/// Given a transfer op, find the memref from which the mask is loaded. This
/// is similar to Strategy<TransferWriteOp>::getBuffer.
template <typename OpTy>
static Value getMaskBuffer(OpTy xferOp) { … }

/// Codegen strategy, depending on the operation.
template <typename OpTy>
struct Strategy;

/// Codegen strategy for vector TransferReadOp.
template <>
struct Strategy<TransferReadOp> { … };

/// Codegen strategy for vector TransferWriteOp.
template <>
struct Strategy<TransferWriteOp> { … };

template <typename OpTy>
LogicalResult checkPrepareXferOp(OpTy xferOp,
                                 VectorTransferToSCFOptions options) { … }
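
// Illustrative sketch only (hypothetical helper, not the elided unpackOneDim
// above): peel the leading dimension off the vector element type and append
// it to the memref shape, e.g. memref<9xvector<5x6xf32>> -->
// memref<9x5xvector<6xf32>>. Scalable dims, layout, and memory space
// propagation, which the real helper must handle, are ignored here.
static MemRefType unpackOneDimSketch(MemRefType type) {
  auto vectorType = cast<VectorType>(type.getElementType());
  // New memref shape: old shape plus the leading vector dim (9 -> 9x5).
  SmallVector<int64_t> memrefShape = llvm::to_vector(type.getShape());
  memrefShape.push_back(vectorType.getShape().front());
  // New element type: the vector with its leading dim dropped (5x6 -> 6).
  auto elemType = VectorType::get(vectorType.getShape().drop_front(),
                                  vectorType.getElementType());
  return MemRefType::get(memrefShape, elemType);
}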
/// Prepare a TransferReadOp for progressive lowering.
///
/// 1. Allocate a temporary buffer.
/// 2. Label the TransferReadOp, marking it eligible for progressive lowering.
/// 3. Store the result of the TransferReadOp into the temporary buffer.
/// 4. Load the result from the temporary buffer and replace all uses of the
///    original TransferReadOp with this load.
///
/// E.g.:
/// ```
/// %vec = vector.transfer_read %A[%a, %b, %c], %cst
///     : vector<5x4xf32>, memref<?x?x?xf32>
/// ```
/// is rewritten to:
/// ```
/// %0 = memref.alloca() : memref<vector<5x4xf32>>
/// %1 = vector.transfer_read %A[%a, %b, %c], %cst
///     { __vector_to_scf_lowering__ } : vector<5x4xf32>, memref<?x?x?xf32>
/// memref.store %1, %0[] : memref<vector<5x4xf32>>
/// %vec = memref.load %0[] : memref<vector<5x4xf32>>
/// ```
///
/// Note: A second temporary buffer may be allocated for the `mask` operand.
struct PrepareTransferReadConversion
    : public VectorToSCFPattern<TransferReadOp> { … };

/// Prepare a TransferWriteOp for progressive lowering.
///
/// 1. Allocate a temporary buffer.
/// 2. Store the vector into the buffer.
/// 3. Load the vector from the buffer again.
/// 4. Use the loaded vector as a TransferWriteOp operand and label the op,
///    marking it eligible for progressive lowering via TransferOpConversion.
///
/// E.g.:
/// ```
/// vector.transfer_write %vec, %A[%a, %b, %c]
///     : vector<5x4xf32>, memref<?x?x?xf32>
/// ```
/// is rewritten to:
/// ```
/// %0 = memref.alloca() : memref<vector<5x4xf32>>
/// memref.store %vec, %0[] : memref<vector<5x4xf32>>
/// %1 = memref.load %0[] : memref<vector<5x4xf32>>
/// vector.transfer_write %1, %A[%a, %b, %c] { __vector_to_scf_lowering__ }
///     : vector<5x4xf32>, memref<?x?x?xf32>
/// ```
///
/// Note: A second temporary buffer may be allocated for the `mask` operand.
struct PrepareTransferWriteConversion
    : public VectorToSCFPattern<TransferWriteOp> { … };
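
// Illustrative sketch only (hypothetical helper, not part of the prepare
// patterns above): both patterns round-trip the transferred vector through a
// rank-0 buffer of vector type, roughly like this.
static Value passVectorThroughBufferSketch(OpBuilder &b, Location loc,
                                           Value vec) {
  // %buffer = memref.alloca() : memref<vector<...>>
  auto bufferType = MemRefType::get({}, vec.getType());
  Value buffer = b.create<memref::AllocaOp>(loc, bufferType);
  // memref.store %vec, %buffer[]  followed by  memref.load %buffer[]
  b.create<memref::StoreOp>(loc, vec, buffer);
  return b.create<memref::LoadOp>(loc, buffer);
}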
/// Decompose an n-D PrintOp into a loop of elementary/scalar prints. This
/// allows printing both 1D scalable vectors and n-D fixed size vectors.
///
/// E.g.:
/// ```
/// vector.print %v : vector<[4]xi32>
/// ```
/// is rewritten to:
/// ```
/// %c0 = arith.constant 0 : index
/// %c4 = arith.constant 4 : index
/// %c1 = arith.constant 1 : index
/// %vscale = vector.vscale
/// %length = arith.muli %vscale, %c4 : index
/// %lastIndex = arith.subi %length, %c1 : index
/// vector.print punctuation <open>
/// scf.for %i = %c0 to %length step %c1 {
///   %el = vector.extractelement %v[%i : index] : vector<[4]xi32>
///   vector.print %el : i32 punctuation <no_punctuation>
///   %notLastIndex = arith.cmpi ult, %i, %lastIndex : index
///   scf.if %notLastIndex {
///     vector.print punctuation <comma>
///   }
/// }
/// vector.print punctuation <close>
/// vector.print
/// ```
struct DecomposePrintOpConversion : public VectorToSCFPattern<vector::PrintOp> { … };

/// Progressive lowering of vector transfer ops: Unpack one dimension.
///
/// 1. Unpack one dimension from the current buffer type and cast the buffer
///    to that new type. E.g.:
///    ```
///    %vec = memref.load %0[%1] : memref<5xvector<4x3xf32>>
///    vector.transfer_write %vec ...
///    ```
///    The following cast is generated:
///    ```
///    %casted = vector.type_cast %0
///        : memref<5xvector<4x3xf32>> to memref<5x4xvector<3xf32>>
///    ```
/// 2. Generate a for loop and rewrite the transfer op according to the
///    corresponding Strategy<OpTy>. If the to-be-unpacked dimension can be
///    out-of-bounds, generate an if-check and handle both cases separately.
/// 3. Clean up according to the corresponding Strategy<OpTy>.
///
/// Note: If the transfer op is a TransferWriteOp and operates on a tensor
/// source (as opposed to a memref source), then each iteration of the
/// generated scf.for loop yields the new tensor value. E.g.:
/// ```
/// %result = scf.for i = 0 to 5 {
///   %0 = memref.load %buffer[i] : memref<5xvector<4x3xf32>>
///   %1 = vector.transfer_write %0, %source[...]
///       : vector<4x3xf32>, tensor<5x4x3xf32>
///   scf.yield %1 : tensor<5x4x3xf32>
/// }
/// ```
template <typename OpTy>
struct TransferOpConversion : public VectorToSCFPattern<OpTy> { … };

/// Retrieves the dimension sizes of a mask. Currently supports CreateMaskOp
/// and ConstantMaskOp.
template <typename VscaleConstantBuilder>
static FailureOr<SmallVector<OpFoldResult>>
getMaskDimSizes(Value mask, VscaleConstantBuilder &createVscaleMultiple) { … }

/// Scalable vector lowering of transfer_write(transpose). This lowering only
/// supports rank 2 (scalable) vectors, but can be used in conjunction with
/// `UnrollTransferWriteConversion` to support n-D cases. The unroll conversion
/// unrolls until the first scalable dimension.
///
/// Example:
///
/// BEFORE:
/// ```mlir
/// %transpose = vector.transpose %vec, [1, 0]
///     : vector<4x[4]xf32> to vector<[4]x4xf32>
/// vector.transfer_write %transpose, %dest[%i, %j] {in_bounds = [true, true]}
///     : vector<[4]x4xf32>, memref<?x?xf32>
/// ```
///
/// AFTER:
/// ```mlir
/// %c1 = arith.constant 1 : index
/// %c4 = arith.constant 4 : index
/// %c0 = arith.constant 0 : index
/// %0 = vector.extract %arg0[0] : vector<[4]xf32> from vector<4x[4]xf32>
/// %1 = vector.extract %arg0[1] : vector<[4]xf32> from vector<4x[4]xf32>
/// %2 = vector.extract %arg0[2] : vector<[4]xf32> from vector<4x[4]xf32>
/// %3 = vector.extract %arg0[3] : vector<[4]xf32> from vector<4x[4]xf32>
/// %vscale = vector.vscale
/// %c4_vscale = arith.muli %vscale, %c4 : index
/// scf.for %idx = %c0 to %c4_vscale step %c1 {
///   %4 = vector.extract %0[%idx] : f32 from vector<[4]xf32>
///   %5 = vector.extract %1[%idx] : f32 from vector<[4]xf32>
///   %6 = vector.extract %2[%idx] : f32 from vector<[4]xf32>
///   %7 = vector.extract %3[%idx] : f32 from vector<[4]xf32>
///   %slice_i = affine.apply #map(%idx)[%i]
///   %slice = vector.from_elements %4, %5, %6, %7 : vector<4xf32>
///   vector.transfer_write %slice, %arg1[%slice_i, %j] {in_bounds = [true]}
///       : vector<4xf32>, memref<?x?xf32>
/// }
/// ```
struct ScalableTransposeTransferWriteConversion
    : VectorToSCFPattern<vector::TransferWriteOp> { … };

} // namespace lowering_n_d

namespace lowering_n_d_unrolled {

/// If the original transfer op has a mask, compute the mask of the new
/// transfer op (for the current iteration `i`) and assign it.
template <typename OpTy>
static void maybeAssignMask(OpBuilder &b, OpTy xferOp, OpTy newXferOp,
                            int64_t i) { … }
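
// Illustrative sketch only (hypothetical helper, not the elided
// maybeAssignMask above): for an (N>1)-D mask, the mask used by the i-th
// unrolled transfer is simply the i-th subvector of the original mask.
static Value extractMaskSliceSketch(OpBuilder &b, Location loc, Value mask,
                                    int64_t i) {
  // E.g. vector<5x4xi1> --> vector<4xi1>, taken at position `i`.
  return b.create<vector::ExtractOp>(loc, mask, i);
}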
/// Progressive lowering of vector TransferReadOp with unrolling: Unpack one
/// dimension. This is similar to TransferOpConversion<TransferReadOp>, but no
/// memref buffer is allocated and the SCF loop is fully unrolled.
///
/// E.g.:
/// ```
/// %vec = vector.transfer_read %A[%a, %b, %c], %padding
///     : memref<?x?x?xf32>, vector<5x4xf32>
/// ```
/// is rewritten to IR such as (simplified):
/// ```
/// %v_init = splat %padding : vector<5x4xf32>
/// %tmp0 = vector.transfer_read %A[%a, %b, %c], %padding
///     : memref<?x?x?xf32>, vector<4xf32>
/// %v0 = vector.insert %tmp0, %v_init[0] : vector<4xf32> into vector<5x4xf32>
/// %tmp1 = vector.transfer_read %A[%a, %b + 1, %c], %padding
///     : memref<?x?x?xf32>, vector<4xf32>
/// %v1 = vector.insert %tmp1, %v0[1] : vector<4xf32> into vector<5x4xf32>
/// ...
/// %tmp4 = vector.transfer_read %A[%a, %b + 4, %c], %padding
///     : memref<?x?x?xf32>, vector<4xf32>
/// %vec = vector.insert %tmp4, %v3[4] : vector<4xf32> into vector<5x4xf32>
/// ```
///
/// Note: As an optimization, if the result of the original TransferReadOp
/// was directly inserted into another vector, no new %v_init vector is
/// created. Instead, the new TransferReadOp results are inserted into that
/// vector.
struct UnrollTransferReadConversion
    : public VectorToSCFPattern<TransferReadOp> { … };

/// Progressive lowering of vector TransferWriteOp with unrolling: Unpack one
/// dimension. This is similar to TransferOpConversion<TransferWriteOp>, but no
/// memref buffer is allocated and the SCF loop is fully unrolled.
///
/// E.g.:
/// ```
/// vector.transfer_write %vec, %A[%a, %b, %c]
///     : vector<5x4xf32>, memref<?x?x?xf32>
/// ```
/// is rewritten to IR such as (simplified):
/// ```
/// %v0 = vector.extract %vec[0] : vector<4xf32> from vector<5x4xf32>
/// vector.transfer_write %v0, %A[%a, %b, %c] : vector<4xf32>, memref<...>
/// %v1 = vector.extract %vec[1] : vector<4xf32> from vector<5x4xf32>
/// vector.transfer_write %v1, %A[%a, %b + 1, %c] : vector<4xf32>, memref<...>
/// ...
/// %v4 = vector.extract %vec[4] : vector<4xf32> from vector<5x4xf32>
/// vector.transfer_write %v4, %A[%a, %b + 4, %c] : vector<4xf32>, memref<...>
/// ```
///
/// Note: As an optimization, if the vector of the original TransferWriteOp
/// was directly extracted from another vector via an ExtractOp `a`, extract
/// the vectors for the newly generated TransferWriteOps from `a`'s input. By
/// doing so, `a` may become dead, and the number of ExtractOps generated
/// during recursive application of this pattern will be minimal.
struct UnrollTransferWriteConversion
    : public VectorToSCFPattern<TransferWriteOp> { … };

} // namespace lowering_n_d_unrolled

namespace lowering_1_d {

/// Compute the indices into the memref for the LoadOp/StoreOp generated as
/// part of TransferOp1dConversion. Return the memref dimension on which
/// the transfer is operating. A return value of std::nullopt indicates a
/// broadcast.
template <typename OpTy>
static std::optional<int64_t>
get1dMemrefIndices(OpBuilder &b, OpTy xferOp, Value iv,
                   SmallVector<Value, 8> &memrefIndices) { … }

/// Codegen strategy for TransferOp1dConversion, depending on the
/// operation.
template <typename OpTy>
struct Strategy1d;

/// Codegen strategy for TransferReadOp.
template <>
struct Strategy1d<TransferReadOp> { … };

/// Codegen strategy for TransferWriteOp.
template <>
struct Strategy1d<TransferWriteOp> { … };
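
// Illustrative sketch only (hypothetical helper, not part of the elided
// strategies above): the 1-D lowering iterates over all vector elements with
// a plain scf.for loop; for a statically-shaped vector the bounds are
// constants.
static scf::ForOp buildElementLoopSketch(OpBuilder &b, Location loc,
                                         int64_t numElements) {
  Value lb = b.create<arith::ConstantIndexOp>(loc, 0);
  Value ub = b.create<arith::ConstantIndexOp>(loc, numElements);
  Value step = b.create<arith::ConstantIndexOp>(loc, 1);
  // The loop body would load/store one scalar element per iteration.
  return b.create<scf::ForOp>(loc, lb, ub, step);
}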
/// Lower a 1D vector transfer op to SCF using scalar loads/stores. This is
/// necessary in cases where a 1D vector transfer op cannot be lowered into
/// vector load/stores due to non-unit strides or broadcasts:
///
/// * Transfer dimension is not the last memref dimension
/// * Transfer dimension is a broadcast (i.e., scalar load + broadcast)
/// * Memref has a layout map with non-unit stride on the last dimension
///
/// This pattern generates IR as follows:
///
/// 1. Generate a for loop iterating over each vector element.
/// 2. Inside the loop, generate an InsertElementOp or ExtractElementOp,
///    depending on OpTy.
///
/// TODO: In some cases (no masking, etc.), LLVM::MatrixColumnMajorLoadOp
///       can be generated instead of TransferOp1dConversion. Add such a
///       pattern to ConvertVectorToLLVM.
///
/// E.g.:
/// ```
/// vector.transfer_write %vec, %A[%a, %b]
///    {permutation_map = affine_map<(d0, d1) -> (d0)>, in_bounds = [true]}
///    : vector<9xf32>, memref<?x?xf32>
/// ```
/// is rewritten to approximately the following pseudo-IR:
/// ```
/// for i = 0 to 9 {
///   %t = vector.extractelement %vec[i] : vector<9xf32>
///   memref.store %t, %arg0[%a + i, %b] : memref<?x?xf32>
/// }
/// ```
template <typename OpTy>
struct TransferOp1dConversion : public VectorToSCFPattern<OpTy> { … };

} // namespace lowering_1_d
} // namespace

void mlir::populateVectorToSCFConversionPatterns(
    RewritePatternSet &patterns, const VectorTransferToSCFOptions &options) { … }

namespace {

struct ConvertVectorToSCFPass
    : public impl::ConvertVectorToSCFBase<ConvertVectorToSCFPass> { … };

} // namespace

std::unique_ptr<Pass>
mlir::createConvertVectorToSCFPass(const VectorTransferToSCFOptions &options) { … }
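
// Illustrative usage sketch (hypothetical helper, not part of this file): how
// a downstream pass might apply the patterns above directly with the greedy
// rewrite driver instead of running the full conversion pass.
// `applyPatternsAndFoldGreedily` is declared in GreedyPatternRewriteDriver.h
// (newer MLIR spells it `applyPatternsGreedily`).
static LogicalResult applyVectorToSCFSketch(Operation *root) {
  RewritePatternSet patterns(root->getContext());
  // Default options: lower progressively down to 1-D transfers via scf.for.
  populateVectorToSCFConversionPatterns(patterns,
                                        VectorTransferToSCFOptions());
  return applyPatternsAndFoldGreedily(root, std::move(patterns));
}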