//===- Vectorization.cpp - Implementation of linalg Vectorization ---------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the linalg dialect Vectorization transformations.
//
//===----------------------------------------------------------------------===//

#include "mlir/Dialect/Affine/Utils.h"

#include "mlir/Analysis/SliceAnalysis.h"
#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/Dialect/Linalg/IR/Linalg.h"
#include "mlir/Dialect/Linalg/Transforms/Transforms.h"
#include "mlir/Dialect/Linalg/Utils/Utils.h"
#include "mlir/Dialect/Tensor/IR/Tensor.h"
#include "mlir/Dialect/Tensor/Utils/Utils.h"
#include "mlir/Dialect/Utils/IndexingUtils.h"
#include "mlir/Dialect/Utils/StructuredOpsUtils.h"
#include "mlir/Dialect/Vector/IR/VectorOps.h"
#include "mlir/Dialect/Vector/Interfaces/MaskableOpInterface.h"
#include "mlir/Dialect/Vector/Utils/VectorUtils.h"
#include "mlir/IR/AffineExpr.h"
#include "mlir/IR/Builders.h"
#include "mlir/IR/BuiltinTypeInterfaces.h"
#include "mlir/IR/BuiltinTypes.h"
#include "mlir/IR/OpDefinition.h"
#include "mlir/IR/PatternMatch.h"
#include "mlir/Support/LLVM.h"
#include "mlir/Transforms/RegionUtils.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/Sequence.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/TypeSwitch.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include <optional>
#include <type_traits>

using namespace mlir;
using namespace mlir::linalg;

#define DEBUG_TYPE …
#define DBGS() …
#define LDBG(X) …

/// Try to vectorize `convOp` as a convolution.
static FailureOr<Operation *>
vectorizeConvolution(RewriterBase &rewriter, LinalgOp convOp,
                     ArrayRef<int64_t> inputVecSizes = {},
                     ArrayRef<bool> inputVecScalableFlags = {},
                     bool flatten1DDepthwiseConv = false);

/// Return the unique instance of OpType in `block` if it is indeed unique.
/// Return null if none or more than one instance exists.
template <typename OpType>
static OpType getSingleOpOfType(Block &block) { … }

/// Helper function to extract the input slices after the filter is unrolled
/// along kw.
static SmallVector<Value>
extractConvInputSlices(RewriterBase &rewriter, Location loc, Value input,
                       int64_t nSize, int64_t wSize, int64_t cSize,
                       int64_t kwSize, int strideW, int dilationW,
                       int64_t wSizeStep, bool isSingleChanneled) { … }

/// Helper function to extract the filter slices after the filter is unrolled
/// along kw.
static SmallVector<Value> extractConvFilterSlices(RewriterBase &rewriter,
                                                  Location loc, Value filter,
                                                  int64_t kwSize) { … }

/// Helper function to extract the result slices after the filter is unrolled
/// along kw.
static SmallVector<Value>
extractConvResultSlices(RewriterBase &rewriter, Location loc, Value res,
                        int64_t nSize, int64_t wSize, int64_t fSize,
                        int64_t wSizeStep, bool isSingleChanneled) { … }
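// Note (illustrative sketch only, not used by the pass): once the filter is
// unrolled along `kw`, the W-offset of the input slice that feeds output
// position `w` and filter tap `kw` is `w * strideW + kw * dilationW`, matching
// the conv layouts documented with Conv1DGenerator at the end of this file.
// In scalar form, the slice starts visited by the helpers above are:
//
//   for (int64_t kw = 0; kw < kwSize; ++kw)
//     for (int64_t w = 0; w < wSize; w += wSizeStep)
//       int64_t wOffset = w * strideW + kw * dilationW; // start along W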
/// Helper function to insert the computed result slices.
static Value insertConvResultSlices(RewriterBase &rewriter, Location loc,
                                    Value res, int64_t wSize, int64_t wSizeStep,
                                    SmallVectorImpl<Value> &resVals,
                                    bool isSingleChanneled) { … }

/// Contains the vectorization state and related methods used across the
/// vectorization process of a given operation.
struct VectorizationState { … };

LogicalResult
VectorizationState::precomputeIterSpaceValueSizes(RewriterBase &rewriter,
                                                  LinalgOp linalgOp) { … }

/// Initializes the vectorization state, including the computation of the
/// canonical vector shape for vectorization.
// TODO: Move this to the constructor when we can remove the failure cases.
LogicalResult
VectorizationState::initState(RewriterBase &rewriter, LinalgOp linalgOp,
                              ArrayRef<int64_t> inputVectorSizes,
                              ArrayRef<bool> inputScalableVecDims) { … }

/// Create or retrieve an existing mask value to mask `opToMask` in the
/// canonical vector iteration space. If `maybeMaskingMap` is provided, the
/// mask is permuted using that permutation map. If a new mask is created, it
/// will be cached for future users.
Value VectorizationState::getOrCreateMaskFor(
    RewriterBase &rewriter, Operation *opToMask, LinalgOp linalgOp,
    std::optional<AffineMap> maybeMaskingMap) { … }

Operation *
VectorizationState::maskOperation(RewriterBase &rewriter, Operation *opToMask,
                                  LinalgOp linalgOp,
                                  std::optional<AffineMap> maybeIndexingMap) { … }

/// Given an indexing `map` coming from a LinalgOp indexing, restricted to a
/// projectedPermutation, compress the unused dimensions to serve as a
/// permutation_map for a vector transfer operation.
/// For example, given a linalg op such as:
///
/// ```
///   %0 = linalg.generic {
///        indexing_maps = affine_map<(d0, d1, d2, d3, d4) -> (d4, d0, d2)>,
///        indexing_maps = affine_map<(d0, d1, d2, d3, d4) -> (d1, d3)>
///      }
///     ins(%0 : tensor<2x3x4xf32>)
///     outs(%1 : tensor<5x6xf32>)
/// ```
///
/// the iteration domain size of the linalg op is 3x5x4x6x2. The first affine
/// map is reindexed to `affine_map<(d0, d1, d2) -> (d2, d0, d1)>`, the second
/// affine map is reindexed to `affine_map<(d0, d1) -> (d0, d1)>`.
static AffineMap reindexIndexingMap(AffineMap map) { … }

/// Helper enum to represent conv1d input traversal order.
enum class Conv1DOpOrder { … };

/// Helper data structure to represent the result of vectorization.
/// In certain specific cases, like terminators, we do not want to propagate.
enum VectorizationStatus { … };
struct VectorizationResult { … };

std::optional<vector::CombiningKind>
mlir::linalg::getCombinerOpKind(Operation *combinerOp) { … }

/// Check whether `outputOperand` is a reduction with a single combiner
/// operation. Return the combiner operation of the reduction. Return
/// nullptr otherwise. Multiple reduction operations would impose an
/// ordering between reduction dimensions and are currently unsupported in
/// Linalg. This limitation is motivated by the fact that e.g. min(max(X)) !=
/// max(min(X)).
// TODO: use in LinalgOp verification, there is a circular dependency atm.
static Operation *matchLinalgReduction(OpOperand *outputOperand) { … }
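// Illustrative example (not tied to any particular test) of the structure
// `matchLinalgReduction` and `getCombinerOpKind` look for: a reduction output
// whose single combiner is `arith.addf`, i.e. vector::CombiningKind::ADD.
//
//   %red = linalg.generic {
//            indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>,
//                             affine_map<(d0, d1) -> (d0)>],
//            iterator_types = ["parallel", "reduction"]}
//     ins(%in : tensor<4x8xf32>) outs(%acc : tensor<4xf32>) {
//   ^bb0(%a: f32, %b: f32):
//     %s = arith.addf %a, %b : f32
//     linalg.yield %s : f32
//   } -> tensor<4xf32>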
/// Broadcast `value` to a vector of `shape` if possible. Return value
/// otherwise.
static Value broadcastIfNeeded(OpBuilder &b, Value value, Type dstType) { … }

/// Create MultiDimReductionOp to compute the reduction for `reductionOp`. This
/// assumes that `reductionOp` has two operands and one of them is the reduction
/// initial value.
// Note: this is a true builder that notifies the OpBuilder listener.
// TODO: Consider moving as a static helper on the ReduceOp.
static Operation *buildMultiDimReduce(OpBuilder &b, Operation *reduceOp,
                                      Value valueToReduce, Value acc,
                                      ArrayRef<bool> dimsToMask) { … }

static SmallVector<bool> getDimsToReduce(LinalgOp linalgOp) { … }

/// Check if `op` is a linalg.reduce or a linalg.generic that has at least one
/// reduction iterator.
static bool hasReductionIterator(LinalgOp &op) { … }

/// Build a vector.transfer_write of `value` into `outputOperand` at indices set
/// to all `0`; where `outputOperand` is an output operand of the LinalgOp
/// currently being vectorized. If `dest` has null rank, build a memref.store.
/// Return the produced value or null if no value is produced.
// Note: this is a true builder that notifies the OpBuilder listener.
// TODO: Consider moving as a static helper on the ReduceOp.
static Value buildVectorWrite(RewriterBase &rewriter, Value value,
                              OpOperand *outputOperand,
                              VectorizationState &state) { … }

// Custom vectorization precondition function type. This is intended to be used
// with CustomVectorizationHook. Returns success if the corresponding custom
// hook can vectorize the op.
using CustomVectorizationPrecondition = …;

// Custom vectorization function type. Produce a vector form of Operation*
// assuming all its vectorized operands are already in the IRMapping.
// Return nullptr if the Operation cannot be vectorized.
using CustomVectorizationHook = …;

/// Helper function to vectorize the terminator of a `linalgOp`. New result
/// vector values are appended to `newResults`. Return
/// VectorizationStatus::NoReplace to signal the vectorization algorithm that it
/// should not try to map produced operations and instead return the results
/// using the `newResults` vector making them available to the vectorization
/// algorithm for RAUW. This function is meant to be used as a
/// CustomVectorizationHook.
static VectorizationResult
vectorizeLinalgYield(RewriterBase &rewriter, Operation *op,
                     const IRMapping &bvm, VectorizationState &state,
                     LinalgOp linalgOp, SmallVectorImpl<Value> &newResults) { … }

/// Helper function to vectorize the index operations of a `linalgOp`. Return
/// VectorizationStatus::NewOp to signal the vectorization algorithm that it
/// should map the produced operations. This function is meant to be used as a
/// CustomVectorizationHook.
static VectorizationResult vectorizeLinalgIndex(RewriterBase &rewriter,
                                                VectorizationState &state,
                                                Operation *op,
                                                LinalgOp linalgOp) { … }

/// Helper function to check if the tensor.extract can be vectorized by the
/// custom hook vectorizeTensorExtract.
static LogicalResult
tensorExtractVectorizationPrecondition(Operation *op,
                                       bool vectorizeNDExtract) { … }

/// Calculates the offsets (`$index_vec`) for `vector.gather` operations
/// generated from `tensor.extract`. The offset is calculated as follows
/// (example using scalar values):
///
///   offset = extractOp.indices[0]
///   for (i = 1; i < numIndices; i++)
///     offset = extractOp.dimSize[i] * offset + extractOp.indices[i];
///
/// For tensor<45 x 80 x 15 x f32> and index [1, 2, 3], this leads to:
///   offset = ( ( 1 ) * 80 + 2 ) * 15 + 3
static Value calculateGatherOffset(RewriterBase &rewriter,
                                   VectorizationState &state,
                                   tensor::ExtractOp extractOp,
                                   const IRMapping &bvm) { … }
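// Worked example of the recurrence above (sketch only; names hypothetical and
// not part of this file). A scalar rendition of the row-major linearization
// performed by calculateGatherOffset:
//
//   int64_t linearize(ArrayRef<int64_t> dimSizes, ArrayRef<int64_t> indices) {
//     int64_t offset = indices[0];
//     for (size_t i = 1; i < indices.size(); ++i)
//       offset = dimSizes[i] * offset + indices[i];
//     return offset;
//   }
//
//   // linearize({45, 80, 15}, {1, 2, 3}) == ((1 * 80) + 2) * 15 + 3 == 1233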
enum VectorMemoryAccessKind { … };

/// Find the index of the trailing non-unit dim in linalgOp. This hook is used
/// when checking whether a `tensor.extract` Op (within a `linalg.generic` Op)
/// represents a contiguous load operation.
///
/// Note that when calling this hook, it is assumed that the output vector is
/// effectively 1D. Other cases (i.e. reading n-D vectors) should've been
/// labelled as a gather load before entering this method.
///
/// Following on from the above, it is assumed that:
///  * for statically shaped loops, when no masks are used, only one dim is !=
///    1 (that's what the shape of the output vector is based on).
///  * for dynamically shaped loops, there might be more non-unit dims
///    as the output vector type is user-specified.
///
/// TODO: Statically shaped loops + vector masking
static uint64_t getTrailingNonUnitLoopDimIdx(LinalgOp linalgOp) { … }

/// Checks whether `val` can be used for calculating a loop invariant index.
static bool isLoopInvariantIdx(LinalgOp &linalgOp, Value &val,
                               VectorType resType) { … }

/// Check whether `val` could be used for calculating the trailing index for a
/// contiguous load operation.
///
/// There are currently 3 types of values that are allowed here:
///   1. loop-invariant values,
///   2. values that increment by 1 with every loop iteration,
///   3. results of basic arithmetic operations (linear and continuous)
///      involving 1., 2. and 3.
/// This method returns True if indeed only such values are used in calculating
/// `val`.
///
/// Additionally, the trailing index for a contiguous load operation should
/// increment by 1 with every loop iteration, i.e. be based on:
///   * `linalg.index <dim>`,
/// where <dim> is the trailing non-unit dim of the iteration space (this way,
/// `linalg.index <dim>` increments by 1 with every loop iteration).
/// `foundIndexOp` is updated to `true` when such an Op is found.
static bool isContiguousLoadIdx(LinalgOp &linalgOp, Value &val,
                                bool &foundIndexOp, VectorType resType) { … }

/// Infer the memory access pattern for the input ExtractOp.
///
/// Based on the ExtractOp result shape and the access indices, decides whether
/// this Op corresponds to a contiguous load (including a broadcast of a scalar)
/// or a gather load. When analysing the ExtractOp indices (to identify
/// contiguous loads), this method looks for "loop" invariant indices (e.g.
/// block arguments) and indices that change linearly (e.g. via `linalg.index`
/// Op).
///
/// Note that it is always safe to use gather load operations for contiguous
/// loads (albeit slow), but not vice-versa. When in doubt, bail out and assume
/// that `extractOp` is a gather load.
static VectorMemoryAccessKind
getTensorExtractMemoryAccessPattern(tensor::ExtractOp extractOp,
                                    LinalgOp &linalgOp, VectorType resType) { … }
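// Illustrative (non-exhaustive) examples of the classification above, assuming
// %c0 and %indices are defined above the linalg.generic:
//
//   // Contiguous load: the leading index is loop-invariant and the trailing
//   // index is `linalg.index` over the trailing non-unit loop dimension.
//   %j = linalg.index 1 : index
//   %v = tensor.extract %src[%c0, %j] : tensor<8x16xf32>
//
//   // Gather load: the trailing index is data-dependent (loaded from another
//   // tensor), so no contiguity can be inferred and a gather is used instead.
//   %idx = tensor.extract %indices[%j] : tensor<16xindex>
//   %v2 = tensor.extract %src[%c0, %idx] : tensor<8x16xf32>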
/// Helper function to vectorize the tensor.extract operations. Returns
/// VectorizationStatus::NewOp to signal the vectorization algorithm that it
/// should map the produced operations. This function is meant to be used as a
/// CustomVectorizationHook.
static VectorizationResult
vectorizeTensorExtract(RewriterBase &rewriter, VectorizationState &state,
                       Operation *op, LinalgOp linalgOp,
                       const IRMapping &bvm) { … }

/// Emit reduction operations if the shape of the value to reduce is different
/// from the result shape.
// Note: this is a true builder that notifies the OpBuilder listener.
// TODO: Consider moving as a static helper on the ReduceOp.
static Operation *reduceIfNeeded(OpBuilder &b, LinalgOp linalgOp, Operation *op,
                                 Value reduceValue, Value initialValue,
                                 const IRMapping &bvm) { … }

/// Generic vectorization for a single operation `op`, given already vectorized
/// operands carried by `bvm`. Vectorization occurs as follows:
///   1. Try to apply any of the `customVectorizationHooks` and return its
///      result on success.
///   2. Clone any constant in the current scope without vectorization: each
///      consumer of the constant will later determine the shape to which the
///      constant needs to be broadcast.
///   3. Fail on any remaining non `ElementwiseMappable` op. It is the purpose
///      of the `customVectorizationHooks` to cover such cases.
///   4. Clone `op` in vector form to a vector of shape prescribed by the first
///      operand of maximal rank. Other operands have smaller rank and are
///      broadcast accordingly. It is assumed this broadcast is always legal,
///      otherwise, it means one of the `customVectorizationHooks` is incorrect.
///
/// This function assumes all operands of `op` have been vectorized and are in
/// the `bvm` mapping. As a consequence, this function is meant to be called on
/// a topologically-sorted list of ops.
/// This function does not update `bvm` but returns a VectorizationStatus that
/// instructs the caller what `bvm` update needs to occur.
static VectorizationResult
vectorizeOneOp(RewriterBase &rewriter, VectorizationState &state,
               LinalgOp linalgOp, Operation *op, const IRMapping &bvm,
               ArrayRef<CustomVectorizationHook> customVectorizationHooks) { … }

/// Generic vectorization function that rewrites the body of a `linalgOp` into
/// vector form. Generic vectorization proceeds as follows:
///   1. Verify the `linalgOp` has one non-empty region.
///   2. Values defined above the region are mapped to themselves and will be
///      broadcasted on a per-need basis by their consumers.
///   3. Each region argument is vectorized into a vector.transfer_read (or 0-d
///      load).
///   TODO: Reuse opportunities for RAR dependencies.
///   4a. Register CustomVectorizationHook for YieldOp to capture the results.
///   4b. Register CustomVectorizationHook for IndexOp to access the
///       iteration indices.
///   5. Iteratively call vectorizeOneOp on the region operations.
///
/// When `broadcastToMaximalCommonShape` is set to true, eager broadcasting is
/// performed to the maximal common vector size implied by the `linalgOp`
/// iteration space. This eager broadcasting is introduced in the
/// permutation_map of the vector.transfer_read operations. The eager
/// broadcasting makes it trivial to determine where broadcasts, transposes and
/// reductions should occur, without any bookkeeping. The tradeoff is that, in
/// the absence of good canonicalizations, the amount of work increases.
/// This is not deemed a problem as we expect canonicalizations and foldings to
/// aggressively clean up the useless work.
static LogicalResult
vectorizeAsLinalgGeneric(RewriterBase &rewriter, VectorizationState &state,
                         LinalgOp linalgOp,
                         SmallVectorImpl<Value> &newResults) { … }
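// Illustrative end-to-end example (shapes and op chosen for exposition only;
// #id stands for the identity map affine_map<(d0, d1) -> (d0, d1)>): an
// elementwise linalg.generic such as
//
//   %0 = linalg.generic {indexing_maps = [#id, #id, #id],
//                        iterator_types = ["parallel", "parallel"]}
//     ins(%a, %b : tensor<4x8xf32>, tensor<4x8xf32>)
//     outs(%init : tensor<4x8xf32>) {
//   ^bb0(%x: f32, %y: f32, %out: f32):
//     %s = arith.addf %x, %y : f32
//     linalg.yield %s : f32
//   } -> tensor<4x8xf32>
//
// is rewritten by vectorizeAsLinalgGeneric into roughly:
//
//   %va = vector.transfer_read %a[%c0, %c0], %cst
//           : tensor<4x8xf32>, vector<4x8xf32>
//   %vb = vector.transfer_read %b[%c0, %c0], %cst
//           : tensor<4x8xf32>, vector<4x8xf32>
//   %vr = arith.addf %va, %vb : vector<4x8xf32>
//   %1  = vector.transfer_write %vr, %init[%c0, %c0]
//           : vector<4x8xf32>, tensor<4x8xf32>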
/// Given a tensor::PackOp, return the `dest` shape before any packing
/// permutations.
static SmallVector<int64_t> getTiledPackShape(tensor::PackOp packOp,
                                              ArrayRef<int64_t> destShape) { … }

/// Given an input, the mixed destSizes, and the vector sizes for vectorization,
/// create an empty destination tensor and create a TransferWriteOp from the
/// input to the empty tensor. If the destination shape is not the same as the
/// inputVectorSizes for the first rank(inputVectorSizes) dims, then create a
/// mask for the write. If `useInBoundsInsteadOfMasking` is set, then update the
/// inBounds attribute of the transfer write op instead of masking.
static Operation *createWriteOrMaskedWrite(OpBuilder &builder, Location loc,
                                           Value input,
                                           SmallVector<OpFoldResult> destSizes,
                                           ArrayRef<int64_t> inputVectorSizes,
                                           bool useInBoundsInsteadOfMasking) { … }

/// Vectorize tensor::PackOp with (1) static innerTiles, (2) constant
/// padding value and (3) input vector sizes into:
///   masked_transfer_read->shape_cast->transpose->transfer_write_in_bounds
/// As in the following example:
///   %pack = tensor.pack %src inner_dims_pos = [2, 1] inner_tiles = [16, 2]
///       into %dst : tensor<32x7x16xf32> -> tensor<32x4x1x16x2xf32>
///
/// This pack would be vectorized to:
///
///   %load = vector.mask %mask {
///       vector.transfer_read %arg0[%c0, %c0, %c0], %cst
///           {in_bounds = [true, true, true]} :
///           tensor<32x7x16xf32>, vector<32x8x16xf32>
///   } : vector<32x8x16xi1> -> vector<32x8x16xf32>
///   %shape_cast = vector.shape_cast %load : vector<32x8x16xf32>
///                                           to vector<32x4x2x1x16xf32>
///   %transpose = vector.transpose %shape_cast, [0, 1, 3, 4, 2]
///       : vector<32x4x2x1x16xf32> to vector<32x4x1x16x2xf32>
///   %write = vector.transfer_write %transpose,
///       %empty[%c0_0, %c0_0, %c0_0, %c0_0, %c0_0]
///       {in_bounds = [true, true, true, true, true]}
///       : vector<32x4x1x16x2xf32>, tensor<32x4x1x16x2xf32>
///
/// If the (3) input vector sizes are not provided, the vector sizes are
/// determined by the result tensor shape. Also, we update the inBounds
/// attribute instead of masking.
static LogicalResult
vectorizeAsTensorPackOp(RewriterBase &rewriter, tensor::PackOp packOp,
                        ArrayRef<int64_t> inputVectorSizes,
                        SmallVectorImpl<Value> &newResults) { … }

/// Vectorize a `tensor::UnPackOp` to these 4 Ops:
///   vector::TransferReadOp - Reads a vector from the source tensor
///   vector::TransposeOp - Transposes the read vector
///   vector::ShapeCastOp - Reshapes the data based on the target shape
///   vector::TransferWriteOp - Writes the result vector back to the
///                             destination tensor
/// If the vector sizes are not provided:
///   * the vector sizes are determined by the input operand and attributes,
///   * update the inBounds attribute instead of masking.
static LogicalResult
vectorizeAsTensorUnpackOp(RewriterBase &rewriter, tensor::UnPackOp unpackOp,
                          ArrayRef<int64_t> inputVectorSizes,
                          SmallVectorImpl<Value> &newResults) { … }

/// Vectorize a `padOp` with (1) static result type, (2) constant padding value
/// and (3) all-zero lowPad to
/// `transfer_write_in_bounds(transfer_read_masked(pad_source, pad_value))`.
static LogicalResult
vectorizeAsTensorPadOp(RewriterBase &rewriter, tensor::PadOp padOp,
                       ArrayRef<int64_t> inputVectorSizes,
                       SmallVectorImpl<Value> &newResults) { … }
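// Illustrative example (shapes chosen for exposition only) of the rewrite
// performed by vectorizeAsTensorPadOp under the documented preconditions:
//
//   %pad = tensor.pad %src low[0, 0] high[%h0, %h1] {
//     ^bb0(%i: index, %j: index):
//       tensor.yield %cst : f32
//   } : tensor<?x?xf32> to tensor<4x8xf32>
//
// roughly becomes:
//
//   %read = vector.mask %m {
//     vector.transfer_read %src[%c0, %c0], %cst
//       : tensor<?x?xf32>, vector<4x8xf32>
//   } : vector<4x8xi1> -> vector<4x8xf32>
//   %init = tensor.empty() : tensor<4x8xf32>
//   %res  = vector.transfer_write %read, %init[%c0, %c0]
//           {in_bounds = [true, true]} : vector<4x8xf32>, tensor<4x8xf32>
//
// The masked lanes of the read take the padding value %cst, and the write is
// in-bounds because the result type is static.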
// TODO: probably need some extra checks for reduction followed by consumer
// ops that may not commute (e.g. linear reduction + non-linear instructions).
static LogicalResult reductionPreconditions(LinalgOp op) { … }

static LogicalResult
vectorizeDynamicConvOpPrecondition(linalg::LinalgOp conv,
                                   bool flatten1DDepthwiseConv) { … }

static LogicalResult
vectorizeDynamicLinalgOpPrecondition(linalg::LinalgOp op,
                                     bool flatten1DDepthwiseConv) { … }

/// Need to check if the inner-tiles are static/constant.
static LogicalResult
vectorizeUnPackOpPrecondition(tensor::UnPackOp unpackOp,
                              ArrayRef<int64_t> inputVectorSizes) { … }

static LogicalResult vectorizeLinalgOpPrecondition(
    LinalgOp linalgOp, ArrayRef<int64_t> inputVectorSizes,
    bool vectorizeNDExtract, bool flatten1DDepthwiseConv) { … }

static LogicalResult
vectorizePackOpPrecondition(tensor::PackOp packOp,
                            ArrayRef<int64_t> inputVectorSizes) { … }

static LogicalResult
vectorizePadOpPrecondition(tensor::PadOp padOp,
                           ArrayRef<int64_t> inputVectorSizes) { … }

/// Preconditions for scalable vectors. This is quite restrictive - it models
/// the fact that in practice we would only make selected dimensions scalable.
static LogicalResult
vectorizeScalableVectorPrecondition(Operation *op,
                                    ArrayRef<int64_t> inputVectorSizes,
                                    ArrayRef<bool> inputScalableVecDims) { … }

LogicalResult mlir::linalg::vectorizeOpPrecondition(
    Operation *op, ArrayRef<int64_t> inputVectorSizes,
    ArrayRef<bool> inputScalableVecDims, bool vectorizeNDExtract,
    bool flatten1DDepthwiseConv) { … }

/// Converts affine.apply Ops to arithmetic operations.
static void convertAffineApply(RewriterBase &rewriter, LinalgOp linalgOp) { … }

bool mlir::linalg::hasVectorizationImpl(Operation *op) { … }

/// Emit a suitable vector form for an operation. If provided,
/// `inputVectorSizes` are used to vectorize this operation.
/// `inputVectorSizes` must match the rank of the iteration space of the
/// operation and the input vector sizes must be greater than or equal to
/// their counterpart iteration space sizes, if static. `inputVectorSizes`
/// also allows the vectorization of operations with dynamic shapes.
LogicalResult mlir::linalg::vectorize(RewriterBase &rewriter, Operation *op,
                                      ArrayRef<int64_t> inputVectorSizes,
                                      ArrayRef<bool> inputScalableVecDims,
                                      bool vectorizeNDExtract,
                                      bool flatten1DDepthwiseConv) { … }
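// Illustrative usage sketch (hypothetical caller code, not part of this file):
// vectorizing a linalg op with explicit vector sizes and a scalable trailing
// dimension, using the entry point declared above.
//
//   IRRewriter rewriter(op->getContext());
//   SmallVector<int64_t> vectorSizes = {8, 16, 4};      // one per loop dim
//   SmallVector<bool> scalableDims = {false, false, true};
//   if (failed(linalg::vectorize(rewriter, op, vectorSizes, scalableDims,
//                                /*vectorizeNDExtract=*/false,
//                                /*flatten1DDepthwiseConv=*/false)))
//     return failure();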
LogicalResult mlir::linalg::vectorizeCopy(RewriterBase &rewriter,
                                          memref::CopyOp copyOp) { … }

//----------------------------------------------------------------------------//
// Misc. vectorization patterns.
//----------------------------------------------------------------------------//

/// Helper function that retrieves the value of an IntegerAttr.
static int64_t getIntFromAttr(Attribute attr) { … }

/// Given an ArrayRef of OpFoldResults, return a vector of Values.
/// IntegerAttrs are converted to ConstantIndexOps. Other attribute types are
/// not supported.
static SmallVector<Value> ofrToIndexValues(RewriterBase &rewriter, Location loc,
                                           ArrayRef<OpFoldResult> ofrs) { … }

/// Rewrite a tensor::PadOp into a sequence of EmptyOp, FillOp and
/// InsertSliceOp. For now, only constant padding values are supported.
/// If there is enough static type information, TransferReadOps and
/// TransferWriteOps may be generated instead of InsertSliceOps.
struct GenericPadOpVectorizationPattern : public GeneralizePadOpPattern { … };

/// Base pattern for rewriting tensor::PadOps whose result is consumed by a
/// given operation type OpTy.
template <typename OpTy>
struct VectorizePadOpUserPattern : public OpRewritePattern<tensor::PadOp> { … };

/// Rewrite use of tensor::PadOp result in TransferReadOp. E.g.:
/// ```
/// %0 = tensor.pad %src ... : tensor<?x?xf32> to tensor<17x5xf32>
/// %r = vector.transfer_read %0[%c0, %c0], %cst
///     {in_bounds = [true, true]} : tensor<17x5xf32>, vector<17x5xf32>
/// ```
/// is rewritten to:
/// ```
/// %r = vector.transfer_read %src[%c0, %c0], %padding
///     {in_bounds = [true, true]}
///     : tensor<?x?xf32>, vector<17x5xf32>
/// ```
/// Note: By restricting this pattern to in-bounds TransferReadOps, we can be
/// sure that the original padding value %cst was never used.
///
/// This rewrite is possible if:
/// - `xferOp` has no out-of-bounds dims or mask.
/// - Low padding is static 0.
/// - Single, scalar padding value.
struct PadOpVectorizationWithTransferReadPattern
    : public VectorizePadOpUserPattern<vector::TransferReadOp> { … };

/// Rewrite use of tensor::PadOp result in TransferWriteOp.
/// This pattern rewrites TransferWriteOps that write to a padded tensor
/// value, where the same amount of padding is immediately removed again after
/// the write. In such cases, the TransferWriteOp can write to the non-padded
/// tensor value and apply out-of-bounds masking. E.g.:
/// ```
/// %0 = tensor.extract_slice ...[...] [%s0, %s1] [1, 1]
///     : tensor<...> to tensor<?x?xf32>
/// %1 = tensor.pad %0 ... : tensor<?x?xf32> to tensor<17x5xf32>
/// %2 = vector.transfer_write %vec, %1[...]
///     : vector<17x5xf32>, tensor<17x5xf32>
/// %r = tensor.extract_slice %2[0, 0] [%s0, %s1] [1, 1]
///     : tensor<17x5xf32> to tensor<?x?xf32>
/// ```
/// is rewritten to:
/// ```
/// %0 = tensor.extract_slice ...[...] [%s0, %s1] [1, 1]
///     : tensor<...> to tensor<?x?xf32>
/// %r = vector.transfer_write %vec, %0[...] : vector<17x5xf32>,
///     tensor<?x?xf32>
/// ```
/// Note: It is important that the ExtractSliceOp %r resizes the result of the
/// TransferWriteOp to the same size as the input of the TensorPadOp (or an
/// even smaller size). Otherwise, %r's new (dynamic) dimensions would differ
/// from %r's old dimensions.
///
/// This rewrite is possible if:
/// - Low padding is static 0.
/// - `xferOp` has exactly one use, which is an ExtractSliceOp. This
///   ExtractSliceOp trims the same amount of padding that was added
///   beforehand.
/// - Single, scalar padding value.
struct PadOpVectorizationWithTransferWritePattern
    : public VectorizePadOpUserPattern<vector::TransferWriteOp> { … };

/// Rewrite use of tensor::PadOp result in InsertSliceOp. E.g.:
/// ```
/// %0 = tensor.pad %src ... : tensor<?x?xf32> to tensor<17x5xf32>
/// %r = tensor.insert_slice %0
///     into %dest[%a, %b, 0, 0] [1, 1, 17, 5] [1, 1, 1, 1]
///     : tensor<17x5xf32> into tensor<?x?x17x5xf32>
/// ```
/// is rewritten to:
/// ```
/// %0 = vector.transfer_read %src[%c0, %c0], %padding
///     : tensor<?x?xf32>, vector<17x5xf32>
/// %r = vector.transfer_write %0, %dest[%a, %b, %c0, %c0]
///     {in_bounds = [true, true]} : vector<17x5xf32>, tensor<?x?x17x5xf32>
/// ```
///
/// This rewrite is possible if:
/// - Low padding is static 0.
/// - `padOp` result shape is static.
/// - The entire padded tensor is inserted.
///   (Implies that sizes of `insertOp` are all static.)
/// - Only unit strides in `insertOp`.
/// - Single, scalar padding value.
/// - `padOp` result not used as destination.
struct PadOpVectorizationWithInsertSlicePattern
    : public VectorizePadOpUserPattern<tensor::InsertSliceOp> { … };

void mlir::linalg::populatePadOpVectorizationPatterns(
    RewritePatternSet &patterns, PatternBenefit baseBenefit) { … }

//----------------------------------------------------------------------------//
// Forwarding patterns
//----------------------------------------------------------------------------//

/// Check whether there is any interleaved use of any `values` between
/// `firstOp` and `secondOp`. Conservatively return `true` if any op or value
/// is in a different block.
static bool mayExistInterleavedUses(Operation *firstOp, Operation *secondOp,
                                    ValueRange values) { … }

/// Return the unique subview use of `v` if it is indeed unique, null
/// otherwise.
static memref::SubViewOp getSubViewUseIfUnique(Value v) { … }

/// TODO: use interfaces, side-effects and aliasing analysis as appropriate,
/// when available.
LogicalResult LinalgCopyVTRForwardingPattern::matchAndRewrite(
    vector::TransferReadOp xferOp, PatternRewriter &rewriter) const { … }

/// TODO: use interfaces, side-effects and aliasing analysis as appropriate,
/// when available.
LogicalResult LinalgCopyVTWForwardingPattern::matchAndRewrite(
    vector::TransferWriteOp xferOp, PatternRewriter &rewriter) const { … }

//===----------------------------------------------------------------------===//
// Convolution vectorization patterns
//===----------------------------------------------------------------------===//

template <int N>
static void bindShapeDims(ShapedType shapedType) { … }

template <int N, typename IntTy, typename... IntTy2>
static void bindShapeDims(ShapedType shapedType, IntTy &val, IntTy2 &...vals) { … }

/// Bind a pack of int& to the leading dimensions of shapedType.getShape().
template <typename... IntTy>
static void bindShapeDims(ShapedType shapedType, IntTy &...vals) { … }

namespace {
bool isCastOfBlockArgument(Operation *op) { … }

bool isSupportedPoolKind(vector::CombiningKind kind) { … }

/// Generate a vector implementation for either:
/// ```
///   Op def: ( w, kw )
///   Iters: ({Par(), Red()})
///   Layout: {{w + kw}, {kw}, {w}}
/// ```
/// kw is unrolled.
///
/// or
///
/// ```
///   Op def: ( n, w, c, kw, f )
///   Iters: ({Par(), Par(), Par(), Red(), Red()})
///   Layout: {{n, strideW * w + dilationW * kw, c}, {kw, c, f}, {n, w, f}}
/// ```
/// kw is unrolled, w is unrolled iff dilationW > 1.
///
/// or
///
/// ```
///   Op def: ( n, c, w, f, kw )
///   Iters: ({Par(), Par(), Par(), Red(), Red()})
///   Layout: {{n, c, strideW * w + dilationW * kw}, {f, c, kw}, {n, f, w}}
/// ```
/// kw is unrolled, w is unrolled iff dilationW > 1.
///
/// or
///
/// ```
///   Op def: ( n, w, c, kw )
///   Iters: ({Par(), Par(), Par(), Red()})
///   Layout: {{n, strideW * w + dilationW * kw, c}, {kw, c}, {n, w, c}}
/// ```
/// kw is unrolled, w is unrolled iff dilationW > 1.
struct Conv1DGenerator
    : public StructuredGenerator<LinalgOp, utils::IteratorType> { … };
} // namespace

/// Helper function to vectorize a LinalgOp with convolution semantics.
// TODO: extend the generic vectorization to support windows and drop this.
static FailureOr<Operation *> vectorizeConvolution(
    RewriterBase &rewriter, LinalgOp op, ArrayRef<int64_t> inputVecSizes,
    ArrayRef<bool> inputScalableVecDims, bool flatten1DDepthwiseConv) { … }

struct VectorizeConvolution : public OpInterfaceRewritePattern<LinalgOp> { … };

void mlir::linalg::populateConvolutionVectorizationPatterns(
    RewritePatternSet &patterns, PatternBenefit benefit) { … }
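// Illustrative usage sketch (hypothetical caller code, not part of this file):
// registering the pattern sets populated above inside a pass.
//
//   RewritePatternSet patterns(ctx);
//   linalg::populatePadOpVectorizationPatterns(patterns, /*baseBenefit=*/1);
//   linalg::populateConvolutionVectorizationPatterns(patterns, /*benefit=*/1);
//   if (failed(applyPatternsAndFoldGreedily(funcOp, std::move(patterns))))
//     return signalPassFailure();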