//===- LowerVectorTranspose.cpp - Lower 'vector.transpose' operation ------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // // This file implements target-independent rewrites and utilities to lower the // 'vector.transpose' operation. // //===----------------------------------------------------------------------===// #include "mlir/Dialect/Affine/IR/AffineOps.h" #include "mlir/Dialect/Arith/IR/Arith.h" #include "mlir/Dialect/Arith/Utils/Utils.h" #include "mlir/Dialect/Linalg/IR/Linalg.h" #include "mlir/Dialect/MemRef/IR/MemRef.h" #include "mlir/Dialect/SCF/IR/SCF.h" #include "mlir/Dialect/Tensor/IR/Tensor.h" #include "mlir/Dialect/Utils/IndexingUtils.h" #include "mlir/Dialect/Utils/StructuredOpsUtils.h" #include "mlir/Dialect/Vector/IR/VectorOps.h" #include "mlir/Dialect/Vector/Transforms/LoweringPatterns.h" #include "mlir/Dialect/Vector/Utils/VectorUtils.h" #include "mlir/IR/BuiltinAttributeInterfaces.h" #include "mlir/IR/BuiltinTypes.h" #include "mlir/IR/ImplicitLocOpBuilder.h" #include "mlir/IR/Location.h" #include "mlir/IR/Matchers.h" #include "mlir/IR/PatternMatch.h" #include "mlir/IR/TypeUtilities.h" #include "mlir/Interfaces/VectorInterfaces.h" #define DEBUG_TYPE … usingnamespacemlir; usingnamespacemlir::vector; /// Given a 'transpose' pattern, prune the rightmost dimensions that are not /// transposed. The function returns nothing; its output is delivered through /// the `result` out-parameter. static void pruneNonTransposedDims(ArrayRef<int64_t> transpose, SmallVectorImpl<int64_t> &result) { … } /// Returns true if the lowering option is a vector-shuffle-based approach. static bool isShuffleLike(VectorTransposeLowering lowering) { … } /// Returns a shuffle mask that builds on `vals`. `vals` is the offset base of /// shuffle ops, i.e., the unpack pattern. 
The method iterates with `vals` to /// create the mask for a `numBits`-bit vector. `numBits` has to be a /// multiple of 128. For example, if `vals` is {0, 1, 16, 17} and `numBits` is /// 512, there should be 16 elements in the final result. It constructs the /// mask below to get the unpacked elements. /// [0, 1, 16, 17, /// 0+4, 1+4, 16+4, 17+4, /// 0+8, 1+8, 16+8, 17+8, /// 0+12, 1+12, 16+12, 17+12] static SmallVector<int64_t> getUnpackShufflePermFor128Lane(ArrayRef<int64_t> vals, int numBits) { … } /// Lower to vector.shuffle on v1 and v2 with UnpackLoPd shuffle mask. For /// example, if it is targeting a 512-bit vector, returns /// vector.shuffle on v1, v2, [0, 1, 16, 17, /// 0+4, 1+4, 16+4, 17+4, /// 0+8, 1+8, 16+8, 17+8, /// 0+12, 1+12, 16+12, 17+12]. static Value createUnpackLoPd(ImplicitLocOpBuilder &b, Value v1, Value v2, int numBits) { … } /// Lower to vector.shuffle on v1 and v2 with UnpackHiPd shuffle mask. For /// example, if it is targeting a 512-bit vector, returns /// vector.shuffle on v1, v2, [2, 3, 18, 19, /// 2+4, 3+4, 18+4, 19+4, /// 2+8, 3+8, 18+8, 19+8, /// 2+12, 3+12, 18+12, 19+12]. static Value createUnpackHiPd(ImplicitLocOpBuilder &b, Value v1, Value v2, int numBits) { … } /// Lower to vector.shuffle on v1 and v2 with UnpackLoPs shuffle mask. For /// example, if it is targeting a 512-bit vector, returns /// vector.shuffle on v1, v2, [0, 16, 1, 17, /// 0+4, 16+4, 1+4, 17+4, /// 0+8, 16+8, 1+8, 17+8, /// 0+12, 16+12, 1+12, 17+12]. static Value createUnpackLoPs(ImplicitLocOpBuilder &b, Value v1, Value v2, int numBits) { … } /// Lower to vector.shuffle on v1 and v2 with UnpackHiPs shuffle mask. For /// example, if it is targeting a 512-bit vector, returns /// vector.shuffle on v1, v2, [2, 18, 3, 19, /// 2+4, 18+4, 3+4, 19+4, /// 2+8, 18+8, 3+8, 19+8, /// 2+12, 18+12, 3+12, 19+12]. 
static Value createUnpackHiPs(ImplicitLocOpBuilder &b, Value v1, Value v2, int numBits) { … } /// Returns a vector.shuffle that shuffles 128-bit lanes (composed of 4 32-bit /// elements) selected by `mask` from `v1` and `v2`. I.e., /// /// DEFINE SELECT4(src, control) { /// CASE(control[1:0]) OF /// 0: tmp[127:0] := src[127:0] /// 1: tmp[127:0] := src[255:128] /// 2: tmp[127:0] := src[383:256] /// 3: tmp[127:0] := src[511:384] /// ESAC /// RETURN tmp[127:0] /// } /// dst[127:0] := SELECT4(v1[511:0], mask[1:0]) /// dst[255:128] := SELECT4(v1[511:0], mask[3:2]) /// dst[383:256] := SELECT4(v2[511:0], mask[5:4]) /// dst[511:384] := SELECT4(v2[511:0], mask[7:6]) static Value create4x128BitSuffle(ImplicitLocOpBuilder &b, Value v1, Value v2, uint8_t mask) { … } /// Lowers the value to a vector.shuffle op. The `source` is expected to be a /// 1-D vector and have `m`x`n` elements. static Value transposeToShuffle1D(OpBuilder &b, Value source, int m, int n) { … } /// Lowers the value to a sequence of vector.shuffle ops. The `source` is /// expected to be a 16x16 vector. static Value transposeToShuffle16x16(OpBuilder &builder, Value source, int m, int n) { … } namespace { /// Progressive lowering of TransposeOp. /// One: /// %x = vector.transpose %y, [1, 0] /// is replaced by: /// %z = arith.constant dense<0.000000e+00> /// %0 = vector.extract %y[0, 0] /// %1 = vector.insert %0, %z [0, 0] /// .. /// %x = vector.insert .., .. [.., ..] class TransposeOpLowering : public OpRewritePattern<vector::TransposeOp> { … }; /// Rewrites vector.transpose as vector.shape_cast. This pattern is only applied /// to 2D vectors with at least one unit dim. For example: /// /// Replace: /// vector.transpose %0, [1, 0] : vector<4x1xi32> to /// vector<1x4xi32> /// with: /// vector.shape_cast %0 : vector<4x1xi32> to vector<1x4xi32> /// /// A source with a leading unit dim (the inverse case) is also replaced. The /// unit dim must be fixed. The non-unit dim can be scalable. 
/// /// TODO: This pattern was introduced specifically to help lower scalable /// vectors. In hindsight, a more specialised canonicalization (for shape_casts /// to cancel out) would be preferable: /// /// BEFORE: /// %0 = some_op /// %1 = vector.shape_cast %0 : vector<[4]xf32> to vector<[4]x1xf32> /// %2 = vector.transpose %1, [1, 0] : vector<[4]x1xf32> to vector<1x[4]xf32> /// AFTER: /// %0 = some_op /// %1 = vector.shape_cast %0 : vector<[4]xf32> to vector<1x[4]xf32> /// /// Given the context above, we may want to consider (re-)moving this pattern /// at some later time. I am leaving it for now in case there are other users /// that I am not aware of. class Transpose2DWithUnitDimToShapeCast : public OpRewritePattern<vector::TransposeOp> { … }; /// Rewrite a 2-D vector.transpose as a sequence of shuffle ops. /// If the strategy is Shuffle1D, it will be lowered to: /// vector.shape_cast 2D -> 1D /// vector.shuffle /// vector.shape_cast 1D -> 2D /// If the strategy is Shuffle16x16, it will be lowered to a sequence of shuffle /// ops on 16xf32 vectors. class TransposeOp2DToShuffleLowering : public OpRewritePattern<vector::TransposeOp> { … }; } // namespace void mlir::vector::populateVectorTransposeLoweringPatterns( RewritePatternSet &patterns, VectorTransformsOptions options, PatternBenefit benefit) { … }