LowerVectorTranspose.cpp | Explore in Territory

//===- LowerVectorTranspose.cpp - Lower 'vector.transpose' operation ------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements target-independent rewrites and utilities to lower the
// 'vector.transpose' operation.
//
//===----------------------------------------------------------------------===//

#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/Arith/Utils/Utils.h"
#include "mlir/Dialect/Linalg/IR/Linalg.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Dialect/SCF/IR/SCF.h"
#include "mlir/Dialect/Tensor/IR/Tensor.h"
#include "mlir/Dialect/Utils/IndexingUtils.h"
#include "mlir/Dialect/Utils/StructuredOpsUtils.h"
#include "mlir/Dialect/Vector/IR/VectorOps.h"
#include "mlir/Dialect/Vector/Transforms/LoweringPatterns.h"
#include "mlir/Dialect/Vector/Utils/VectorUtils.h"
#include "mlir/IR/BuiltinAttributeInterfaces.h"
#include "mlir/IR/BuiltinTypes.h"
#include "mlir/IR/ImplicitLocOpBuilder.h"
#include "mlir/IR/Location.h"
#include "mlir/IR/Matchers.h"
#include "mlir/IR/PatternMatch.h"
#include "mlir/IR/TypeUtilities.h"
#include "mlir/Interfaces/VectorInterfaces.h"

#define DEBUG_TYPE …

usingnamespacemlir;
usingnamespacemlir::vector;

/// Given a 'transpose' pattern, prunes the rightmost (trailing) dimensions
/// that are not transposed.
///
/// `transpose` is the pattern to inspect; `result` is the output parameter.
/// NOTE(review): presumably `result` receives the pruned pattern; whether it is
/// cleared first or appended to is decided in the body — confirm there.
static void pruneNonTransposedDims(ArrayRef<int64_t> transpose,
                                   SmallVectorImpl<int64_t> &result) { … }

/// Returns true if `lowering` selects a vector.shuffle-based lowering approach.
/// NOTE(review): presumably these are the Shuffle1D and Shuffle16x16 strategies
/// mentioned elsewhere in this file — confirm against the body.
static bool isShuffleLike(VectorTransposeLowering lowering) { … }

/// Returns a shuffle mask that builds on `vals`. `vals` is the offset base of
/// the shuffle ops, i.e., the unpack pattern. The method iterates over `vals`
/// to create the mask for a `numBits`-bit vector. `numBits` must be a multiple
/// of 128. For example, if `vals` is {0, 1, 16, 17} and `numBits` is 512, the
/// final result has 16 elements: one copy of `vals` per 128 bits, each copy
/// offset by a further 4. This yields the mask below, which selects the unpack
/// elements:
///   [0,    1,    16,    17,
///    0+4,  1+4,  16+4,  17+4,
///    0+8,  1+8,  16+8,  17+8,
///    0+12, 1+12, 16+12, 17+12]
static SmallVector<int64_t>
getUnpackShufflePermFor128Lane(ArrayRef<int64_t> vals, int numBits) { … }

/// Lowers to a vector.shuffle on `v1` and `v2` with the UnpackLoPd shuffle
/// mask. `numBits` is the bit width of the vector being targeted. For example,
/// when targeting a 512-bit vector, returns
///   vector.shuffle on v1, v2, [0,    1,    16,    17,
///                              0+4,  1+4,  16+4,  17+4,
///                              0+8,  1+8,  16+8,  17+8,
///                              0+12, 1+12, 16+12, 17+12].
static Value createUnpackLoPd(ImplicitLocOpBuilder &b, Value v1, Value v2,
                              int numBits) { … }

/// Lowers to a vector.shuffle on `v1` and `v2` with the UnpackHiPd shuffle
/// mask. `numBits` is the bit width of the vector being targeted. For example,
/// when targeting a 512-bit vector, returns
///   vector.shuffle on v1, v2, [2,    3,    18,    19,
///                              2+4,  3+4,  18+4,  19+4,
///                              2+8,  3+8,  18+8,  19+8,
///                              2+12, 3+12, 18+12, 19+12].
static Value createUnpackHiPd(ImplicitLocOpBuilder &b, Value v1, Value v2,
                              int numBits) { … }

/// Lowers to a vector.shuffle on `v1` and `v2` with the UnpackLoPs shuffle
/// mask. `numBits` is the bit width of the vector being targeted. For example,
/// when targeting a 512-bit vector, returns
///   vector.shuffle on v1, v2, [0,    16,    1,    17,
///                              0+4,  16+4,  1+4,  17+4,
///                              0+8,  16+8,  1+8,  17+8,
///                              0+12, 16+12, 1+12, 17+12].
static Value createUnpackLoPs(ImplicitLocOpBuilder &b, Value v1, Value v2,
                              int numBits) { … }

/// Lowers to a vector.shuffle on `v1` and `v2` with the UnpackHiPs shuffle
/// mask. `numBits` is the bit width of the vector being targeted. For example,
/// when targeting a 512-bit vector, returns
///   vector.shuffle on v1, v2, [2,    18,    3,    19,
///                              2+4,  18+4,  3+4,  19+4,
///                              2+8,  18+8,  3+8,  19+8,
///                              2+12, 18+12, 3+12, 19+12].
static Value createUnpackHiPs(ImplicitLocOpBuilder &b, Value v1, Value v2,
                              int numBits) { … }

/// Returns a vector.shuffle that shuffles 128-bit lanes (each composed of 4
/// 32-bit elements) selected by `mask` from `v1` and `v2`. `mask` packs four
/// 2-bit lane selectors: bits [1:0] and [3:2] pick the lanes taken from `v1`,
/// bits [5:4] and [7:6] pick the lanes taken from `v2`. I.e.,
///
/// DEFINE SELECT4(src, control) {
///   CASE(control[1:0]) OF
///   0:  tmp[127:0] := src[127:0]
///   1:  tmp[127:0] := src[255:128]
///   2:  tmp[127:0] := src[383:256]
///   3:  tmp[127:0] := src[511:384]
///   ESAC
///   RETURN tmp[127:0]
/// }
/// dst[127:0]   := SELECT4(v1[511:0], mask[1:0])
/// dst[255:128] := SELECT4(v1[511:0], mask[3:2])
/// dst[383:256] := SELECT4(v2[511:0], mask[5:4])
/// dst[511:384] := SELECT4(v2[511:0], mask[7:6])
///
/// NOTE(review): "Suffle" in the function name looks like a typo for
/// "Shuffle"; it is left unchanged here because renaming would also require
/// updating every caller.
static Value create4x128BitSuffle(ImplicitLocOpBuilder &b, Value v1, Value v2,
                                  uint8_t mask) { … }

/// Lowers `source` to a vector.shuffle op. `source` is expected to be a 1-D
/// vector holding `m` x `n` elements.
/// NOTE(review): presumably `source` is the flattened form of the 2-D `m` x `n`
/// vector being transposed — confirm against the caller.
static Value transposeToShuffle1D(OpBuilder &b, Value source, int m, int n) { … }

/// Lowers `source` to a sequence of vector.shuffle ops. `source` is expected to
/// be a 16x16 vector.
/// NOTE(review): `m` and `n` are presumably both required to be 16 — confirm
/// whether the body checks this.
static Value transposeToShuffle16x16(OpBuilder &builder, Value source, int m,
                                     int n) { … }

namespace {
/// Progressive lowering of TransposeOp.
/// For example:
///   %x = vector.transpose %y, [1, 0]
/// is replaced by:
///   %z = arith.constant dense<0.000000e+00>
///   %0 = vector.extract %y[0, 0]
///   %1 = vector.insert %0, %z [0, 0]
///   ..
///   %x = vector.insert .., .. [.., ..]
class TransposeOpLowering : public OpRewritePattern<vector::TransposeOp> { … };

/// Rewrites vector.transpose as vector.shape_cast. This pattern is only applied
/// to 2D vectors with at least one unit dim. For example:
///
/// Replace:
///   vector.transpose %0, [1, 0] : vector<4x1xi32> to vector<1x4xi32>
/// with:
///   vector.shape_cast %0 : vector<4x1xi32> to vector<1x4xi32>
///
/// A source with a leading unit dim (the inverse case) is also replaced. The
/// unit dim must be fixed. The non-unit dim can be scalable.
///
/// TODO: This pattern was introduced specifically to help lower scalable
/// vectors. In hindsight, a more specialised canonicalization (for shape_casts
/// to cancel out) would be preferable:
///
///  BEFORE:
///    %0 = some_op
///    %1 = vector.shape_cast %0 : vector<[4]xf32> to vector<[4]x1xf32>
///    %2 = vector.transpose %1, [1, 0] : vector<[4]x1xf32> to vector<1x[4]xf32>
///  AFTER:
///    %0 = some_op
///    %1 = vector.shape_cast %0 : vector<[4]xf32> to vector<1x[4]xf32>
///
/// Given the context above, we may want to consider (re-)moving this pattern
/// at some later time. I am leaving it for now in case there are other users
/// that I am not aware of.
class Transpose2DWithUnitDimToShapeCast
    : public OpRewritePattern<vector::TransposeOp> { … };

/// Rewrites a 2-D vector.transpose as a sequence of shuffle ops.
/// If the strategy is Shuffle1D, it will be lowered to:
///   vector.shape_cast 2D -> 1D
///   vector.shuffle
///   vector.shape_cast 1D -> 2D
/// If the strategy is Shuffle16x16, it will be lowered to a sequence of shuffle
/// ops on 16xf32 vectors.
class TransposeOp2DToShuffleLowering
    : public OpRewritePattern<vector::TransposeOp> { … };
} // namespace

/// Populates `patterns` with rewrite patterns that lower 'vector.transpose'.
/// NOTE(review): the body is not shown here. Presumably `options` selects which
/// lowering strategy is registered and `benefit` is the benefit assigned to the
/// added patterns — confirm against the implementation.
void mlir::vector::populateVectorTransposeLoweringPatterns(
    RewritePatternSet &patterns, VectorTransformsOptions options,
    PatternBenefit benefit) { … }
llvm/mlir/lib/Dialect/Vector/Transforms/LowerVectorTranspose.cpp