//===- DataLayoutPropagation.cpp ------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "mlir/Dialect/Linalg/Passes.h"

#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Linalg/IR/Linalg.h"
#include "mlir/Dialect/Linalg/Transforms/Transforms.h"
#include "mlir/Dialect/Linalg/Utils/Utils.h"
#include "mlir/Dialect/Tensor/IR/Tensor.h"
#include "mlir/Dialect/Tensor/Utils/Utils.h"
#include "mlir/Dialect/Utils/IndexingUtils.h"
#include "mlir/IR/Dominance.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
#include "llvm/ADT/SetOperations.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/TypeSwitch.h"
#include "llvm/Support/Debug.h"
#include <optional>

namespace mlir {
#define GEN_PASS_DEF_LINALGDATALAYOUTPROPAGATION
#include "mlir/Dialect/Linalg/Passes.h.inc"
} // namespace mlir

using namespace mlir;
using namespace mlir::linalg;

#define DEBUG_TYPE …

namespace {

static bool hasGatherSemantics(linalg::GenericOp genericOp) { … }

// The struct contains the information about mapping packing information to
// the iteration domain of Linalg ops.
struct PackInfo { … };

template <typename OpTy>
static FailureOr<PackInfo>
getPackingInfoFromOperand(OpOperand *opOperand, linalg::GenericOp genericOp,
                          OpTy packOrUnPackOp) { … }

static SmallVector<int64_t> computeOuterDims(ArrayRef<int64_t> perm,
                                             ArrayRef<AffineExpr> exprs) { … }

/// Returns a tuple for packed operand and indexing_map with the assumptions:
///   1) The generic op is the producer of the pack op.
///   2) The generic op has only one result.
/// If the operand is a scalar or packing dimensions are all irrelevant to the
/// operand, the operand and the updated indexing map will be returned.
/// Otherwise, it returns the packed operand and the updated indexing map.
/// E.g.,
///
///   #map0 = affine_map<(d0, d1) -> (d0, d1)>
///   #map1 = affine_map<(d0, d1) -> (d0)>
///   #map2 = affine_map<(d0, d1) -> (d1)>
///   %0 = linalg.generic {indexing_maps = [#map1, #map2, #map0],
///                        iterator_types = ["parallel", "parallel"]}
///       ins(%arg0, %arg1 : tensor<?xf32>, tensor<?xf32>)
///       outs(%init : tensor<?x?xf32>) {
///     ^bb0(%arg3: f32, %arg4: f32, %arg5: f32):
///       %4 = arith.addf %arg3, %arg4 : f32
///       linalg.yield %4 : f32
///   } -> tensor<?x?xf32>
///   %1 = tensor.pack %0
///     inner_dims_pos = [0, 1]
///     inner_tiles = [8, 2]
///     into %dest : tensor<?x?xf32> -> tensor<?x?x8x2xf32>
///
/// Taking the first input operand as an example, the inner tile size of d1 is
/// 8. Thus, the below operation and the updated indexing map
/// `affine_map<(d0, d1, d2, d3) -> (d1, d3)>` will be returned.
///
///   %pack = tensor.pack %arg0
///     inner_dims_pos = [0]
///     inner_tiles = [8]
///     into %init : tensor<?xf32> -> tensor<?x8xf32>
static std::tuple<Value, AffineMap>
getOrCreatePackedViewOfOperand(OpBuilder &b, Location loc, PackInfo packInfo,
                               GenericOp genericOp, OpOperand *opOperand) { … }

/// Pack a genericOp and return it.
static GenericOp packGenericOp(RewriterBase &rewriter, GenericOp genericOp,
                               Value dest, AffineMap packedOutIndexingMap,
                               const PackInfo &packInfo) { … }

/// Bubbles up tensor.pack op through a producer generic op. This swaps
/// pack(generic) to generic(pack). The new generic op works on the packed
/// domain; pack ops are created for input and output operands. E.g.,
///
///     #map0 = affine_map<(d0, d1) -> (d0, d1)>
///     %0 = tensor.dim %arg0, %c0 : tensor<?x?xf32>
///     %1 = tensor.dim %arg0, %c1 : tensor<?x?xf32>
///     %2 = tensor.empty(%0, %1) : tensor<?x?xf32>
///     %3 = linalg.generic {indexing_maps = [#map0, #map0],
///                          iterator_types = ["parallel", "parallel"]}
///         ins(%arg0 : tensor<?x?xf32>)
///         outs(%2 : tensor<?x?xf32>) {
///       ^bb0(%arg3: f32, %arg4: f32):
///         %4 = arith.addf %arg3, %arg3 : f32
///         linalg.yield %4 : f32
///     } -> tensor<?x?xf32>
///     %4 = tensor.pack %3
///       inner_dims_pos = [0, 1]
///       inner_tiles = [8, 2]
///       into %dest : tensor<?x?xf32> -> tensor<?x?x8x2xf32>
///
/// will be converted to
///
///     #map = affine_map<()[s0] -> (s0 ceildiv 8)>
///     #map1 = affine_map<()[s0] -> (s0 ceildiv 2)>
///     #map2 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
///     %dim = tensor.dim %arg0, %c0 : tensor<?x?xf32>
///     %dim_0 = tensor.dim %arg0, %c1 : tensor<?x?xf32>
///     %0 = affine.apply #map()[%dim]
///     %1 = affine.apply #map1()[%dim_0]
///     %2 = tensor.empty(%0, %1) : tensor<?x?x8x2xf32>
///     %pack = tensor.pack %arg0
///       inner_dims_pos = [0, 1]
///       inner_tiles = [8, 2]
///       into %2 : tensor<?x?xf32> -> tensor<?x?x8x2xf32>
///     %3 = linalg.generic {indexing_maps = [#map2, #map2],
///         iterator_types = ["parallel", "parallel", "parallel", "parallel"]}
///         ins(%pack : tensor<?x?x8x2xf32>)
///         outs(%arg1 : tensor<?x?x8x2xf32>) {
///       ^bb0(%in: f32, %out: f32):
///         %4 = arith.addf %in, %in : f32
///         linalg.yield %4 : f32
///     } -> tensor<?x?x8x2xf32>
static FailureOr<GenericOp>
bubbleUpPackOpThroughGenericOp(RewriterBase &rewriter, tensor::PackOp packOp,
                               const ControlPropagationFn &controlFn) { … }

/// Wrapper pattern that applies bubbleUpPackOpThroughGenericOp method.
struct BubbleUpPackOpThroughGenericOpPattern
    : public OpRewritePattern<tensor::PackOp> { … };

/// Propagate a tensor.pack operation up through a tensor.pad. The idea is to
/// add as many zero padding dimensions in `high` and `low` based on the number
/// of point loops.
class BubbleUpPackThroughPadOp final
    : public OpRewritePattern<tensor::PackOp> { … };
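
// An illustrative sketch of the pad propagation above (hand-written for
// exposition, not taken from the implementation or its tests; the shapes and
// names are made up, and it assumes the padded dims stay disjoint from the
// tiled dims). Zero low/high entries are appended for the point loops:
//
//   %padded = tensor.pad %src low[0, 0, 1, 1] high[0, 0, 1, 1] {
//     ^bb0(%i0: index, %i1: index, %i2: index, %i3: index):
//       tensor.yield %cst : f32
//   } : tensor<1x64x56x56xf32> to tensor<1x64x58x58xf32>
//   %pack = tensor.pack %padded inner_dims_pos = [1] inner_tiles = [32]
//       into %dest : tensor<1x64x58x58xf32> -> tensor<1x2x58x58x32xf32>
//
// might become
//
//   %pack = tensor.pack %src inner_dims_pos = [1] inner_tiles = [32]
//       into %empty : tensor<1x64x56x56xf32> -> tensor<1x2x56x56x32xf32>
//   %padded = tensor.pad %pack low[0, 0, 1, 1, 0] high[0, 0, 1, 1, 0] {
//     ^bb0(%i0: index, %i1: index, %i2: index, %i3: index, %i4: index):
//       tensor.yield %cst : f32
//   } : tensor<1x2x56x56x32xf32> to tensor<1x2x58x58x32xf32>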
/// Project dimsPos to the inner-most non-unit dim pos with reassocIndices.
///
/// For example, given dimsPos [0, 2], reassocIndices [[0, 1], [2, 3]], and
/// targetShape [16, 16, 32, 1], it returns [1, 2]. Because for pos 0, the
/// inner-most projected dim in pos [0, 1] is 1. And for pos 2, the inner-most
/// non-unit projected dim in pos [2, 3] is 2.
///
/// If all candidates in a reassociation are unit dims, it chooses the
/// inner-most dim pos.
static SmallVector<int64_t>
projectToInnerMostNonUnitDimsPos(ArrayRef<int64_t> dimsPos,
                                 ArrayRef<ReassociationIndices> reassocIndices,
                                 ArrayRef<int64_t> targetShape) { … }

/// Check if all dims in dimsPos are divisible by the corresponding tile sizes.
static bool isDimsDivisibleByTileSizes(ArrayRef<int64_t> dimsPos,
                                       ArrayRef<int64_t> shape,
                                       ArrayRef<int64_t> tileSizes) { … }

/// Permute the reassociation indices and reindex them in the sequence order.
/// Returns the next dim pos in the sequence.
///
/// For example, given reassocIndices [[0, 1], [2]] and permutation [1, 0], it
/// applies the permutation to get [[2], [0, 1]] and reindexes the indices into
/// [[0], [1, 2]].
static int64_t applyPermutationAndReindexReassoc(
    SmallVector<ReassociationIndices> &reassocIndices,
    ArrayRef<int64_t> permutation) { … }
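
// A self-contained sketch of the permute-then-reindex step documented above
// (illustrative only, not the implementation; it assumes the usual
// applyPermutationToVector helper from mlir/Dialect/Utils/IndexingUtils.h):
//
//   SmallVector<ReassociationIndices> reassoc = {{0, 1}, {2}};
//   applyPermutationToVector(reassoc, {1, 0}); // -> {{2}, {0, 1}}
//   int64_t nextPos = 0;
//   for (ReassociationIndices &indices : reassoc)
//     for (int64_t &idx : indices)
//       idx = nextPos++;                       // -> {{0}, {1, 2}}, nextPos == 3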
/// Bubble up pack op through collapse shape op when the packed dims can be
/// projected to the dims before collapsing. This is possible when the inner
/// tile sizes can divide the projected dims.
///
/// For example:
///
///     %collapsed = tensor.collapse_shape %in [[0, 1], [2]]
///         : tensor<?x16x4xf32> into tensor<?x4xf32>
///     %pack = tensor.pack %collapsed outer_dims_perm = [0, 1]
///         inner_dims_pos = [0, 1] inner_tiles = [8, 1] into %empty
///         : tensor<?x4xf32> -> tensor<?x4x8x1xf32>
///
/// can be transformed into:
///
///     %pack = tensor.pack %in outer_dims_perm = [1, 2]
///         inner_dims_pos = [1, 2] inner_tiles = [8, 1] into %empty
///         : tensor<?x16x4xf32> -> tensor<?x2x4x8x1xf32>
///     %collapsed = tensor.collapse_shape %pack [[0, 1], [2], [3], [4]]
///         : tensor<?x2x4x8x1xf32> into tensor<?x4x8x1xf32>
static LogicalResult
bubbleUpPackOpThroughCollapseShape(tensor::CollapseShapeOp collapseOp,
                                   tensor::PackOp packOp,
                                   PatternRewriter &rewriter) { … }

/// Project dimsPos to their collapsed positions in the reassocIndices.
///
/// For example, given dimsPos [0, 1, 2, 4], and matching reassocIndices
/// [[0], [1, 2], [3], [4]], it returns [0, 1, 1, 3]. Because for pos 0,
/// the reassoc dim [0] is 0. For pos 1 and 2, the reassoc dim in pos
/// [1, 2] is 1. And for pos 4, the reassoc dim [4] is 3.
static SmallVector<int64_t>
projectDimsPosIntoReassocPos(ArrayRef<int64_t> dimsPos,
                             ArrayRef<ReassociationIndices> reassocIndices) { … }

/// Bubble up pack op through expand shape op.
///
/// For example:
///
///     %expand = tensor.expand_shape %in [[0], [1, 2]]
///         : tensor<?x64xf32> into tensor<?x4x16xf32>
///     %pack = tensor.pack %expand outer_dims_perm = [0, 1]
///         inner_dims_pos = [2] inner_tiles = [8] into %empty
///         : tensor<?x4x16xf32> -> tensor<?x4x2x8xf32>
///
/// can be transformed into:
///
///     %pack = tensor.pack %in outer_dims_perm = [1, 2]
///         inner_dims_pos = [1] inner_tiles = [8] into %empty
///         : tensor<?x64xf32> -> tensor<?x8x8xf32>
///     %expand = tensor.expand_shape %pack [[0], [1, 2], [3]]
///         : tensor<?x8x8xf32> into tensor<?x4x2x8xf32>
static LogicalResult
bubbleUpPackOpThroughExpandShape(tensor::ExpandShapeOp expandOp,
                                 tensor::PackOp packOp,
                                 PatternRewriter &rewriter) { … }

class BubbleUpPackOpThroughReshapeOp final
    : public OpRewritePattern<tensor::PackOp> { … };

/// Push down unpack op through expand shape op when the packed dims can be
/// projected to the dims after expanding. This is possible when the inner tile
/// sizes can divide the projected dims.
///
/// For example:
///
///     %unpack = tensor.unpack %in outer_dims_perm = [0, 1]
///         inner_dims_pos = [0, 1] inner_tiles = [8, 8] into %empty
///         : tensor<?x32x8x8xf32> -> tensor<?x256xf32>
///     %expanded = tensor.expand_shape %unpack [[0, 1], [2]]
///         : tensor<?x256xf32> into tensor<?x256x256xf32>
///
/// can be transformed into:
///
///     %expanded = tensor.expand_shape %in [[0, 1], [2], [3], [4]]
///         : tensor<?x32x8x8xf32> into tensor<?x32x32x8x8xf32>
///     %unpack = tensor.unpack %expanded outer_dims_perm = [0, 1, 2]
///         inner_dims_pos = [1, 2] inner_tiles = [8, 8] into %empty
///         : tensor<?x32x32x8x8xf32> -> tensor<?x256x256xf32>
static LogicalResult pushDownUnPackOpThroughExpandShape(
    tensor::UnPackOp unPackOp, tensor::ExpandShapeOp expandOp,
    PatternRewriter &rewriter, ControlPropagationFn controlFn) { … }

class PushDownUnPackOpThroughReshapeOp final
    : public OpRewritePattern<tensor::UnPackOp> { … };

// TODO: Relax this restriction. We should unpack a generic op also
// in the presence of multiple unpack ops as producers.
/// Return the unpacked operand, if present, for the current generic op.
static FailureOr<OpOperand *> getUnPackedOperand(GenericOp genericOp) { … }

/// Push down a tensor.unpack op through a generic op.
/// The new generic op works on packed domain; pack ops are created for input
/// and output operands. A tensor.unpack op is inserted right after the packed
/// generic. E.g.
///
///     #map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
///
///     %arg0 = tensor<12x2x56x56x32xf32> // packed arg.
///
///     %0 = tensor.empty() : tensor<12x56x56x64xf32>
///     %1 = tensor.unpack %arg0 outer_dims_perm = [0, 3, 1, 2]
///                              inner_dims_pos = [3] inner_tiles = [32]
///                              into %0
///     %2 = linalg.generic {indexing_maps = [#map],
///          iterator_types = ["parallel", "parallel", "parallel", "parallel"]}
///          outs(%1 : tensor<12x56x56x64xf32>) {
///       ^bb0(%out : f32):
///         linalg.yield %out : f32
///     } -> tensor<12x56x56x64xf32>
///
/// will be converted to
///
///     #map = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3, d4)>
///
///     %0 = tensor.empty() : tensor<12x56x56x64xf32>
///     %1 = linalg.generic {indexing_maps = [#map],
///          iterator_types = ["parallel", "parallel", "parallel",
///                            "parallel", "parallel"]}
///          outs(%arg0 : tensor<12x2x56x56x32xf32>) {
///       ^bb0(%out : f32):
///         linalg.yield %out : f32
///     } -> tensor<12x2x56x56x32xf32>
///     %2 = tensor.unpack %1 outer_dims_perm = [0, 3, 1, 2]
///                           inner_dims_pos = [3] inner_tiles = [32]
///                           into %0
///
static FailureOr<std::tuple<GenericOp, Value>>
pushDownUnPackOpThroughGenericOp(RewriterBase &rewriter, GenericOp genericOp,
                                 ControlPropagationFn controlFn) { … }

// Wrapper pattern that applies pushDownUnPackOpThroughGenericOp method.
struct PushDownUnPackOpThroughGenericOp
    : public OpRewritePattern<GenericOp> { … };

/// Propagate a tensor.unpack operation through a tensor.pad. The idea is to
/// add as many zero padding dimensions in `high` and `low` based on the number
/// of point loops.
struct PushDownUnPackThroughPadOp : public OpRewritePattern<tensor::PadOp> { … };
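
// For the unpack/pad case above, an illustrative sketch mirroring the pack/pad
// bubbling earlier in this file (hand-written; shapes and names are made up,
// the pad region yielding %cst is elided, and it assumes the padded dims stay
// disjoint from the tiled dims):
//
//   %unpack = tensor.unpack %arg0 inner_dims_pos = [1] inner_tiles = [32]
//       into %empty : tensor<1x2x56x56x32xf32> -> tensor<1x64x56x56xf32>
//   %padded = tensor.pad %unpack low[0, 0, 1, 1] high[0, 0, 1, 1] { ... }
//       : tensor<1x64x56x56xf32> to tensor<1x64x58x58xf32>
//
// might become
//
//   %padded = tensor.pad %arg0 low[0, 0, 1, 1, 0] high[0, 0, 1, 1, 0] { ... }
//       : tensor<1x2x56x56x32xf32> to tensor<1x2x58x58x32xf32>
//   %unpack = tensor.unpack %padded inner_dims_pos = [1] inner_tiles = [32]
//       into %empty2 : tensor<1x2x58x58x32xf32> -> tensor<1x64x58x58xf32>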
} // namespace

void mlir::linalg::populateDataLayoutPropagationPatterns(
    RewritePatternSet &patterns,
    const ControlPropagationFn &controlPackUnPackPropagation) { … }
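
// A minimal usage sketch (illustrative only, not part of this file): given an
// MLIRContext *ctx and a root Operation *op, clients typically feed these
// patterns to the greedy rewriter. The always-true control callback below is
// an assumption; real clients can filter which candidates propagation applies
// to.
//
//   RewritePatternSet patterns(ctx);
//   linalg::populateDataLayoutPropagationPatterns(
//       patterns, [](auto *) { return true; });
//   (void)applyPatternsAndFoldGreedily(op, std::move(patterns));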