HoistPadding.cpp | Explore in Territory

//===- HoistPadding.cpp - Hoisting for tensor::PadOp ----------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements functions concerned with hoisting padding operations.
//
//===----------------------------------------------------------------------===//

#include "mlir/Analysis/Presburger/IntegerRelation.h"
#include "mlir/Analysis/SliceAnalysis.h"
#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Affine/Transforms/Transforms.h"
#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/Dialect/Linalg/IR/Linalg.h"
#include "mlir/Dialect/Linalg/Transforms/Hoisting.h"
#include "mlir/Dialect/Linalg/Transforms/Transforms.h"
#include "mlir/Dialect/SCF/IR/SCF.h"
#include "mlir/Dialect/Tensor/Utils/Utils.h"
#include "mlir/Dialect/Utils/IndexingUtils.h"
#include "mlir/IR/AsmState.h"
#include "mlir/IR/Dominance.h"
#include "mlir/IR/Matchers.h"
#include "mlir/Interfaces/DestinationStyleOpInterface.h"
#include "mlir/Transforms/LoopInvariantCodeMotionUtils.h"
#include "mlir/Transforms/RegionUtils.h"
#include "llvm/Support/Debug.h"

dbgs;

#define DEBUG_TYPE …

#define DBGS() …

usingnamespacemlir;
usingnamespacemlir::linalg;
usingnamespacemlir::linalg::detail;

#ifndef NDEBUG
static bool debugPrintLoopInShortForm(Operation *op) {
  AsmState state(op->getParentOfType<func::FuncOp>());
  (void)state;
  if (auto forOp = dyn_cast<scf::ForOp>(op)) {
    forOp.getInductionVar().printAsOperand(dbgs(), state);
    dbgs() << " @ " << forOp.getOperation();
    return true;
  }
  return false;
}
#endif

static void debugPrintBackwardSlice(SetVector<Operation *> &backwardSlice) { … }

/// Return at most nLevels of immediately enclosing scf::ForOp loops.
/// Stops at the first parent that is not an scf::ForOp.
/// Multi-loops such as scf.parallel or linalg.tiled_loop are not modeled atm.
/// Control-flow and other containing ops with regions are not modeled atm.
static void
getAtMostNEnclosingLoops(tensor::PadOp padOp, int nLevels,
                         SmallVector<scf::ForOp> &reverseEnclosingLoops) { … }

/// Return at most nLevels of immediately enclosing scf::ForOp loops.
/// Stops at the first parent that is not an scf::ForOp.
/// Multi-loops such as scf.parallel or linalg.tiled_loop are not modeled atm.
/// Control-flow and other containing ops with regions are not modeled atm.
static void
getEnclosingLoopsUntil(tensor::PadOp padOp, scf::ForOp untilLoop,
                       SmallVector<scf::ForOp> &reverseEnclosingLoops) { … }

// Get all the ops in the backwards slice starting from `padOp` and that
// are dominated by the outermost enclosing loop.
// This also requires tracking ops defining values used in the region but
// defined above.
static void computeBackwardSlice(tensor::PadOp padOp,
                                 scf::ForOp outermostEnclosingForOp,
                                 SetVector<Operation *> &backwardSlice) { … }

//===----------------------------------------------------------------------===//
// HoistPaddingAnalysis Implementation.
//===----------------------------------------------------------------------===//

namespace {
/// Analysis class to support tensor::PadOp hoisting across multiple enclosing
/// loops. The failure conditions are:
///   1. Pad op has a use that is not an input of a LinalgOp.
///   2. Pad op does not have a constant padding value.
///   3. There is no immediately enclosing scf::ForOp.
///   4. The backward slice from the pad op to the scf::ForOp to hoist above
///      contains an unknown op with non index type operands, a region, or a
///      memory effect.
///   5. The backward slice from the pad op to the scf::ForOp to hoist above is
///      empty.
///   6. The source tensor of pad op is not defined by an extract slice op.
///   7. The source tensor of the extract slice op is not defined outside of
///      the outermost enclosing scf::ForOp.
///   8. There is no enclosing scf::ForOp that indexes the padded data.
/// Other cases succeed and will trigger hoisting of the pad op.
struct HoistPaddingAnalysis { … };

} // namespace

HoistPaddingAnalysis::HoistPaddingAnalysis(tensor::PadOp padOp, int numLoops)
    : … { … }

HoistPaddingAnalysis::HoistPaddingAnalysis(tensor::PadOp padOp,
                                           scf::ForOp outermostEnclosingForOp)
    : … { … }

void HoistPaddingAnalysis::enableHoistPadding(RewriterBase &rewriter) { … }

void HoistPaddingAnalysis::finalizeHoistPaddingAnalysis() { … }

LogicalResult HoistPaddingAnalysis::dropNonIndexDependencies() { … }

SmallVector<Value>
HoistPaddingAnalysis::getHoistedPackedTensorSizes(RewriterBase &rewriter,
                                                  Location loc) const { … }

static bool isDefinedOutsideOrConstant(scf::ForOp outer, Value v) { … }

//===----------------------------------------------------------------------===//
// buildPackingLoopNest Implementation.
//===----------------------------------------------------------------------===//

/// Return the current iteration number in the loop (iv - lb).ceilDiv(step).
/// The returned Value is guaranteed not to depend on any loop comprised in
/// [`outer`, `forOp`].
/// Return null if such a loop-independent quantity cannot be computed.
static Value buildLoopIterationCount(RewriterBase &rewriter, scf::ForOp outer,
                                     scf::ForOp forOp) { … }

// Build a packing loop nest by iteratively traversing the backward slice and
// clone the operations, iteratively stepping into the loops that we encounter.
// The implementation proceeds in a stack-like fashion:
//   1. Iteratively clone and step into the loops, pushing the
//   `hoistedPackedTensor`
//      deeper in the stack.
//   2. At the innermost loop level, create a GenericOp if `transposeVector` is
//      non-empty.
//   3. At the innermost loop level, create a InsertSliceOp.
//   4. Iteratively pop and yield the result of the InsertSliceOp across the
//      cloned loops.
static FailureOr<PackingResult> buildPackingLoopNestImpl(
    RewriterBase &rewriter, IRMapping &bvm, tensor::PadOp opToHoist,
    ArrayRef<int64_t> transposeVector, RankedTensorType transposedTensorType,
    tensor::EmptyOp emptyOp, const HoistPaddingAnalysis &analysis) { … }

/// Build the packing loop nest required to hoist `opToHoist` above
/// `outermostEnclosingForOp`.
/// The loop nest is built just before `outermostEnclosingForOp`.
static FailureOr<PackingResult> buildPackingLoopNestImpl(
    RewriterBase &rewriter, IRMapping &bvm, tensor::PadOp opToHoist,
    ArrayRef<int64_t> transposeVector, const HoistPaddingAnalysis &analysis) { … }

/// Build the packing loop nest required to hoist `opToHoist` above
/// `outermostEnclosingForOp`.
/// The loop nest is built just before `outermostEnclosingForOp`.
FailureOr<PackingResult> mlir::linalg::detail::buildPackingLoopNest(
    RewriterBase &rewriter, tensor::PadOp opToHoist,
    scf::ForOp outermostEnclosingForOp, ArrayRef<int64_t> transposeVector) { … }

//===----------------------------------------------------------------------===//
// hoistPaddingOnTensors Implementation.
//===----------------------------------------------------------------------===//

/// Return true if we can walk back the use-def chain from `extractSliceOp` to
/// expectedSource going through DestinationStyleOpInterface inits only.
/// This is a poor man's analysis that is sufficient to check the extractSliceOp
/// the matches tensor.pad we want to hoist.
/// In the future, it will be easier to ensure this with a matching symmetric
/// tensor.unpad op.
static bool tracesBackToExpectedValue(tensor::ExtractSliceOp extractSliceOp,
                                      Value expectedSource) { … }

/// If the original consumer of `outerSliceOp` was a `forOp` (i.e. through an
/// iter arg), propagate the `hoistedPackedTensor` value through the same iter
/// arg.
/// TODO: for multiple loops we need to track the use to the innermost loop.
///
/// Match:
/// ```
///   %outerSliceOp = tensor.extract_slice ..
///   %f = scf.for ... iter_args(%arg0 = %outerSliceOp) {
///     %hoistedPackedTensor = tensor.pad %arg0
///     %1 = compute %hoistedPackedTensor
///     %2 = tensor.extract_slice %1
///     scf.yield %2
///   }
/// ```
///
/// and rewrite as:
/// ```
///   %outerSliceOp = tensor.extract_slice ..
///   %hoistedPackedTensor = tensor.pad %outerSliceOp
///   %f = scf.for ... iter_args(%arg0 = %hoistedPackedTensor) {
///     %1 = compute %arg0
///     scf.yield %1
///   }
///   %2 = tensor.extract_slice %forOp
/// ```
///
/// Return null when no rewrite happened.
static tensor::ExtractSliceOp
padThroughLoopIterArg(RewriterBase &rewriter, Value paddedValueBeforeHoisting,
                      Value hoistedPackedTensor,
                      tensor::ExtractSliceOp outerSliceOp, scf::ForOp forOp) { … }

/// Produce a tensor extracted from the packingResult. This can be used as a
/// replacement for `opToHoist` in callers.
static Value replaceByPackingResult(RewriterBase &rewriter,
                                    const IRMapping &bvm,
                                    tensor::PadOp opToHoist,
                                    RankedTensorType transposedTensorType,
                                    const HoistPaddingAnalysis &analysis,
                                    const PackingResult &packingResult) { … }

FailureOr<Value> mlir::linalg::hoistPaddingOnTensors(
    RewriterBase &rewriter, tensor::PadOp opToHoist, int64_t numLoops,
    ArrayRef<int64_t> transposeVector, tensor::PadOp &hoistedOp,
    SmallVectorImpl<TransposeOp> &transposeOps) { … }

FailureOr<Value> mlir::linalg::hoistPaddingOnTensors(
    tensor::PadOp opToHoist, int64_t numLoops,
    ArrayRef<int64_t> transposeVector, tensor::PadOp &hoistedOp,
    SmallVectorImpl<TransposeOp> &transposeOps) { … }
llvm/mlir/lib/Dialect/Linalg/Transforms/HoistPadding.cpp