LoopVersioning.cpp | Explore in Territory

//===- LoopVersioning.cpp -------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

//===----------------------------------------------------------------------===//
/// \file
/// This pass looks for loops iterating over assumed-shape arrays, that can
/// be optimized by "guessing" that the stride is element-sized.
///
/// This is done by creating two versions of the same loop: one which assumes
/// that the elements are contiguous (stride == size of element), and one that
/// is the original generic loop.
///
/// As a side-effect of the assumed element size stride, the array is also
/// flattened to make it a 1D array - this is because the internal array
/// structure must be either 1D or have known sizes in all dimensions - and at
/// least one of the dimensions here is already unknown.
///
/// There are two distinct benefits here:
/// 1. The loop that iterates over the elements is somewhat simplified by the
///    constant stride calculation.
/// 2. Since the compiler can understand the size of the stride, it can use
///    vector instructions, where an unknown (at compile time) stride does often
///    prevent vector operations from being used.
///
/// A known drawback is that the code-size is increased, in some cases that can
/// be quite substantial - 3-4x is quite plausible (this includes that the loop
/// gets vectorized, which in itself often more than doubles the size of the
/// code, because unless the loop size is known, there will be a modulo
/// vector-size remainder to deal with).
///
/// TODO: Do we need some size limit where loops no longer get duplicated?
///       Maybe some sort of cost analysis.
/// TODO: Should some loop content - for example calls to functions and
///       subroutines - inhibit the versioning of the loops? Plausibly, this
///       could be part of the cost analysis above.
//===----------------------------------------------------------------------===//

#include "flang/ISO_Fortran_binding_wrapper.h"
#include "flang/Optimizer/Builder/BoxValue.h"
#include "flang/Optimizer/Builder/FIRBuilder.h"
#include "flang/Optimizer/Builder/Runtime/Inquiry.h"
#include "flang/Optimizer/Dialect/FIRDialect.h"
#include "flang/Optimizer/Dialect/FIROps.h"
#include "flang/Optimizer/Dialect/FIRType.h"
#include "flang/Optimizer/Dialect/Support/FIRContext.h"
#include "flang/Optimizer/Dialect/Support/KindMapping.h"
#include "flang/Optimizer/Support/DataLayout.h"
#include "flang/Optimizer/Transforms/Passes.h"
#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
#include "mlir/IR/Dominance.h"
#include "mlir/IR/Matchers.h"
#include "mlir/IR/TypeUtilities.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Transforms/DialectConversion.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
#include "mlir/Transforms/RegionUtils.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"

#include <algorithm>

namespace fir {
#define GEN_PASS_DEF_LOOPVERSIONING
#include "flang/Optimizer/Transforms/Passes.h.inc"
} // namespace fir

#define DEBUG_TYPE "flang-loop-versioning"

namespace {

/// The loop versioning pass described in the file header. It derives from the
/// tablegen-generated CRTP base; all of the work happens in runOnOperation(),
/// which is defined out of line further down in this file.
struct LoopVersioningPass
    : fir::impl::LoopVersioningBase<LoopVersioningPass> {
  void runOnOperation() override;
};

/// @struct ArgInfo
/// A structure to hold an argument, the size of the argument and dimension
/// information.
///
/// The scalar members carry default initializers so that a default-constructed
/// ArgInfo never holds indeterminate values; the struct is still an aggregate
/// and can be brace-initialized as {arg, size, rank, {}}.
struct ArgInfo {
  /// The array value being tracked. It starts out as a function argument and
  /// may later be replaced by the operand actually used by the indexing
  /// operation inside a loop.
  mlir::Value arg;
  /// Size of one array element, computed from the data layout and rounded up
  /// to the element alignment. It is compared against the stride of the first
  /// dimension to detect contiguous arrays.
  size_t size = 0;
  /// Number of dimensions of the array; only the first `rank` entries of
  /// `dims` are ever filled in.
  unsigned rank = 0;
  /// Per-dimension fir.box_dims operations, created right before the loop
  /// that is being versioned. Entries are null until then.
  fir::BoxDimsOp dims[CFI_MAX_RANK];
};

/// @struct ArgsUsageInLoop
/// A structure providing information about the function arguments
/// usage by the instructions immediately nested in a loop.
struct ArgsUsageInLoop {
  /// Mapping between the memref operand of an array indexing
  /// operation (e.g. fir.coordinate_of) and the argument information.
  llvm::DenseMap<mlir::Value, ArgInfo> usageInfo;
  /// Some array indexing operations inside a loop cannot be transformed.
  /// This vector holds the memref operands of such operations.
  /// The vector is used to make sure that we do not try to transform
  /// any outer loop, since this will imply the operation rewrite
  /// in this loop.
  llvm::SetVector<mlir::Value> cannotTransform;

  // Debug dump of the structure members assuming that
  // the information has been collected for the given loop.
  void dump(fir::DoLoopOp loop) const {
    LLVM_DEBUG({
      mlir::OpPrintingFlags printFlags;
      printFlags.skipRegions();
      llvm::dbgs() << "Arguments usage info for loop:\n";
      loop.print(llvm::dbgs(), printFlags);
      llvm::dbgs() << "\nUsed args:\n";
      for (auto &use : usageInfo) {
        mlir::Value v = use.first;
        v.print(llvm::dbgs(), printFlags);
        llvm::dbgs() << "\n";
      }
      llvm::dbgs() << "\nCannot transform args:\n";
      for (mlir::Value arg : cannotTransform) {
        arg.print(llvm::dbgs(), printFlags);
        llvm::dbgs() << "\n";
      }
      llvm::dbgs() << "====\n";
    });
  }

  // Erase usageInfo and cannotTransform entries for a set
  // of given arguments.
  void eraseUsage(const llvm::SetVector<mlir::Value> &args) {
    for (auto &arg : args)
      usageInfo.erase(arg);
    cannotTransform.set_subtract(args);
  }

  // Erase usageInfo and cannotTransform entries for a set
  // of given arguments provided in the form of usageInfo map.
  void eraseUsage(const llvm::DenseMap<mlir::Value, ArgInfo> &args) {
    for (auto &arg : args) {
      usageInfo.erase(arg.first);
      cannotTransform.remove(arg.first);
    }
  }
};
} // namespace

/// Strip the reference and pass-by-reference wrappers from the type of the
/// value pointed to by \p v and return what remains as a fir.array type.
/// The result is a null type when the unwrapped type is not a sequence type.
static fir::SequenceType getAsSequenceType(mlir::Value *v) {
  mlir::Type withoutRef = fir::unwrapRefType(v->getType());
  mlir::Type unwrapped = fir::unwrapPassByRefType(withoutRef);
  return mlir::dyn_cast<fir::SequenceType>(unwrapped);
}

/// If \p val is produced by a fir.declare, return the memref that the
/// fir.declare was applied to; otherwise return \p val unchanged.
static mlir::Value unwrapFirDeclare(mlir::Value val) {
  fir::DeclareOp declare = val.getDefiningOp<fir::DeclareOp>();
  if (!declare)
    return val;
  // fir.declare is for source code variables, and there are no declares of
  // declares, so looking through a single level is sufficient.
  return declare.getMemref();
}

/// If \p val is produced by a fir.rebox, return the box that was reboxed;
/// otherwise return \p val unchanged.
static mlir::Value unwrapReboxOp(mlir::Value val) {
  fir::ReboxOp rebox = val.getDefiningOp<fir::ReboxOp>();
  if (!rebox)
    return val;
  // Only one level is looked through: reboxes of reboxes are not supported.
  return rebox.getBox();
}

/// Normalise \p val so that values originating from function arguments are
/// easier to recognise: first look through a fir.rebox (if any), then through
/// a fir.declare (if any) on the result.
static mlir::Value normaliseVal(mlir::Value val) {
  mlir::Value withoutRebox = unwrapReboxOp(val);
  return unwrapFirDeclare(withoutRebox);
}

/// Some FIR operations accept a fir.shape, a fir.shift or a fir.shapeshift.
/// Of these, fir.shift and fir.shapeshift carry lower bounds (origins).
/// Returns the origin for dimension \p dim when \p shapeLike is defined by one
/// of those two operations, and a null value otherwise.
static mlir::Value tryGetLowerBoundsFromShapeLike(mlir::Value shapeLike,
                                                  unsigned dim) {
  // A value has a single defining operation, so at most one case applies.
  if (auto shapeShift = shapeLike.getDefiningOp<fir::ShapeShiftOp>())
    return shapeShift.getOrigins()[dim];
  if (auto shift = shapeLike.getDefiningOp<fir::ShiftOp>())
    return shift.getOrigins()[dim];
  // No lower bound information available (e.g. a plain fir.shape).
  return {};
}

/// Try to determine the lower bound of dimension \p dim (0 <= dim < rank) of
/// the memref argument of the fir.array_coor operation \p coop.
/// Returns a null value when the lower bound cannot be determined.
static mlir::Value getLowerBound(fir::ArrayCoorOp coop, unsigned dim) {
  // The only source consulted is the shape operand of the fir.array_coor.
  //
  // Deliberately do not fall back to reading the lower bound from the box:
  // in the FIR lowering, boxes will sometimes contain incorrect lower bound
  // information.
  mlir::Value shapeLike = coop.getShape();
  if (!shapeLike)
    return {};

  // Null when the shape operand carries no lower bounds.
  return tryGetLowerBoundsFromShapeLike(shapeLike, dim);
}

/// Returns the zero-based index used for dimension \p dim by the array
/// coordinate operation \p op, which must be a fir.coordinate_of or a
/// fir.array_coor. \p dim should range between 0 and rank - 1.
///
/// For fir.coordinate_of the coordinate operand is returned as is. For
/// fir.array_coor new operations computing "index - lower bound" are created
/// with \p builder at its current insertion point.
static mlir::Value getIndex(fir::FirOpBuilder &builder, mlir::Operation *op,
                            unsigned dim) {
  if (auto coordinate = mlir::dyn_cast<fir::CoordinateOp>(op))
    return coordinate.getCoor()[dim];

  auto arrayCoor = mlir::dyn_cast<fir::ArrayCoorOp>(op);
  assert(arrayCoor &&
         "operation must be either fir.coordiante_of or fir.array_coor");

  mlir::Location loc = arrayCoor.getLoc();
  mlir::Value rawIndex = arrayCoor.getIndices()[dim];
  mlir::Type indexTy = rawIndex.getType();

  // fir.coordinate_of indices start at 0, so the fir.array_coor index is
  // rebased by subtracting the lower bound. When the lower bound is not
  // known, the default lower bound of one is assumed.
  mlir::Value lowerBound = getLowerBound(arrayCoor, dim);
  if (!lowerBound)
    lowerBound = builder.createIntegerConstant(loc, indexTy, 1);

  // The subtraction needs both operands to have the same type.
  if (lowerBound.getType() != indexTy)
    lowerBound = builder.createConvert(loc, indexTy, lowerBound);
  return builder.create<mlir::arith::SubIOp>(loc, rawIndex, lowerBound);
}

void LoopVersioningPass::runOnOperation() {
  LLVM_DEBUG(llvm::dbgs() << "=== Begin " DEBUG_TYPE " ===\n");
  mlir::func::FuncOp func = getOperation();

  // First look for arguments with assumed shape = unknown extent in the lowest
  // dimension.
  LLVM_DEBUG(llvm::dbgs() << "Func-name:" << func.getSymName() << "\n");
  mlir::Block::BlockArgListType args = func.getArguments();
  mlir::ModuleOp module = func->getParentOfType<mlir::ModuleOp>();
  fir::KindMapping kindMap = fir::getKindMapping(module);
  mlir::SmallVector<ArgInfo, 4> argsOfInterest;
  std::optional<mlir::DataLayout> dl =
      fir::support::getOrSetDataLayout(module, /*allowDefaultLayout=*/false);
  if (!dl)
    mlir::emitError(module.getLoc(),
                    "data layout attribute is required to perform " DEBUG_TYPE
                    "pass");
  for (auto &arg : args) {
    // Optional arguments must be checked for IsPresent before
    // looking for the bounds. They are unsupported for the time being.
    if (func.getArgAttrOfType<mlir::UnitAttr>(arg.getArgNumber(),
                                              fir::getOptionalAttrName())) {
      LLVM_DEBUG(llvm::dbgs() << "OPTIONAL is not supported\n");
      continue;
    }

    if (auto seqTy = getAsSequenceType(&arg)) {
      unsigned rank = seqTy.getDimension();
      if (rank > 0 &&
          seqTy.getShape()[0] == fir::SequenceType::getUnknownExtent()) {
        size_t typeSize = 0;
        mlir::Type elementType = fir::unwrapSeqOrBoxedSeqType(arg.getType());
        if (mlir::isa<mlir::FloatType>(elementType) ||
            mlir::isa<mlir::IntegerType>(elementType) ||
            mlir::isa<fir::ComplexType>(elementType)) {
          auto [eleSize, eleAlign] = fir::getTypeSizeAndAlignmentOrCrash(
              arg.getLoc(), elementType, *dl, kindMap);
          typeSize = llvm::alignTo(eleSize, eleAlign);
        }
        if (typeSize)
          argsOfInterest.push_back({arg, typeSize, rank, {}});
        else
          LLVM_DEBUG(llvm::dbgs() << "Type not supported\n");
      }
    }
  }

  if (argsOfInterest.empty()) {
    LLVM_DEBUG(llvm::dbgs()
               << "No suitable arguments.\n=== End " DEBUG_TYPE " ===\n");
    return;
  }

  // A list of all loops in the function in post-order.
  mlir::SmallVector<fir::DoLoopOp> originalLoops;
  // Information about the arguments usage by the instructions
  // immediately nested in a loop.
  llvm::DenseMap<fir::DoLoopOp, ArgsUsageInLoop> argsInLoops;

  auto &domInfo = getAnalysis<mlir::DominanceInfo>();

  // Traverse the loops in post-order and see
  // if those arguments are used inside any loop.
  func.walk([&](fir::DoLoopOp loop) {
    mlir::Block &body = *loop.getBody();
    auto &argsInLoop = argsInLoops[loop];
    originalLoops.push_back(loop);
    body.walk([&](mlir::Operation *op) {
      // Support either fir.array_coor or fir.coordinate_of.
      if (!mlir::isa<fir::ArrayCoorOp, fir::CoordinateOp>(op))
        return;
      // Process only operations immediately nested in the current loop.
      if (op->getParentOfType<fir::DoLoopOp>() != loop)
        return;
      mlir::Value operand = op->getOperand(0);
      for (auto a : argsOfInterest) {
        if (a.arg == normaliseVal(operand)) {
          // Use the reboxed value, not the block arg when re-creating the loop.
          a.arg = operand;

          // Check that the operand dominates the loop?
          // If this is the case, record such operands in argsInLoop.cannot-
          // Transform, so that they disable the transformation for the parent
          /// loops as well.
          if (!domInfo.dominates(a.arg, loop))
            argsInLoop.cannotTransform.insert(a.arg);

          // No support currently for sliced arrays.
          // This means that we cannot transform properly
          // instructions referencing a.arg in the whole loop
          // nest this loop is located in.
          if (auto arrayCoor = mlir::dyn_cast<fir::ArrayCoorOp>(op))
            if (arrayCoor.getSlice())
              argsInLoop.cannotTransform.insert(a.arg);

          if (argsInLoop.cannotTransform.contains(a.arg)) {
            // Remove any previously recorded usage, if any.
            argsInLoop.usageInfo.erase(a.arg);
            break;
          }

          // Record the a.arg usage, if not recorded yet.
          argsInLoop.usageInfo.try_emplace(a.arg, a);
          break;
        }
      }
    });
  });

  // Dump loops info after initial collection.
  LLVM_DEBUG({
    llvm::dbgs() << "Initial usage info:\n";
    for (fir::DoLoopOp loop : originalLoops) {
      auto &argsInLoop = argsInLoops[loop];
      argsInLoop.dump(loop);
    }
  });

  // Clear argument usage for parent loops if an inner loop
  // contains a non-transformable usage.
  for (fir::DoLoopOp loop : originalLoops) {
    auto &argsInLoop = argsInLoops[loop];
    if (argsInLoop.cannotTransform.empty())
      continue;

    fir::DoLoopOp parent = loop;
    while ((parent = parent->getParentOfType<fir::DoLoopOp>()))
      argsInLoops[parent].eraseUsage(argsInLoop.cannotTransform);
  }

  // If an argument access can be optimized in a loop and
  // its descendant loop, then it does not make sense to
  // generate the contiguity check for the descendant loop.
  // The check will be produced as part of the ancestor
  // loop's transformation. So we can clear the argument
  // usage for all descendant loops.
  for (fir::DoLoopOp loop : originalLoops) {
    auto &argsInLoop = argsInLoops[loop];
    if (argsInLoop.usageInfo.empty())
      continue;

    loop.getBody()->walk([&](fir::DoLoopOp dloop) {
      argsInLoops[dloop].eraseUsage(argsInLoop.usageInfo);
    });
  }

  LLVM_DEBUG({
    llvm::dbgs() << "Final usage info:\n";
    for (fir::DoLoopOp loop : originalLoops) {
      auto &argsInLoop = argsInLoops[loop];
      argsInLoop.dump(loop);
    }
  });

  // Reduce the collected information to a list of loops
  // with attached arguments usage information.
  // The list must hold the loops in post order, so that
  // the inner loops are transformed before the outer loops.
  struct OpsWithArgs {
    mlir::Operation *op;
    mlir::SmallVector<ArgInfo, 4> argsAndDims;
  };
  mlir::SmallVector<OpsWithArgs, 4> loopsOfInterest;
  for (fir::DoLoopOp loop : originalLoops) {
    auto &argsInLoop = argsInLoops[loop];
    if (argsInLoop.usageInfo.empty())
      continue;
    OpsWithArgs info;
    info.op = loop;
    for (auto &arg : argsInLoop.usageInfo)
      info.argsAndDims.push_back(arg.second);
    loopsOfInterest.emplace_back(std::move(info));
  }

  if (loopsOfInterest.empty()) {
    LLVM_DEBUG(llvm::dbgs()
               << "No loops to transform.\n=== End " DEBUG_TYPE " ===\n");
    return;
  }

  // If we get here, there are loops to process.
  fir::FirOpBuilder builder{module, std::move(kindMap)};
  mlir::Location loc = builder.getUnknownLoc();
  mlir::IndexType idxTy = builder.getIndexType();

  LLVM_DEBUG(llvm::dbgs() << "Module Before transformation:");
  LLVM_DEBUG(module->dump());

  LLVM_DEBUG(llvm::dbgs() << "loopsOfInterest: " << loopsOfInterest.size()
                          << "\n");
  for (auto op : loopsOfInterest) {
    LLVM_DEBUG(op.op->dump());
    builder.setInsertionPoint(op.op);

    mlir::Value allCompares = nullptr;
    // Ensure all of the arrays are unit-stride.
    for (auto &arg : op.argsAndDims) {
      // Fetch all the dimensions of the array, except the last dimension.
      // Always fetch the first dimension, however, so set ndims = 1 if
      // we have one dim
      unsigned ndims = arg.rank;
      for (unsigned i = 0; i < ndims; i++) {
        mlir::Value dimIdx = builder.createIntegerConstant(loc, idxTy, i);
        arg.dims[i] = builder.create<fir::BoxDimsOp>(loc, idxTy, idxTy, idxTy,
                                                     arg.arg, dimIdx);
      }
      // We only care about lowest order dimension, here.
      mlir::Value elemSize =
          builder.createIntegerConstant(loc, idxTy, arg.size);
      mlir::Value cmp = builder.create<mlir::arith::CmpIOp>(
          loc, mlir::arith::CmpIPredicate::eq, arg.dims[0].getResult(2),
          elemSize);
      if (!allCompares) {
        allCompares = cmp;
      } else {
        allCompares =
            builder.create<mlir::arith::AndIOp>(loc, cmp, allCompares);
      }
    }

    auto ifOp =
        builder.create<fir::IfOp>(loc, op.op->getResultTypes(), allCompares,
                                  /*withElse=*/true);
    builder.setInsertionPointToStart(&ifOp.getThenRegion().front());

    LLVM_DEBUG(llvm::dbgs() << "Creating cloned loop\n");
    mlir::Operation *clonedLoop = op.op->clone();
    bool changed = false;
    for (auto &arg : op.argsAndDims) {
      fir::SequenceType::Shape newShape;
      newShape.push_back(fir::SequenceType::getUnknownExtent());
      auto elementType = fir::unwrapSeqOrBoxedSeqType(arg.arg.getType());
      mlir::Type arrTy = fir::SequenceType::get(newShape, elementType);
      mlir::Type boxArrTy = fir::BoxType::get(arrTy);
      mlir::Type refArrTy = builder.getRefType(arrTy);
      auto carg = builder.create<fir::ConvertOp>(loc, boxArrTy, arg.arg);
      auto caddr = builder.create<fir::BoxAddrOp>(loc, refArrTy, carg);
      auto insPt = builder.saveInsertionPoint();
      // Use caddr instead of arg.
      clonedLoop->walk([&](mlir::Operation *coop) {
        if (!mlir::isa<fir::CoordinateOp, fir::ArrayCoorOp>(coop))
          return;
        // Reduce the multi-dimensioned index to a single index.
        // This is required becase fir arrays do not support multiple dimensions
        // with unknown dimensions at compile time.
        // We then calculate the multidimensional array like this:
        // arr(x, y, z) bedcomes arr(z * stride(2) + y * stride(1) + x)
        // where stride is the distance between elements in the dimensions
        // 0, 1 and 2 or x, y and z.
        if (coop->getOperand(0) == arg.arg && coop->getOperands().size() >= 2) {
          builder.setInsertionPoint(coop);
          mlir::Value totalIndex;
          for (unsigned i = arg.rank - 1; i > 0; i--) {
            mlir::Value curIndex =
                builder.createConvert(loc, idxTy, getIndex(builder, coop, i));
            // Multiply by the stride of this array. Later we'll divide by the
            // element size.
            mlir::Value scale =
                builder.createConvert(loc, idxTy, arg.dims[i].getResult(2));
            curIndex =
                builder.create<mlir::arith::MulIOp>(loc, scale, curIndex);
            totalIndex = (totalIndex) ? builder.create<mlir::arith::AddIOp>(
                                            loc, curIndex, totalIndex)
                                      : curIndex;
          }
          // This is the lowest dimension - which doesn't need scaling
          mlir::Value finalIndex =
              builder.createConvert(loc, idxTy, getIndex(builder, coop, 0));
          if (totalIndex) {
            assert(llvm::isPowerOf2_32(arg.size) &&
                   "Expected power of two here");
            unsigned bits = llvm::Log2_32(arg.size);
            mlir::Value elemShift =
                builder.createIntegerConstant(loc, idxTy, bits);
            totalIndex = builder.create<mlir::arith::AddIOp>(
                loc,
                builder.create<mlir::arith::ShRSIOp>(loc, totalIndex,
                                                     elemShift),
                finalIndex);
          } else {
            totalIndex = finalIndex;
          }
          auto newOp = builder.create<fir::CoordinateOp>(
              loc, builder.getRefType(elementType), caddr,
              mlir::ValueRange{totalIndex});
          LLVM_DEBUG(newOp->dump());
          coop->getResult(0).replaceAllUsesWith(newOp->getResult(0));
          coop->erase();
          changed = true;
        }
      });

      builder.restoreInsertionPoint(insPt);
    }
    assert(changed && "Expected operations to have changed");

    builder.insert(clonedLoop);
    // Forward the result(s), if any, from the loop operation to the
    //
    mlir::ResultRange results = clonedLoop->getResults();
    bool hasResults = (results.size() > 0);
    if (hasResults)
      builder.create<fir::ResultOp>(loc, results);

    // Add the original loop in the else-side of the if operation.
    builder.setInsertionPointToStart(&ifOp.getElseRegion().front());
    op.op->replaceAllUsesWith(ifOp);
    op.op->remove();
    builder.insert(op.op);
    // Rely on "cloned loop has results, so original loop also has results".
    if (hasResults) {
      builder.create<fir::ResultOp>(loc, op.op->getResults());
    } else {
      // Use an assert to check this.
      assert(op.op->getResults().size() == 0 &&
             "Weird, the cloned loop doesn't have results, but the original "
             "does?");
    }
  }

  LLVM_DEBUG(llvm::dbgs() << "After transform:\n");
  LLVM_DEBUG(module->dump());

  LLVM_DEBUG(llvm::dbgs() << "=== End " DEBUG_TYPE " ===\n");
}
llvm/flang/lib/Optimizer/Transforms/LoopVersioning.cpp