llvm/flang/lib/Optimizer/Transforms/MemoryUtils.cpp

//===- MemoryUtils.cpp ----------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "flang/Optimizer/Transforms/MemoryUtils.h"
#include "flang/Optimizer/Builder/FIRBuilder.h"
#include "flang/Optimizer/Builder/Todo.h"
#include "mlir/Dialect/OpenACC/OpenACC.h"
#include "mlir/IR/Dominance.h"
#include "llvm/ADT/STLExtras.h"

namespace {
/// Helper class to detect if an alloca is inside an mlir::Block that can be
/// reached again before its deallocation points via block successors. This
/// analysis is only valid if the deallocation points are inside (or nested
/// inside) the same region as alloca because it does not consider region CFG
/// (for instance, the block inside a fir.do_loop is obviously inside a loop,
/// but is not a loop formed by blocks). The dominance of the alloca on its
/// deallocation points implies this pre-condition (although it is more
/// restrictive).
class BlockCycleDetector {
public:
  bool allocaIsInCycle(fir::AllocaOp alloca,
                       llvm::ArrayRef<mlir::Operation *> deallocationPoints);

private:
  // Cache for blocks owning alloca that have been analyzed. In many Fortran
  // programs, allocas are usually made in the same blocks with no block cycles.
  // So getting a fast "no" is beneficial.
  llvm::DenseMap<mlir::Block *, /*isInCycle*/ bool> analyzed;
};
} // namespace

namespace {
class AllocaReplaceImpl {
public:
  AllocaReplaceImpl(fir::AllocaRewriterCallBack allocaRewriter,
                    fir::DeallocCallBack deallocGenerator)
      : allocaRewriter{allocaRewriter}, deallocGenerator{deallocGenerator} {}
  bool replace(mlir::RewriterBase &, fir::AllocaOp);

private:
  mlir::Region *findDeallocationPointsAndOwner(
      fir::AllocaOp alloca,
      llvm::SmallVectorImpl<mlir::Operation *> &deallocationPoints);
  bool
  allocDominatesDealloc(fir::AllocaOp alloca,
                        llvm::ArrayRef<mlir::Operation *> deallocationPoints) {
    return llvm::all_of(deallocationPoints, [&](mlir::Operation *deallocPoint) {
      return this->dominanceInfo.properlyDominates(alloca.getOperation(),
                                                   deallocPoint);
    });
  }
  void
  genIndirectDeallocation(mlir::RewriterBase &, fir::AllocaOp,
                          llvm::ArrayRef<mlir::Operation *> deallocationPoints,
                          mlir::Value replacement, mlir::Region &owningRegion);

private:
  fir::AllocaRewriterCallBack allocaRewriter;
  fir::DeallocCallBack deallocGenerator;
  mlir::DominanceInfo dominanceInfo;
  BlockCycleDetector blockCycleDetector;
};
} // namespace

static bool
allocaIsInCycleImpl(mlir::Block *allocaBlock,
                    llvm::ArrayRef<mlir::Operation *> deallocationPoints) {
  llvm::DenseSet<mlir::Block *> seen;
  // Insert the deallocation point blocks as "seen" so that the block
  // traversal will stop at them.
  for (mlir::Operation *deallocPoint : deallocationPoints)
    seen.insert(deallocPoint->getBlock());
  if (seen.contains(allocaBlock))
    return false;
  // Traverse the block successor graph starting by the alloca block.
  llvm::SmallVector<mlir::Block *> successors{allocaBlock};
  while (!successors.empty())
    for (mlir::Block *next : successors.pop_back_val()->getSuccessors()) {
      if (next == allocaBlock)
        return true;
      if (auto pair = seen.insert(next); pair.second)
        successors.push_back(next);
    }
  // The traversal did not reach the alloca block again.
  return false;
}
bool BlockCycleDetector::allocaIsInCycle(
    fir::AllocaOp alloca,
    llvm::ArrayRef<mlir::Operation *> deallocationPoints) {
  mlir::Block *allocaBlock = alloca->getBlock();
  auto analyzedPair = analyzed.try_emplace(allocaBlock, /*isInCycle=*/false);
  bool alreadyAnalyzed = !analyzedPair.second;
  bool &isInCycle = analyzedPair.first->second;
  // Fast exit if block was already analyzed and no cycle was found.
  if (alreadyAnalyzed && !isInCycle)
    return false;
  // If the analysis was not done generically for this block, run it and
  // save the result.
  if (!alreadyAnalyzed)
    isInCycle = allocaIsInCycleImpl(allocaBlock, /*deallocationPoints*/ {});
  if (!isInCycle)
    return false;
  // If the generic analysis found a block loop, see if the deallocation
  // point would be reached before reaching the block again. Do not
  // cache that analysis that is specific to the deallocation points
  // found for this alloca.
  return allocaIsInCycleImpl(allocaBlock, deallocationPoints);
}

static bool terminatorYieldsMemory(mlir::Operation &terminator) {
  return llvm::any_of(terminator.getResults(), [](mlir::OpResult res) {
    return fir::conformsWithPassByRef(res.getType());
  });
}

static bool isRegionTerminator(mlir::Operation &terminator) {
  // Using ReturnLike trait is tempting but it is not set on
  // all region terminator that matters (like omp::TerminatorOp that
  // has no results).
  // May be true for dead code. It is not a correctness issue and dead code can
  // be eliminated by running region simplification before this utility is
  // used.
  // May also be true for unreachable like terminators (e.g., after an abort
  // call related to Fortran STOP). This is also OK, the inserted deallocation
  // will simply never be reached. It is easier for the rest of the code here
  // to assume there is always at least one deallocation point, so keep
  // unreachable terminators.
  return !terminator.hasSuccessors();
}

mlir::Region *AllocaReplaceImpl::findDeallocationPointsAndOwner(
    fir::AllocaOp alloca,
    llvm::SmallVectorImpl<mlir::Operation *> &deallocationPoints) {
  // Step 1: Identify the operation and region owning the alloca.
  mlir::Region *owningRegion = alloca.getOwnerRegion();
  if (!owningRegion)
    return nullptr;
  mlir::Operation *owningOp = owningRegion->getParentOp();
  assert(owningOp && "region expected to be owned");
  // Step 2: Identify the exit points of the owning region, they are the default
  // deallocation points. TODO: detect and use lifetime markers to get earlier
  // deallocation points.
  bool isOpenACCMPRecipe = mlir::isa<mlir::accomp::RecipeInterface>(owningOp);
  for (mlir::Block &block : owningRegion->getBlocks())
    if (mlir::Operation *terminator = block.getTerminator();
        isRegionTerminator(*terminator)) {
      // FIXME: OpenACC and OpenMP privatization recipe are stand alone
      // operation meant to be later "inlined", the value they return may
      // be the address of a local alloca. It would be incorrect to insert
      // deallocation before the terminator (this would introduce use after
      // free once the recipe is inlined.
      // This probably require redesign or special handling on the OpenACC/MP
      // side.
      if (isOpenACCMPRecipe && terminatorYieldsMemory(*terminator))
        return nullptr;
      deallocationPoints.push_back(terminator);
    }
  // If no block terminators without successors have been found, this is
  // an odd region we cannot reason about (never seen yet in FIR and
  // mainstream dialects, but MLIR does not really prevent it).
  if (deallocationPoints.empty())
    return nullptr;

  // Step 3: detect block based loops between the allocation and deallocation
  // points, and add a deallocation point on the back edge to avoid memory
  // leaks.
  // The detection avoids doing region CFG analysis by assuming that there may
  // be cycles if deallocation points are not dominated by the alloca.
  // This leaves the cases where the deallocation points are in the same region
  // as the alloca (or nested inside it). In which cases there may be a back
  // edge between the alloca and the deallocation point via block successors. An
  // analysis is run to detect those cases.
  // When a loop is detected, the easiest solution to deallocate on the back
  // edge is to store the allocated memory address in a variable (that dominates
  // the loops) and to deallocate the address in that variable if it is set
  // before executing the allocation. This strategy still leads to correct
  // execution in the "false positive" cases.
  // Hence, the alloca is added as a deallocation point when there is no
  // dominance. Note that bringing lifetime markers above will reduce the
  // false positives.
  if (!allocDominatesDealloc(alloca, deallocationPoints) ||
      blockCycleDetector.allocaIsInCycle(alloca, deallocationPoints))
    deallocationPoints.push_back(alloca.getOperation());
  return owningRegion;
}

void AllocaReplaceImpl::genIndirectDeallocation(
    mlir::RewriterBase &rewriter, fir::AllocaOp alloca,
    llvm::ArrayRef<mlir::Operation *> deallocationPoints,
    mlir::Value replacement, mlir::Region &owningRegion) {
  mlir::Location loc = alloca.getLoc();
  auto replacementInsertPoint = rewriter.saveInsertionPoint();
  // Create C pointer variable in the entry block to store the alloc
  // and access it indirectly in the entry points that do not dominate.
  rewriter.setInsertionPointToStart(&owningRegion.front());
  mlir::Type heapType = fir::HeapType::get(alloca.getInType());
  mlir::Value ptrVar = rewriter.create<fir::AllocaOp>(loc, heapType);
  mlir::Value nullPtr = rewriter.create<fir::ZeroOp>(loc, heapType);
  rewriter.create<fir::StoreOp>(loc, nullPtr, ptrVar);
  // TODO: introducing a pointer compare op in FIR would help
  // generating less IR here.
  mlir::Type intPtrTy = fir::getIntPtrType(rewriter);
  mlir::Value c0 = rewriter.create<mlir::arith::ConstantOp>(
      loc, intPtrTy, rewriter.getIntegerAttr(intPtrTy, 0));

  // Store new storage address right after its creation.
  rewriter.restoreInsertionPoint(replacementInsertPoint);
  mlir::Value castReplacement =
      fir::factory::createConvert(rewriter, loc, heapType, replacement);
  rewriter.create<fir::StoreOp>(loc, castReplacement, ptrVar);

  // Generate conditional deallocation at every deallocation point.
  auto genConditionalDealloc = [&](mlir::Location loc) {
    mlir::Value ptrVal = rewriter.create<fir::LoadOp>(loc, ptrVar);
    mlir::Value ptrToInt =
        rewriter.create<fir::ConvertOp>(loc, intPtrTy, ptrVal);
    mlir::Value isAllocated = rewriter.create<mlir::arith::CmpIOp>(
        loc, mlir::arith::CmpIPredicate::ne, ptrToInt, c0);
    auto ifOp = rewriter.create<fir::IfOp>(loc, std::nullopt, isAllocated,
                                           /*withElseRegion=*/false);
    rewriter.setInsertionPointToStart(&ifOp.getThenRegion().front());
    mlir::Value cast = fir::factory::createConvert(
        rewriter, loc, replacement.getType(), ptrVal);
    deallocGenerator(loc, rewriter, cast);
    // Currently there is no need to reset the pointer var because two
    // deallocation points can never be reached without going through the
    // alloca.
    rewriter.setInsertionPointAfter(ifOp);
  };
  for (mlir::Operation *deallocPoint : deallocationPoints) {
    rewriter.setInsertionPoint(deallocPoint);
    genConditionalDealloc(deallocPoint->getLoc());
  }
}

bool AllocaReplaceImpl::replace(mlir::RewriterBase &rewriter,
                                fir::AllocaOp alloca) {
  llvm::SmallVector<mlir::Operation *> deallocationPoints;
  mlir::Region *owningRegion =
      findDeallocationPointsAndOwner(alloca, deallocationPoints);
  if (!owningRegion)
    return false;
  rewriter.setInsertionPointAfter(alloca.getOperation());
  bool deallocPointsDominateAlloc =
      allocDominatesDealloc(alloca, deallocationPoints);
  if (mlir::Value replacement =
          allocaRewriter(rewriter, alloca, deallocPointsDominateAlloc)) {
    mlir::Value castReplacement = fir::factory::createConvert(
        rewriter, alloca.getLoc(), alloca.getType(), replacement);
    if (deallocPointsDominateAlloc)
      for (mlir::Operation *deallocPoint : deallocationPoints) {
        rewriter.setInsertionPoint(deallocPoint);
        deallocGenerator(deallocPoint->getLoc(), rewriter, replacement);
      }
    else
      genIndirectDeallocation(rewriter, alloca, deallocationPoints, replacement,
                              *owningRegion);
    rewriter.replaceOp(alloca, castReplacement);
  }
  return true;
}

bool fir::replaceAllocas(mlir::RewriterBase &rewriter,
                         mlir::Operation *parentOp,
                         MustRewriteCallBack mustReplace,
                         AllocaRewriterCallBack allocaRewriter,
                         DeallocCallBack deallocGenerator) {
  // If the parent operation is not an alloca owner, the code below would risk
  // modifying IR outside of parentOp.
  if (!fir::AllocaOp::ownsNestedAlloca(parentOp))
    return false;
  auto insertPoint = rewriter.saveInsertionPoint();
  bool replacedAllRequestedAlloca = true;
  AllocaReplaceImpl impl(allocaRewriter, deallocGenerator);
  parentOp->walk([&](fir::AllocaOp alloca) {
    if (mustReplace(alloca))
      replacedAllRequestedAlloca &= impl.replace(rewriter, alloca);
  });
  rewriter.restoreInsertionPoint(insertPoint);
  return replacedAllRequestedAlloca;
}