// llvm/flang/lib/Optimizer/Transforms/ArrayValueCopy.cpp

//===-- ArrayValueCopy.cpp ------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "flang/Optimizer/Builder/BoxValue.h"
#include "flang/Optimizer/Builder/FIRBuilder.h"
#include "flang/Optimizer/Builder/Factory.h"
#include "flang/Optimizer/Builder/Runtime/Derived.h"
#include "flang/Optimizer/Builder/Todo.h"
#include "flang/Optimizer/Dialect/FIRDialect.h"
#include "flang/Optimizer/Dialect/FIROpsSupport.h"
#include "flang/Optimizer/Dialect/Support/FIRContext.h"
#include "flang/Optimizer/Transforms/Passes.h"
#include "mlir/Dialect/ControlFlow/IR/ControlFlowOps.h"
#include "mlir/Dialect/SCF/IR/SCF.h"
#include "mlir/Transforms/DialectConversion.h"
#include "llvm/Support/Debug.h"

namespace fir {
#define GEN_PASS_DEF_ARRAYVALUECOPY
#include "flang/Optimizer/Transforms/Passes.h.inc"
} // namespace fir

#define DEBUG_TYPE "flang-array-value-copy"

using namespace fir;
using namespace mlir;

using OperationUseMapT = llvm::DenseMap<mlir::Operation *, mlir::Operation *>;

namespace {

/// Array copy analysis.
/// Perform an interference analysis between array values.
///
/// Lowering will generate a sequence of the following form.
/// ```mlir
///   %a_1 = fir.array_load %array_1(%shape) : ...
///   ...
///   %a_j = fir.array_load %array_j(%shape) : ...
///   ...
///   %a_n = fir.array_load %array_n(%shape) : ...
///     ...
///     %v_i = fir.array_fetch %a_i, ...
///     %a_j1 = fir.array_update %a_j, ...
///     ...
///   fir.array_merge_store %a_j, %a_jn to %array_j : ...
/// ```
///
/// The analysis is to determine if there are any conflicts. A conflict is when
/// one of the following cases occurs.
///
/// 1. There is an `array_update` to an array value, a_j, such that a_j was
/// loaded from the same array memory reference (array_j) but with a different
/// shape as the other array values a_i, where i != j. [Possible overlapping
/// arrays.]
///
/// 2. There is either an array_fetch or array_update of a_j with a different
/// set of index values. [Possible loop-carried dependence.]
///
/// If none of the array values overlap in storage and the accesses are not
/// loop-carried, then the arrays are conflict-free and no copies are required.
class ArrayCopyAnalysisBase {
public:
  using ConflictSetT = llvm::SmallPtrSet<mlir::Operation *, 16>;
  using UseSetT = llvm::SmallPtrSet<mlir::OpOperand *, 8>;
  using LoadMapSetsT = llvm::DenseMap<mlir::Operation *, UseSetT>;
  using AmendAccessSetT = llvm::SmallPtrSet<mlir::Operation *, 4>;

  /// Run the analysis over all operations nested under \p op. When
  /// \p optimized is true, variable attribute information may be used to
  /// rule out conflicts that a purely structural analysis would report
  /// (see conflictOnLoad).
  ArrayCopyAnalysisBase(mlir::Operation *op, bool optimized)
      : operation{op}, optimizeConflicts(optimized) {
    construct(op);
  }
  virtual ~ArrayCopyAnalysisBase() = default;

  /// Return the operation the analysis was run upon.
  mlir::Operation *getOperation() const { return operation; }

  /// Return true iff the `array_merge_store` has potential conflicts.
  bool hasPotentialConflict(mlir::Operation *op) const {
    LLVM_DEBUG(llvm::dbgs()
               << "looking for a conflict on " << *op
               << " and the set has a total of " << conflicts.size() << '\n');
    return conflicts.contains(op);
  }

  /// Return the use map.
  /// The use map maps array access, amend, fetch and update operations back to
  /// the array load that is the original source of the array value.
  /// It maps an array_load to an array_merge_store, if and only if the loaded
  /// array value has pending modifications to be merged.
  const OperationUseMapT &getUseMap() const { return useMap; }

  /// Return the set of array_access ops directly associated with array_amend
  /// ops.
  bool inAmendAccessSet(mlir::Operation *op) const {
    return amendAccesses.count(op);
  }

  /// For ArrayLoad `load`, return the transitive set of all OpOperands.
  UseSetT getLoadUseSet(mlir::Operation *load) const {
    assert(loadMapSets.count(load) && "analysis missed an array load?");
    return loadMapSets.lookup(load);
  }

  /// Collect into \p mentions all the array operations that access the array
  /// value produced by \p load. Results are cached in `loadMapSets`.
  void arrayMentions(llvm::SmallVectorImpl<mlir::Operation *> &mentions,
                     ArrayLoadOp load);

private:
  /// Walk \p topLevelOp, populating the conflict set and use map.
  void construct(mlir::Operation *topLevelOp);

  mlir::Operation *operation; // operation that analysis ran upon
  ConflictSetT conflicts;     // set of conflicts (loads and merge stores)
  // Maps mention ops back to their array_load, and each array_load with
  // pending modifications to its array_merge_store.
  OperationUseMapT useMap;
  // Cache of the per-load transitive use sets computed by arrayMentions().
  LoadMapSetsT loadMapSets;
  // Set of array_access ops associated with array_amend ops.
  AmendAccessSetT amendAccesses;
  bool optimizeConflicts; // use variable attributes to prune conflicts
};

// Optimized array copy analysis that takes into account Fortran
// variable attributes to prove that no conflict is possible
// and reduce the number of temporary arrays.
class ArrayCopyAnalysisOptimized : public ArrayCopyAnalysisBase {
public:
  MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(ArrayCopyAnalysisOptimized)

  /// Run the base analysis with attribute-based conflict pruning enabled.
  ArrayCopyAnalysisOptimized(mlir::Operation *op)
      : ArrayCopyAnalysisBase(op, /*optimized=*/true) {}
};

// Unoptimized array copy analysis used at O0.
class ArrayCopyAnalysis : public ArrayCopyAnalysisBase {
public:
  MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(ArrayCopyAnalysis)

  /// Run the base analysis without attribute-based conflict pruning.
  ArrayCopyAnalysis(mlir::Operation *op)
      : ArrayCopyAnalysisBase(op, /*optimized=*/false) {}
};
} // namespace

namespace {
/// Helper class to collect all array operations that produced an array value.
class ReachCollector {
public:
  /// \p reach is the output vector of reaching array operations (not owned).
  /// \p loopRegion is the region of the loop nest that produced the array
  /// value; it may be null when the value is not defined by a fir.do_loop.
  ReachCollector(llvm::SmallVectorImpl<mlir::Operation *> &reach,
                 mlir::Region *loopRegion)
      : reach{reach}, loopRegion{loopRegion} {}

  /// Collect array mentions from every result in \p range; when \p range is
  /// empty, process \p op itself (with a null value).
  void collectArrayMentionFrom(mlir::Operation *op, mlir::ValueRange range) {
    if (range.empty()) {
      collectArrayMentionFrom(op, mlir::Value{});
      return;
    }
    for (mlir::Value v : range)
      collectArrayMentionFrom(v);
  }

  // Collect all the array_access ops in `block`. This recursively looks into
  // blocks in ops with regions.
  // FIXME: This is temporarily relying on the array_amend appearing in a
  // do_loop Region.  This phase ordering assumption can be eliminated by using
  // dominance information to find the array_access ops or by scanning the
  // transitive closure of the amending array_access's users and the defs that
  // reach them.
  void collectAccesses(llvm::SmallVector<ArrayAccessOp> &result,
                       mlir::Block *block) {
    for (auto &op : *block) {
      if (auto access = mlir::dyn_cast<ArrayAccessOp>(op)) {
        LLVM_DEBUG(llvm::dbgs() << "adding access: " << access << '\n');
        result.push_back(access);
        continue;
      }
      // Not an access: recurse into any nested regions.
      for (auto &region : op.getRegions())
        for (auto &bb : region.getBlocks())
          collectAccesses(result, &bb);
    }
  }

  /// Process the operation \p op that defines (or forwards) \p val, recording
  /// any array operations that can reach the merged value.
  void collectArrayMentionFrom(mlir::Operation *op, mlir::Value val) {
    // `val` is defined by an Op, process the defining Op.
    // If `val` is defined by a region containing Op, we want to drill down
    // and through that Op's region(s).
    LLVM_DEBUG(llvm::dbgs() << "popset: " << *op << '\n');
    // For region-holding ops that forward values (loops, if), chase the
    // source operands that flow into the result `val`.
    auto popFn = [&](auto rop) {
      assert(val && "op must have a result value");
      auto resNum = mlir::cast<mlir::OpResult>(val).getResultNumber();
      llvm::SmallVector<mlir::Value> results;
      rop.resultToSourceOps(results, resNum);
      for (auto u : results)
        collectArrayMentionFrom(u);
    };
    if (auto rop = mlir::dyn_cast<DoLoopOp>(op)) {
      popFn(rop);
      return;
    }
    if (auto rop = mlir::dyn_cast<IterWhileOp>(op)) {
      popFn(rop);
      return;
    }
    if (auto rop = mlir::dyn_cast<fir::IfOp>(op)) {
      popFn(rop);
      return;
    }
    if (auto box = mlir::dyn_cast<EmboxOp>(op)) {
      // A box aliases its memref: visit all of the memref's other users.
      for (auto *user : box.getMemref().getUsers())
        if (user != op)
          collectArrayMentionFrom(user, user->getResults());
      return;
    }
    if (auto mergeStore = mlir::dyn_cast<ArrayMergeStoreOp>(op)) {
      // Only merge stores inside the loop nest are relevant here.
      if (opIsInsideLoops(mergeStore))
        collectArrayMentionFrom(mergeStore.getSequence());
      return;
    }

    if (mlir::isa<AllocaOp, AllocMemOp>(op)) {
      // Look for any stores inside the loops, and collect an array operation
      // that produced the value being stored to it.
      for (auto *user : op->getUsers())
        if (auto store = mlir::dyn_cast<fir::StoreOp>(user))
          if (opIsInsideLoops(store))
            collectArrayMentionFrom(store.getValue());
      return;
    }

    // Scan the uses of amend's memref
    if (auto amend = mlir::dyn_cast<ArrayAmendOp>(op)) {
      reach.push_back(op);
      llvm::SmallVector<ArrayAccessOp> accesses;
      collectAccesses(accesses, op->getBlock());
      for (auto access : accesses)
        collectArrayMentionFrom(access.getResult());
    }

    // Otherwise, Op does not contain a region so just chase its operands.
    if (mlir::isa<ArrayAccessOp, ArrayLoadOp, ArrayUpdateOp, ArrayModifyOp,
                  ArrayFetchOp>(op)) {
      LLVM_DEBUG(llvm::dbgs() << "add " << *op << " to reachable set\n");
      reach.push_back(op);
    }

    // Include all array_access ops using an array_load.
    if (auto arrLd = mlir::dyn_cast<ArrayLoadOp>(op))
      for (auto *user : arrLd.getResult().getUsers())
        if (mlir::isa<ArrayAccessOp>(user)) {
          LLVM_DEBUG(llvm::dbgs() << "add " << *user << " to reachable set\n");
          reach.push_back(user);
        }

    // Array modify assignment is performed on the result. So the analysis must
    // look at the what is done with the result.
    if (mlir::isa<ArrayModifyOp>(op))
      for (auto *user : op->getResult(0).getUsers())
        followUsers(user);

    // Record calls: conservativeCallConflict() later checks whether the
    // callee is an internal procedure that could touch the arrays.
    if (mlir::isa<fir::CallOp>(op)) {
      LLVM_DEBUG(llvm::dbgs() << "add " << *op << " to reachable set\n");
      reach.push_back(op);
    }

    for (auto u : op->getOperands())
      collectArrayMentionFrom(u);
  }

  /// Process a block argument: thread through structured-loop iteration
  /// arguments or, for plain CFG blocks, through the predecessors' terminator
  /// operands.
  void collectArrayMentionFrom(mlir::BlockArgument ba) {
    auto *parent = ba.getOwner()->getParentOp();
    // If inside an Op holding a region, the block argument corresponds to an
    // argument passed to the containing Op.
    auto popFn = [&](auto rop) {
      collectArrayMentionFrom(rop.blockArgToSourceOp(ba.getArgNumber()));
    };
    if (auto rop = mlir::dyn_cast<DoLoopOp>(parent)) {
      popFn(rop);
      return;
    }
    if (auto rop = mlir::dyn_cast<IterWhileOp>(parent)) {
      popFn(rop);
      return;
    }
    // Otherwise, a block argument is provided via the pred blocks.
    for (auto *pred : ba.getOwner()->getPredecessors()) {
      auto u = pred->getTerminator()->getOperand(ba.getArgNumber());
      collectArrayMentionFrom(u);
    }
  }

  // Recursively trace operands to find all array operations relating to the
  // values merged.
  void collectArrayMentionFrom(mlir::Value val) {
    // `visited` guards against cycles and duplicated work.
    if (!val || visited.contains(val))
      return;
    visited.insert(val);

    // Process a block argument.
    if (auto ba = mlir::dyn_cast<mlir::BlockArgument>(val)) {
      collectArrayMentionFrom(ba);
      return;
    }

    // Process an Op.
    if (auto *op = val.getDefiningOp()) {
      collectArrayMentionFrom(op, val);
      return;
    }

    emitFatalError(val.getLoc(), "unhandled value");
  }

  /// Return all ops that produce the array value that is stored into the
  /// `array_merge_store`.
  static void reachingValues(llvm::SmallVectorImpl<mlir::Operation *> &reach,
                             mlir::Value seq) {
    reach.clear();
    mlir::Region *loopRegion = nullptr;
    if (auto doLoop = mlir::dyn_cast_or_null<DoLoopOp>(seq.getDefiningOp()))
      loopRegion = &doLoop->getRegion(0);
    ReachCollector collector(reach, loopRegion);
    collector.collectArrayMentionFrom(seq);
  }

private:
  /// Is \op inside the loop nest region ?
  /// FIXME: replace this structural dependence with graph properties.
  bool opIsInsideLoops(mlir::Operation *op) const {
    auto *region = op->getParentRegion();
    while (region) {
      if (region == loopRegion)
        return true;
      region = region->getParentRegion();
    }
    return false;
  }

  /// Recursively trace the use of an operation results, calling
  /// collectArrayMentionFrom on the direct and indirect user operands.
  void followUsers(mlir::Operation *op) {
    for (auto userOperand : op->getOperands())
      collectArrayMentionFrom(userOperand);
    // Go through potential converts/coordinate_op.
    for (auto indirectUser : op->getUsers())
      followUsers(indirectUser);
  }

  llvm::SmallVectorImpl<mlir::Operation *> &reach; // output set (not owned)
  llvm::SmallPtrSet<mlir::Value, 16> visited;      // cycle/duplication guard
  /// Region of the loops nest that produced the array value.
  mlir::Region *loopRegion;
};
} // namespace

/// Find all the array operations that access the array value that is loaded by
/// the array load operation, `load`.
void ArrayCopyAnalysisBase::arrayMentions(
    llvm::SmallVectorImpl<mlir::Operation *> &mentions, ArrayLoadOp load) {
  mentions.clear();
  // Fast path: this load was already analyzed; rebuild `mentions` from the
  // cached transitive use set.
  auto lmIter = loadMapSets.find(load);
  if (lmIter != loadMapSets.end()) {
    for (auto *opnd : lmIter->second) {
      auto *owner = opnd->getOwner();
      if (mlir::isa<ArrayAccessOp, ArrayAmendOp, ArrayFetchOp, ArrayUpdateOp,
                    ArrayModifyOp>(owner))
        mentions.push_back(owner);
    }
    return;
  }

  UseSetT visited;
  llvm::SmallVector<mlir::OpOperand *> queue; // uses of ArrayLoad[orig]

  // Enqueue each not-yet-visited use of `val`.
  auto appendToQueue = [&](mlir::Value val) {
    for (auto &use : val.getUses())
      if (!visited.count(&use)) {
        visited.insert(&use);
        queue.push_back(&use);
      }
  };

  // Build the set of uses of `original`.
  // let USES = { uses of original fir.load }
  appendToQueue(load);

  // Process the worklist until done.
  while (!queue.empty()) {
    mlir::OpOperand *operand = queue.pop_back_val();
    mlir::Operation *owner = operand->getOwner();
    if (!owner)
      continue;
    // Thread an iter_arg of a structured loop to both the corresponding loop
    // result and the body's block argument.
    auto structuredLoop = [&](auto ro) {
      if (auto blockArg = ro.iterArgToBlockArg(operand->get())) {
        int64_t arg = blockArg.getArgNumber();
        mlir::Value output = ro.getResult(ro.getFinalValue() ? arg : arg - 1);
        appendToQueue(output);
        appendToQueue(blockArg);
      }
    };
    // TODO: this need to be updated to use the control-flow interface.
    // Thread a branch operand to the destination block's argument.
    auto branchOp = [&](mlir::Block *dest, OperandRange operands) {
      if (operands.empty())
        return;

      // Check if this operand is within the range.
      unsigned operandIndex = operand->getOperandNumber();
      unsigned operandsStart = operands.getBeginOperandIndex();
      if (operandIndex < operandsStart ||
          operandIndex >= (operandsStart + operands.size()))
        return;

      // Index the successor.
      unsigned argIndex = operandIndex - operandsStart;
      appendToQueue(dest->getArgument(argIndex));
    };
    // Thread uses into structured loop bodies and return value uses.
    if (auto ro = mlir::dyn_cast<DoLoopOp>(owner)) {
      structuredLoop(ro);
    } else if (auto ro = mlir::dyn_cast<IterWhileOp>(owner)) {
      structuredLoop(ro);
    } else if (auto rs = mlir::dyn_cast<ResultOp>(owner)) {
      // Thread any uses of fir.if that return the marked array value.
      mlir::Operation *parent = rs->getParentRegion()->getParentOp();
      if (auto ifOp = mlir::dyn_cast<fir::IfOp>(parent))
        appendToQueue(ifOp.getResult(operand->getOperandNumber()));
    } else if (mlir::isa<ArrayFetchOp>(owner)) {
      // Keep track of array value fetches.
      LLVM_DEBUG(llvm::dbgs()
                 << "add fetch {" << *owner << "} to array value set\n");
      mentions.push_back(owner);
    } else if (auto update = mlir::dyn_cast<ArrayUpdateOp>(owner)) {
      // Keep track of array value updates and thread the return value uses.
      LLVM_DEBUG(llvm::dbgs()
                 << "add update {" << *owner << "} to array value set\n");
      mentions.push_back(owner);
      appendToQueue(update.getResult());
    } else if (auto update = mlir::dyn_cast<ArrayModifyOp>(owner)) {
      // Keep track of array value modification and thread the return value
      // uses.
      LLVM_DEBUG(llvm::dbgs()
                 << "add modify {" << *owner << "} to array value set\n");
      mentions.push_back(owner);
      appendToQueue(update.getResult(1));
    } else if (auto mention = mlir::dyn_cast<ArrayAccessOp>(owner)) {
      mentions.push_back(owner);
    } else if (auto amend = mlir::dyn_cast<ArrayAmendOp>(owner)) {
      mentions.push_back(owner);
      appendToQueue(amend.getResult());
    } else if (auto br = mlir::dyn_cast<mlir::cf::BranchOp>(owner)) {
      branchOp(br.getDest(), br.getDestOperands());
    } else if (auto br = mlir::dyn_cast<mlir::cf::CondBranchOp>(owner)) {
      branchOp(br.getTrueDest(), br.getTrueOperands());
      branchOp(br.getFalseDest(), br.getFalseOperands());
    } else if (mlir::isa<ArrayMergeStoreOp>(owner)) {
      // do nothing
    } else {
      llvm::report_fatal_error("array value reached unexpected op");
    }
  }
  // Cache the computed use set for subsequent queries on the same load.
  loadMapSets.insert({load, visited});
}

/// Is \p type a POINTER type, either directly or as the element type of a
/// `!fir.box`?
static bool hasPointerType(mlir::Type type) {
  if (auto boxTy = mlir::dyn_cast<BoxType>(type))
    return mlir::isa<fir::PointerType>(boxTy.getEleTy());
  return mlir::isa<fir::PointerType>(type);
}

// This is a NF performance hack. It makes a simple test that the slices of the
// load, \p ld, and the merge store, \p st, are trivially mutually exclusive.
static bool mutuallyExclusiveSliceRange(ArrayLoadOp ld, ArrayMergeStoreOp st) {
  // If the same array_load, then no further testing is warranted.
  if (ld.getResult() == st.getOriginal())
    return false;

  // Return the defining slice op of `val`, or a null op otherwise.
  auto getSliceOp = [](mlir::Value val) -> SliceOp {
    if (!val)
      return {};
    auto sliceOp = mlir::dyn_cast_or_null<SliceOp>(val.getDefiningOp());
    if (!sliceOp)
      return {};
    return sliceOp;
  };

  auto ldSlice = getSliceOp(ld.getSlice());
  auto stSlice = getSliceOp(st.getSlice());
  if (!ldSlice || !stSlice)
    return false;

  // Resign on subobject slices.
  if (!ldSlice.getFields().empty() || !stSlice.getFields().empty() ||
      !ldSlice.getSubstr().empty() || !stSlice.getSubstr().empty())
    return false;

  // Crudely test that the two slices do not overlap by looking for the
  // following general condition. If the slices look like (i:j) and (j+1:k) then
  // these ranges do not overlap. The addend must be a constant.
  auto ldTriples = ldSlice.getTriples();
  auto stTriples = stSlice.getTriples();
  const auto size = ldTriples.size();
  if (size != stTriples.size())
    return false;

  // Return true iff v2 is provably v1 plus a positive constant (possibly
  // expressed as v1 = v2 - positive constant), looking through converts.
  auto displacedByConstant = [](mlir::Value v1, mlir::Value v2) {
    auto removeConvert = [](mlir::Value v) -> mlir::Operation * {
      auto *op = v.getDefiningOp();
      while (auto conv = mlir::dyn_cast_or_null<ConvertOp>(op))
        op = conv.getValue().getDefiningOp();
      return op;
    };

    // NOTE(review): assumes `v` has a defining op; a block argument here
    // would make getDefiningOp() return null — confirm this cannot happen
    // for slice triple operands.
    auto isPositiveConstant = [](mlir::Value v) -> bool {
      if (auto conOp =
              mlir::dyn_cast<mlir::arith::ConstantOp>(v.getDefiningOp()))
        if (auto iattr = mlir::dyn_cast<mlir::IntegerAttr>(conOp.getValue()))
          return iattr.getInt() > 0;
      return false;
    };

    auto *op1 = removeConvert(v1);
    auto *op2 = removeConvert(v2);
    if (!op1 || !op2)
      return false;
    if (auto addi = mlir::dyn_cast<mlir::arith::AddIOp>(op2))
      if ((addi.getLhs().getDefiningOp() == op1 &&
           isPositiveConstant(addi.getRhs())) ||
          (addi.getRhs().getDefiningOp() == op1 &&
           isPositiveConstant(addi.getLhs())))
        return true;
    if (auto subi = mlir::dyn_cast<mlir::arith::SubIOp>(op1))
      if (subi.getLhs().getDefiningOp() == op2 &&
          isPositiveConstant(subi.getRhs()))
        return true;
    return false;
  };

  // Triples are stored flat as (lb, ub, stride) per dimension, hence += 3.
  for (std::remove_const_t<decltype(size)> i = 0; i < size; i += 3) {
    // If both are loop invariant, skip to the next triple.
    if (mlir::isa_and_nonnull<fir::UndefOp>(ldTriples[i + 1].getDefiningOp()) &&
        mlir::isa_and_nonnull<fir::UndefOp>(stTriples[i + 1].getDefiningOp())) {
      // Unless either is a vector index, then be conservative.
      if (mlir::isa_and_nonnull<fir::UndefOp>(ldTriples[i].getDefiningOp()) ||
          mlir::isa_and_nonnull<fir::UndefOp>(stTriples[i].getDefiningOp()))
        return false;
      continue;
    }
    // If identical, skip to the next triple.
    if (ldTriples[i] == stTriples[i] && ldTriples[i + 1] == stTriples[i + 1] &&
        ldTriples[i + 2] == stTriples[i + 2])
      continue;
    // If ubound and lbound are the same with a constant offset, skip to the
    // next triple.
    if (displacedByConstant(ldTriples[i + 1], stTriples[i]) ||
        displacedByConstant(stTriples[i + 1], ldTriples[i]))
      continue;
    return false;
  }
  LLVM_DEBUG(llvm::dbgs() << "detected non-overlapping slice ranges on " << ld
                          << " and " << st << ", which is not a conflict\n");
  return true;
}

/// Is there a conflict between the array value that was updated and to be
/// stored to `st` and the set of arrays loaded (`reach`) and used to compute
/// the updated value?
/// If `optimize` is true, use the variable attributes to prove that
/// there is no conflict.
static bool conflictOnLoad(llvm::ArrayRef<mlir::Operation *> reach,
                           ArrayMergeStoreOp st, bool optimize) {
  mlir::Value load;
  mlir::Value addr = st.getMemref();
  const bool storeHasPointerType = hasPointerType(addr.getType());
  for (auto *op : reach)
    if (auto ld = mlir::dyn_cast<ArrayLoadOp>(op)) {
      mlir::Type ldTy = ld.getMemref().getType();
      auto globalOpName = mlir::OperationName(fir::GlobalOp::getOperationName(),
                                              ld.getContext());
      if (ld.getMemref() == addr) {
        // The load reads the very memory the merge store writes.
        if (mutuallyExclusiveSliceRange(ld, st))
          continue;
        if (ld.getResult() != st.getOriginal())
          return true;
        if (load) {
          // TODO: extend this to allow checking if the first `load` and this
          // `ld` are mutually exclusive accesses but not identical.
          return true;
        }
        load = ld;
      } else if (storeHasPointerType) {
        // The store is through a pointer: another load may alias it, unless
        // the loaded variable is not a pointer and provably lacks the TARGET
        // attribute.
        if (optimize && !hasPointerType(ldTy) &&
            !valueMayHaveFirAttributes(
                ld.getMemref(),
                {getTargetAttrName(),
                 fir::GlobalOp::getTargetAttrName(globalOpName).strref()}))
          continue;

        return true;
      } else if (hasPointerType(ldTy)) {
        // Symmetric case: the load is through a pointer.
        if (optimize && !storeHasPointerType &&
            !valueMayHaveFirAttributes(
                addr,
                {getTargetAttrName(),
                 fir::GlobalOp::getTargetAttrName(globalOpName).strref()}))
          continue;

        return true;
      }
      // TODO: Check if types can also allow ruling out some cases. For now,
      // the fact that equivalences is using pointer attribute to enforce
      // aliasing is preventing any attempt to do so, and in general, it may
      // be wrong to use this if any of the types is a complex or a derived
      // for which it is possible to create a pointer to a part with a
      // different type than the whole, although this deserve some more
      // investigation because existing compiler behavior seem to diverge
      // here.
    }
  return false;
}

/// Is there an access vector conflict on the array being merged into? If the
/// access vectors diverge, then assume that there are potentially overlapping
/// loop-carried references.
/// Is there an access vector conflict on the array being merged into? If the
/// access vectors diverge, then assume that there are potentially overlapping
/// loop-carried references.
static bool conflictOnMerge(llvm::ArrayRef<mlir::Operation *> mentions) {
  if (mentions.size() < 2)
    return false;
  llvm::SmallVector<mlir::Value> indices;
  LLVM_DEBUG(llvm::dbgs() << "check merge conflict on with " << mentions.size()
                          << " mentions on the list\n");
  bool valSeen = false;
  bool refSeen = false;
  for (auto *op : mentions) {
    llvm::SmallVector<mlir::Value> compareVector;
    // Record the first access vector encountered; for every later mention,
    // stage its access vector for comparison. Returns true when the vector
    // was merely recorded (nothing to compare yet).
    auto captureOrStage = [&](auto accessLike) -> bool {
      if (indices.empty()) {
        indices = accessLike.getIndices();
        return true;
      }
      compareVector = accessLike.getIndices();
      return false;
    };
    if (auto updateOp = mlir::dyn_cast<ArrayUpdateOp>(op)) {
      valSeen = true;
      if (captureOrStage(updateOp))
        continue;
    } else if (auto modifyOp = mlir::dyn_cast<ArrayModifyOp>(op)) {
      valSeen = true;
      if (captureOrStage(modifyOp))
        continue;
    } else if (auto fetchOp = mlir::dyn_cast<ArrayFetchOp>(op)) {
      valSeen = true;
      if (captureOrStage(fetchOp))
        continue;
    } else if (auto accessOp = mlir::dyn_cast<ArrayAccessOp>(op)) {
      refSeen = true;
      if (captureOrStage(accessOp))
        continue;
    } else if (mlir::isa<ArrayAmendOp>(op)) {
      refSeen = true;
      continue;
    } else {
      mlir::emitError(op->getLoc(), "unexpected operation in analysis");
    }
    // Any divergence between the access vectors is a potential loop-carried
    // dependence, hence a conflict.
    if (compareVector.size() != indices.size() ||
        llvm::any_of(llvm::zip(compareVector, indices), [&](auto pair) {
          return std::get<0>(pair) != std::get<1>(pair);
        }))
      return true;
    LLVM_DEBUG(llvm::dbgs() << "vectors compare equal\n");
  }
  // Mixing by-value and by-reference mentions is conservatively a conflict.
  return valSeen && refSeen;
}

/// With element-by-reference semantics, an amended array with more than once
/// access to the same loaded array are conservatively considered a conflict.
/// Note: the array copy can still be eliminated in subsequent optimizations.
/// With element-by-reference semantics, an amended array with more than one
/// access to the same loaded array is conservatively considered a conflict.
/// Note: the array copy can still be eliminated in subsequent optimizations.
static bool conflictOnReference(llvm::ArrayRef<mlir::Operation *> mentions) {
  LLVM_DEBUG(llvm::dbgs() << "checking reference semantics " << mentions.size()
                          << '\n');
  if (mentions.size() < 3)
    return false;
  bool amendSeen = false;
  bool accessSeen = false;
  for (mlir::Operation *mention : mentions) {
    if (mlir::isa<ArrayAmendOp>(mention)) {
      if (amendSeen) {
        LLVM_DEBUG(llvm::dbgs()
                   << "conflict: multiple amends of array value\n");
        return true;
      }
      amendSeen = true;
    } else if (mlir::isa<ArrayAccessOp>(mention)) {
      if (accessSeen) {
        LLVM_DEBUG(llvm::dbgs()
                   << "conflict: multiple accesses of array value\n");
        return true;
      }
      accessSeen = true;
    } else if (mlir::isa<ArrayFetchOp, ArrayUpdateOp, ArrayModifyOp>(
                   mention)) {
      // By-value and by-reference uses of the same value do not mix.
      LLVM_DEBUG(llvm::dbgs()
                 << "conflict: array value has both uses by-value and uses "
                    "by-reference. conservative assumption.\n");
      return true;
    }
  }
  return false;
}

/// Return the operation defining the memref of the first array_amend found in
/// \p mentions (i.e., the amending array_access), or null if there is none.
static mlir::Operation *
amendingAccess(llvm::ArrayRef<mlir::Operation *> mentions) {
  for (mlir::Operation *mention : mentions)
    if (auto amend = mlir::dyn_cast<ArrayAmendOp>(mention))
      return amend.getMemref().getDefiningOp();
  return nullptr;
}

// Are any conflicts present? The conflicts detected here are described above.
// Are any conflicts present? The conflicts detected here are described above.
static bool conflictDetected(llvm::ArrayRef<mlir::Operation *> reach,
                             llvm::ArrayRef<mlir::Operation *> mentions,
                             ArrayMergeStoreOp st, bool optimize) {
  // Check storage overlap first; if none, check loop-carried dependences.
  if (conflictOnLoad(reach, st, optimize))
    return true;
  return conflictOnMerge(mentions);
}

// Assume that any call to a function that uses host-associations will be
// modifying the output array.
// Assume that any call to a function that uses host-associations will be
// modifying the output array.
static bool
conservativeCallConflict(llvm::ArrayRef<mlir::Operation *> reaches) {
  for (mlir::Operation *reached : reaches) {
    auto call = mlir::dyn_cast<fir::CallOp>(reached);
    if (!call)
      continue;
    auto callee =
        mlir::dyn_cast<mlir::SymbolRefAttr>(call.getCallableForCallee());
    if (!callee)
      continue;
    // Internal procedures can modify host-associated variables.
    auto module = reached->getParentOfType<mlir::ModuleOp>();
    if (isInternalProcedure(module.lookupSymbol<mlir::func::FuncOp>(callee)))
      return true;
  }
  return false;
}

/// Constructor of the array copy analysis.
/// This performs the analysis and saves the intermediate results.
void ArrayCopyAnalysisBase::construct(mlir::Operation *topLevelOp) {
  topLevelOp->walk([&](Operation *op) {
    if (auto st = mlir::dyn_cast<fir::ArrayMergeStoreOp>(op)) {
      // Gather all operations that feed the merged array value, plus all
      // mentions of the original array load, then test for conflicts.
      llvm::SmallVector<mlir::Operation *> values;
      ReachCollector::reachingValues(values, st.getSequence());
      bool callConflict = conservativeCallConflict(values);
      llvm::SmallVector<mlir::Operation *> mentions;
      arrayMentions(mentions,
                    mlir::cast<ArrayLoadOp>(st.getOriginal().getDefiningOp()));
      bool conflict = conflictDetected(values, mentions, st, optimizeConflicts);
      bool refConflict = conflictOnReference(mentions);
      if (callConflict || conflict || refConflict) {
        LLVM_DEBUG(llvm::dbgs()
                   << "CONFLICT: copies required for " << st << '\n'
                   << "   adding conflicts on: " << *op << " and "
                   << st.getOriginal() << '\n');
        // Both the merge store and the originating load are marked.
        conflicts.insert(op);
        conflicts.insert(st.getOriginal().getDefiningOp());
        if (auto *access = amendingAccess(mentions))
          amendAccesses.insert(access);
      }
      // Map the load to its merge store (pending modifications to merge).
      auto *ld = st.getOriginal().getDefiningOp();
      LLVM_DEBUG(llvm::dbgs()
                 << "map: adding {" << *ld << " -> " << st << "}\n");
      useMap.insert({ld, op});
    } else if (auto load = mlir::dyn_cast<ArrayLoadOp>(op)) {
      // Map every mention of this load's value back to the load itself.
      llvm::SmallVector<mlir::Operation *> mentions;
      arrayMentions(mentions, load);
      LLVM_DEBUG(llvm::dbgs() << "process load: " << load
                              << ", mentions: " << mentions.size() << '\n');
      for (auto *acc : mentions) {
        LLVM_DEBUG(llvm::dbgs() << " mention: " << *acc << '\n');
        if (mlir::isa<ArrayAccessOp, ArrayAmendOp, ArrayFetchOp, ArrayUpdateOp,
                      ArrayModifyOp>(acc)) {
          if (useMap.count(acc)) {
            mlir::emitError(
                load.getLoc(),
                "The parallel semantics of multiple array_merge_stores per "
                "array_load are not supported.");
            continue;
          }
          LLVM_DEBUG(llvm::dbgs()
                     << "map: adding {" << *acc << "} -> {" << load << "}\n");
          useMap.insert({acc, op});
        }
      }
    }
  });
}

//===----------------------------------------------------------------------===//
// Conversions for converting out of array value form.
//===----------------------------------------------------------------------===//

namespace {
/// Once out of array value form, an array_load carries no information;
/// replace it with an undefined value of the same type.
class ArrayLoadConversion : public mlir::OpRewritePattern<ArrayLoadOp> {
public:
  using OpRewritePattern::OpRewritePattern;

  llvm::LogicalResult
  matchAndRewrite(ArrayLoadOp loadOp,
                  mlir::PatternRewriter &rewriter) const override {
    LLVM_DEBUG(llvm::dbgs() << "replace load " << loadOp << " with undef.\n");
    rewriter.replaceOpWithNewOp<UndefOp>(loadOp, loadOp.getType());
    return mlir::success();
  }
};

/// Once out of array value form, the merge store has been materialized by the
/// rewriting of its mentions; simply erase it.
class ArrayMergeStoreConversion
    : public mlir::OpRewritePattern<ArrayMergeStoreOp> {
public:
  using OpRewritePattern::OpRewritePattern;

  llvm::LogicalResult
  matchAndRewrite(ArrayMergeStoreOp storeOp,
                  mlir::PatternRewriter &rewriter) const override {
    LLVM_DEBUG(llvm::dbgs() << "marking store " << storeOp << " as dead.\n");
    rewriter.eraseOp(storeOp);
    return mlir::success();
  }
};
} // namespace

/// Strip the reference and sequence layers of \p ty and wrap the element type
/// in a plain reference.
static mlir::Type getEleTy(mlir::Type ty) {
  // FIXME: keep ptr/heap/ref information.
  return ReferenceType::get(unwrapSequenceType(unwrapPassByRefType(ty)));
}

// This is an unsafe way to deduce this (won't be true in internal
// procedure or inside select-rank for assumed-size). Only here to satisfy
// legacy code until removed.
// This is an unsafe way to deduce this (won't be true in internal
// procedure or inside select-rank for assumed-size). Only here to satisfy
// legacy code until removed.
static bool isAssumedSize(llvm::SmallVectorImpl<mlir::Value> &extents) {
  // An assumed-size array is detected by a trailing extent that is the
  // constant -1.
  return !extents.empty() && fir::getIntIfConstant(extents.back()) == -1;
}

// Extract extents from the ShapeOp/ShapeShiftOp into the result vector.
// Returns true when the copy must apply the slice itself (assumed-size case,
// where the last extent is recomputed from the slice triple); returns false
// otherwise.
static bool getAdjustedExtents(mlir::Location loc,
                               mlir::PatternRewriter &rewriter,
                               ArrayLoadOp arrLoad,
                               llvm::SmallVectorImpl<mlir::Value> &result,
                               mlir::Value shape) {
  bool copyUsingSlice = false;
  auto *shapeOp = shape.getDefiningOp();
  if (auto s = mlir::dyn_cast_or_null<ShapeOp>(shapeOp)) {
    auto e = s.getExtents();
    result.insert(result.end(), e.begin(), e.end());
  } else if (auto s = mlir::dyn_cast_or_null<ShapeShiftOp>(shapeOp)) {
    auto e = s.getExtents();
    result.insert(result.end(), e.begin(), e.end());
  } else {
    // Only fir.shape and fir.shape_shift are handled here; anything else is
    // malformed input for this pass.
    emitFatalError(loc, "not a fir.shape/fir.shape_shift op");
  }
  auto idxTy = rewriter.getIndexType();
  if (isAssumedSize(result)) {
    // Use slice information to compute the extent of the column.
    auto one = rewriter.create<mlir::arith::ConstantIndexOp>(loc, 1);
    mlir::Value size = one;
    if (mlir::Value sliceArg = arrLoad.getSlice()) {
      if (auto sliceOp =
              mlir::dyn_cast_or_null<SliceOp>(sliceArg.getDefiningOp())) {
        // The last (lb, ub, step) triple of the slice gives the iteration
        // count of the last dimension.
        auto triples = sliceOp.getTriples();
        const std::size_t tripleSize = triples.size();
        auto module = arrLoad->getParentOfType<mlir::ModuleOp>();
        FirOpBuilder builder(rewriter, module);
        size = builder.genExtentFromTriplet(loc, triples[tripleSize - 3],
                                            triples[tripleSize - 2],
                                            triples[tripleSize - 1], idxTy);
        copyUsingSlice = true;
      }
    }
    // Overwrite the -1 placeholder extent with the computed size.
    result[result.size() - 1] = size;
  }
  return copyUsingSlice;
}

/// Place the extents of the array load, \p arrLoad, into \p result and
/// return a ShapeOp or ShapeShiftOp with the same extents. If \p arrLoad is
/// loading a `!fir.box`, code will be generated to read the extents from the
/// boxed value, and the returned shape Op will be built with the extents read
/// from the box. Otherwise, the extents will be extracted from the ShapeOp (or
/// ShapeShiftOp) argument of \p arrLoad. \p copyUsingSlice will be set to true
/// if slicing of the output array is to be done in the copy-in/copy-out rather
/// than in the elemental computation step.
static mlir::Value getOrReadExtentsAndShapeOp(
    mlir::Location loc, mlir::PatternRewriter &rewriter, ArrayLoadOp arrLoad,
    llvm::SmallVectorImpl<mlir::Value> &result, bool &copyUsingSlice) {
  assert(result.empty());
  if (arrLoad->hasAttr(fir::getOptionalAttrName()))
    fir::emitFatalError(
        loc, "shapes from array load of OPTIONAL arrays must not be used");
  if (auto boxTy = mlir::dyn_cast<BoxType>(arrLoad.getMemref().getType())) {
    // Boxed entity: generate fir.box_dims reads for each dimension's extent.
    auto rank =
        mlir::cast<SequenceType>(dyn_cast_ptrOrBoxEleTy(boxTy)).getDimension();
    auto idxTy = rewriter.getIndexType();
    for (decltype(rank) dim = 0; dim < rank; ++dim) {
      auto dimVal = rewriter.create<mlir::arith::ConstantIndexOp>(loc, dim);
      auto dimInfo = rewriter.create<BoxDimsOp>(loc, idxTy, idxTy, idxTy,
                                                arrLoad.getMemref(), dimVal);
      result.emplace_back(dimInfo.getResult(1));
    }
    if (!arrLoad.getShape()) {
      auto shapeType = ShapeType::get(rewriter.getContext(), rank);
      return rewriter.create<ShapeOp>(loc, shapeType, result);
    }
    // A shape operand on a boxed array load only carries lower bounds and
    // must be a fir.shift. Guard against malformed IR instead of
    // dereferencing a null op below.
    auto shiftOp = arrLoad.getShape().getDefiningOp<ShiftOp>();
    if (!shiftOp)
      fir::emitFatalError(
          loc, "shape argument of array load on fir.box must be a fir.shift");
    // Interleave the fir.shift lower bounds with the extents read from the
    // box to build a fir.shape_shift.
    auto shapeShiftType = ShapeShiftType::get(rewriter.getContext(), rank);
    llvm::SmallVector<mlir::Value> shapeShiftOperands;
    for (auto [lb, extent] : llvm::zip(shiftOp.getOrigins(), result)) {
      shapeShiftOperands.push_back(lb);
      shapeShiftOperands.push_back(extent);
    }
    return rewriter.create<ShapeShiftOp>(loc, shapeShiftType,
                                         shapeShiftOperands);
  }
  // Unboxed entity: the extents come directly from the shape operand.
  copyUsingSlice =
      getAdjustedExtents(loc, rewriter, arrLoad, result, arrLoad.getShape());
  return arrLoad.getShape();
}

/// Wrap \p ty in a !fir.ref unless it is already a reference type.
static mlir::Type toRefType(mlir::Type ty) {
  return fir::isa_ref_type(ty) ? ty : fir::ReferenceType::get(ty);
}

/// Collect the type parameters of \p arrLoad, but only when addressing raw
/// data (type \p ty is not a box). A boxed entity carries its own type
/// parameters, so none are needed in that case.
static llvm::SmallVector<mlir::Value>
getTypeParamsIfRawData(mlir::Location loc, FirOpBuilder &builder,
                       ArrayLoadOp arrLoad, mlir::Type ty) {
  if (!mlir::isa<BoxType>(ty))
    return fir::factory::getTypeParams(loc, builder, arrLoad);
  return {};
}

/// Generate a fir.array_coor addressing the element of \p alloc designated
/// by \p indices, followed by a fir.coordinate_of when the indices extend
/// into a component of the element (more indices than array dimensions).
/// \p eleTy is the element reference type and \p resTy the final result
/// type. When \p skipOrig is set (the indices already carry Fortran array
/// offsets), the indices are used as-is; otherwise they are rebased to the
/// array's origin.
static mlir::Value genCoorOp(mlir::PatternRewriter &rewriter,
                             mlir::Location loc, mlir::Type eleTy,
                             mlir::Type resTy, mlir::Value alloc,
                             mlir::Value shape, mlir::Value slice,
                             mlir::ValueRange indices, ArrayLoadOp load,
                             bool skipOrig = false) {
  llvm::SmallVector<mlir::Value> originated;
  if (skipOrig)
    originated.assign(indices.begin(), indices.end());
  else
    originated = factory::originateIndices(loc, rewriter, alloc.getType(),
                                           shape, indices);
  auto seqTy = dyn_cast_ptrOrBoxEleTy(alloc.getType());
  assert(seqTy && mlir::isa<SequenceType>(seqTy));
  const auto dimension = mlir::cast<SequenceType>(seqTy).getDimension();
  auto module = load->getParentOfType<mlir::ModuleOp>();
  FirOpBuilder builder(rewriter, module);
  auto typeparams = getTypeParamsIfRawData(loc, builder, load, alloc.getType());
  // The first `dimension` indices address the array element itself.
  mlir::Value result = rewriter.create<ArrayCoorOp>(
      loc, eleTy, alloc, shape, slice,
      llvm::ArrayRef<mlir::Value>{originated}.take_front(dimension),
      typeparams);
  // Any remaining indices drill into a component of the element.
  if (dimension < originated.size())
    result = rewriter.create<fir::CoordinateOp>(
        loc, resTy, result,
        llvm::ArrayRef<mlir::Value>{originated}.drop_front(dimension));
  return result;
}

/// Compute the CHARACTER length of the elements of the array loaded by
/// \p load. The length comes from the character type itself when constant,
/// from the box element size when the load is of an emboxed value, or from
/// the array_load type parameters otherwise.
static mlir::Value getCharacterLen(mlir::Location loc, FirOpBuilder &builder,
                                   ArrayLoadOp load, CharacterType charTy) {
  auto charLenTy = builder.getCharacterLengthType();
  if (!charTy.hasDynamicLen())
    // The typical case: the length of the CHARACTER is a compile-time
    // constant that is encoded in the type information.
    return builder.createIntegerConstant(loc, charLenTy, charTy.getLen());
  if (mlir::isa<BoxType>(load.getMemref().getType())) {
    // The loaded array is an emboxed value. Derive the CHARACTER length by
    // dividing the element size in bytes (from the box) by the byte size of
    // the character kind.
    auto eleSzInBytes =
        builder.create<BoxEleSizeOp>(loc, charLenTy, load.getMemref());
    auto kindSize = builder.getKindMap().getCharacterBitsize(charTy.getFKind());
    auto kindByteSize =
        builder.createIntegerConstant(loc, charLenTy, kindSize / 8);
    return builder.create<mlir::arith::DivSIOp>(loc, eleSzInBytes,
                                                kindByteSize);
  }
  // The loaded array is a (set of) unboxed values. A non-constant CHARACTER
  // length must have been provided as a type parameter to the array_load.
  auto typeparams = load.getTypeparams();
  assert(typeparams.size() > 0 && "expected type parameters on array_load");
  return typeparams.back();
}
/// Generate a shallow array copy. This is used for both copy-in and copy-out.
/// A loop nest is built over the (possibly slice-adjusted) extents and a
/// scalar assignment copies each element between \p src and \p dst.
/// \tparam CopyIn  true when copying the original array into the temporary;
///                 selects which side of the copy applies the slice.
template <bool CopyIn>
void genArrayCopy(mlir::Location loc, mlir::PatternRewriter &rewriter,
                  mlir::Value dst, mlir::Value src, mlir::Value shapeOp,
                  mlir::Value sliceOp, ArrayLoadOp arrLoad) {
  auto insPt = rewriter.saveInsertionPoint();
  llvm::SmallVector<mlir::Value> indices;
  llvm::SmallVector<mlir::Value> extents;
  bool copyUsingSlice =
      getAdjustedExtents(loc, rewriter, arrLoad, extents, shapeOp);
  auto idxTy = rewriter.getIndexType();
  // Build loop nest from column to row.
  for (auto sh : llvm::reverse(extents)) {
    auto ubi = rewriter.create<ConvertOp>(loc, idxTy, sh);
    auto zero = rewriter.create<mlir::arith::ConstantIndexOp>(loc, 0);
    auto one = rewriter.create<mlir::arith::ConstantIndexOp>(loc, 1);
    // Loops run from 0 to extent-1, inclusive.
    auto ub = rewriter.create<mlir::arith::SubIOp>(loc, idxTy, ubi, one);
    auto loop = rewriter.create<DoLoopOp>(loc, zero, ub, one);
    // Subsequent loops (and the copy itself) are nested in this loop's body.
    rewriter.setInsertionPointToStart(loop.getBody());
    indices.push_back(loop.getInductionVar());
  }
  // Reverse the indices so they are in column-major order.
  std::reverse(indices.begin(), indices.end());
  auto module = arrLoad->getParentOfType<mlir::ModuleOp>();
  FirOpBuilder builder(rewriter, module);
  // Address the source element; the slice applies to the original array,
  // which is the source on copy-in.
  auto fromAddr = rewriter.create<ArrayCoorOp>(
      loc, getEleTy(src.getType()), src, shapeOp,
      CopyIn && copyUsingSlice ? sliceOp : mlir::Value{},
      factory::originateIndices(loc, rewriter, src.getType(), shapeOp, indices),
      getTypeParamsIfRawData(loc, builder, arrLoad, src.getType()));
  // Address the destination element; the slice applies to the original
  // array, which is the destination on copy-out.
  auto toAddr = rewriter.create<ArrayCoorOp>(
      loc, getEleTy(dst.getType()), dst, shapeOp,
      !CopyIn && copyUsingSlice ? sliceOp : mlir::Value{},
      factory::originateIndices(loc, rewriter, dst.getType(), shapeOp, indices),
      getTypeParamsIfRawData(loc, builder, arrLoad, dst.getType()));
  auto eleTy = unwrapSequenceType(unwrapPassByRefType(dst.getType()));
  // Copy from (to) object to (from) temp copy of same object.
  if (auto charTy = mlir::dyn_cast<CharacterType>(eleTy)) {
    auto len = getCharacterLen(loc, builder, arrLoad, charTy);
    CharBoxValue toChar(toAddr, len);
    CharBoxValue fromChar(fromAddr, len);
    factory::genScalarAssignment(builder, loc, toChar, fromChar);
  } else {
    if (hasDynamicSize(eleTy))
      TODO(loc, "copy element of dynamic size");
    factory::genScalarAssignment(builder, loc, toAddr, fromAddr);
  }
  // Restore the insertion point to where it was before the loop nest.
  rewriter.restoreInsertionPoint(insPt);
}

/// The array load may be either a boxed or unboxed value. If the value is
/// boxed, we read the type parameters from the boxed value. If the load
/// already carries explicit type parameters, those are returned directly.
static llvm::SmallVector<mlir::Value>
genArrayLoadTypeParameters(mlir::Location loc, mlir::PatternRewriter &rewriter,
                           ArrayLoadOp load) {
  if (!load.getTypeparams().empty())
    return load.getTypeparams();
  auto eleTy =
      unwrapSequenceType(unwrapPassByRefType(load.getMemref().getType()));
  if (hasDynamicSize(eleTy)) {
    if (auto charTy = mlir::dyn_cast<CharacterType>(eleTy)) {
      // The only way a dynamic-length CHARACTER reaches here without type
      // parameters is through a box; read the length from it.
      assert(mlir::isa<BoxType>(load.getMemref().getType()));
      auto module = load->getParentOfType<mlir::ModuleOp>();
      FirOpBuilder builder(rewriter, module);
      return {getCharacterLen(loc, builder, load, charTy)};
    }
    TODO(loc, "unhandled dynamic type parameters");
  }
  // Statically sized element: no type parameters needed.
  return {};
}

/// Return the subset of \p extents whose corresponding dimension in
/// \p memrefTy has an unknown compile-time extent. Any trailing extents
/// beyond the sequence type's rank are always kept.
static llvm::SmallVector<mlir::Value>
findNonconstantExtents(mlir::Type memrefTy,
                       llvm::ArrayRef<mlir::Value> extents) {
  llvm::SmallVector<mlir::Value> dynamicExtents;
  auto seqTy = mlir::cast<SequenceType>(unwrapPassByRefType(memrefTy));
  auto staticShape = seqTy.getShape();
  for (auto [staticExtent, extent] : llvm::zip(staticShape, extents))
    if (staticExtent == SequenceType::getUnknownExtent())
      dynamicExtents.emplace_back(extent);
  if (extents.size() > staticShape.size())
    for (auto extra : extents.drop_front(staticShape.size()))
      dynamicExtents.emplace_back(extra);
  return dynamicExtents;
}

/// Allocate temporary storage for an ArrayLoadOp \load and initialize any
/// allocatable direct components of the array elements with an unallocated
/// status. Returns the temporary address as well as a callback to generate the
/// temporary clean-up once it has been used. The clean-up will take care of
/// deallocating all the element allocatable components that may have been
/// allocated while using the temporary.
static std::pair<mlir::Value,
                 std::function<void(mlir::PatternRewriter &rewriter)>>
allocateArrayTemp(mlir::Location loc, mlir::PatternRewriter &rewriter,
                  ArrayLoadOp load, llvm::ArrayRef<mlir::Value> extents,
                  mlir::Value shape) {
  mlir::Type baseType = load.getMemref().getType();
  // fir.allocmem only takes the extents that are not compile-time constant.
  llvm::SmallVector<mlir::Value> nonconstantExtents =
      findNonconstantExtents(baseType, extents);
  llvm::SmallVector<mlir::Value> typeParams =
      genArrayLoadTypeParameters(loc, rewriter, load);
  mlir::Value allocmem = rewriter.create<AllocMemOp>(
      loc, dyn_cast_ptrOrBoxEleTy(baseType), typeParams, nonconstantExtents);
  mlir::Type eleType =
      fir::unwrapSequenceType(fir::unwrapPassByRefType(baseType));
  if (fir::isRecordWithAllocatableMember(eleType)) {
    // The allocatable component descriptors need to be set to a clean
    // deallocated status before anything is done with them.
    mlir::Value box = rewriter.create<fir::EmboxOp>(
        loc, fir::BoxType::get(allocmem.getType()), allocmem, shape,
        /*slice=*/mlir::Value{}, typeParams);
    auto module = load->getParentOfType<mlir::ModuleOp>();
    FirOpBuilder builder(rewriter, module);
    runtime::genDerivedTypeInitialize(builder, loc, box);
    // Any allocatable component that may have been allocated must be
    // deallocated during the clean-up.
    // Note: captures by value so the callback stays valid after this
    // function returns.
    auto cleanup = [=](mlir::PatternRewriter &r) {
      FirOpBuilder builder(r, module);
      runtime::genDerivedTypeDestroy(builder, loc, box);
      r.create<FreeMemOp>(loc, allocmem);
    };
    return {allocmem, cleanup};
  }
  // No allocatable components: clean-up only frees the temporary storage.
  auto cleanup = [=](mlir::PatternRewriter &r) {
    r.create<FreeMemOp>(loc, allocmem);
  };
  return {allocmem, cleanup};
}

namespace {
/// Conversion of fir.array_update and fir.array_modify Ops.
/// If there is a conflict for the update, then we need to perform a
/// copy-in/copy-out to preserve the original values of the array. If there is
/// no conflict, then it is safe to eschew making any copies.
template <typename ArrayOp>
class ArrayUpdateConversionBase : public mlir::OpRewritePattern<ArrayOp> {
public:
  // TODO: Implement copy/swap semantics?
  explicit ArrayUpdateConversionBase(mlir::MLIRContext *ctx,
                                     const ArrayCopyAnalysisBase &a,
                                     const OperationUseMapT &m)
      : mlir::OpRewritePattern<ArrayOp>{ctx}, analysis{a}, useMap{m} {}

  /// The array_access, \p access, is to be to a cloned copy due to a potential
  /// conflict. Uses copy-in/copy-out semantics and not copy/swap.
  /// Returns the address of the accessed element inside the temporary copy.
  mlir::Value referenceToClone(mlir::Location loc,
                               mlir::PatternRewriter &rewriter,
                               ArrayOp access) const {
    LLVM_DEBUG(llvm::dbgs()
               << "generating copy-in/copy-out loops for " << access << '\n');
    auto *op = access.getOperation();
    // The use map links this access back to its defining array_load.
    auto *loadOp = useMap.lookup(op);
    auto load = mlir::cast<ArrayLoadOp>(loadOp);
    auto eleTy = access.getType();
    // The temporary and the copy-in loops are created before the load.
    rewriter.setInsertionPoint(loadOp);
    // Copy in.
    llvm::SmallVector<mlir::Value> extents;
    bool copyUsingSlice = false;
    auto shapeOp = getOrReadExtentsAndShapeOp(loc, rewriter, load, extents,
                                              copyUsingSlice);
    auto [allocmem, genTempCleanUp] =
        allocateArrayTemp(loc, rewriter, load, extents, shapeOp);
    genArrayCopy</*copyIn=*/true>(load.getLoc(), rewriter, allocmem,
                                  load.getMemref(), shapeOp, load.getSlice(),
                                  load);
    // Generate the reference for the access.
    rewriter.setInsertionPoint(op);
    auto coor = genCoorOp(
        rewriter, loc, getEleTy(load.getType()), eleTy, allocmem, shapeOp,
        copyUsingSlice ? mlir::Value{} : load.getSlice(), access.getIndices(),
        load, access->hasAttr(factory::attrFortranArrayOffsets()));
    // Copy out.
    // The copy-out loops and the temporary clean-up are inserted before the
    // array_merge_store that consumes the load.
    auto *storeOp = useMap.lookup(loadOp);
    auto store = mlir::cast<ArrayMergeStoreOp>(storeOp);
    rewriter.setInsertionPoint(storeOp);
    // Copy out.
    genArrayCopy</*copyIn=*/false>(store.getLoc(), rewriter, store.getMemref(),
                                   allocmem, shapeOp, store.getSlice(), load);
    genTempCleanUp(rewriter);
    return coor;
  }

  /// Copy the RHS element into the LHS and insert copy-in/copy-out between a
  /// temp and the LHS if the analysis found potential overlaps between the RHS
  /// and LHS arrays. The element copy generator must be provided in \p
  /// assignElement. \p update must be the ArrayUpdateOp or the ArrayModifyOp.
  /// Returns the address of the LHS element inside the loop and the LHS
  /// ArrayLoad result.
  std::pair<mlir::Value, mlir::Value>
  materializeAssignment(mlir::Location loc, mlir::PatternRewriter &rewriter,
                        ArrayOp update,
                        const std::function<void(mlir::Value)> &assignElement,
                        mlir::Type lhsEltRefType) const {
    auto *op = update.getOperation();
    auto *loadOp = useMap.lookup(op);
    auto load = mlir::cast<ArrayLoadOp>(loadOp);
    LLVM_DEBUG(llvm::outs() << "does " << load << " have a conflict?\n");
    if (analysis.hasPotentialConflict(loadOp)) {
      // If there is a conflict between the arrays, then we copy the lhs array
      // to a temporary, update the temporary, and copy the temporary back to
      // the lhs array. This yields Fortran's copy-in copy-out array semantics.
      LLVM_DEBUG(llvm::outs() << "Yes, conflict was found\n");
      rewriter.setInsertionPoint(loadOp);
      // Copy in.
      llvm::SmallVector<mlir::Value> extents;
      bool copyUsingSlice = false;
      auto shapeOp = getOrReadExtentsAndShapeOp(loc, rewriter, load, extents,
                                                copyUsingSlice);
      auto [allocmem, genTempCleanUp] =
          allocateArrayTemp(loc, rewriter, load, extents, shapeOp);

      genArrayCopy</*copyIn=*/true>(load.getLoc(), rewriter, allocmem,
                                    load.getMemref(), shapeOp, load.getSlice(),
                                    load);
      // Address the element inside the temporary and let the caller emit the
      // actual element assignment.
      rewriter.setInsertionPoint(op);
      auto coor = genCoorOp(
          rewriter, loc, getEleTy(load.getType()), lhsEltRefType, allocmem,
          shapeOp, copyUsingSlice ? mlir::Value{} : load.getSlice(),
          update.getIndices(), load,
          update->hasAttr(factory::attrFortranArrayOffsets()));
      assignElement(coor);
      auto *storeOp = useMap.lookup(loadOp);
      auto store = mlir::cast<ArrayMergeStoreOp>(storeOp);
      rewriter.setInsertionPoint(storeOp);
      // Copy out.
      genArrayCopy</*copyIn=*/false>(store.getLoc(), rewriter,
                                     store.getMemref(), allocmem, shapeOp,
                                     store.getSlice(), load);
      genTempCleanUp(rewriter);
      return {coor, load.getResult()};
    }
    // Otherwise, when there is no conflict (a possible loop-carried
    // dependence), the lhs array can be updated in place.
    LLVM_DEBUG(llvm::outs() << "No, conflict wasn't found\n");
    rewriter.setInsertionPoint(op);
    auto coorTy = getEleTy(load.getType());
    auto coor =
        genCoorOp(rewriter, loc, coorTy, lhsEltRefType, load.getMemref(),
                  load.getShape(), load.getSlice(), update.getIndices(), load,
                  update->hasAttr(factory::attrFortranArrayOffsets()));
    assignElement(coor);
    return {coor, load.getResult()};
  }

protected:
  // Conflict analysis results (which loads need copy-in/copy-out).
  const ArrayCopyAnalysisBase &analysis;
  // Maps array access ops to their array_load, and array_load to its
  // array_merge_store.
  const OperationUseMapT &useMap;
};

/// Rewrite fir.array_update to a store of the merged value through the
/// address of the lhs element, inserting copy-in/copy-out when the analysis
/// found a potential conflict.
class ArrayUpdateConversion : public ArrayUpdateConversionBase<ArrayUpdateOp> {
public:
  explicit ArrayUpdateConversion(mlir::MLIRContext *ctx,
                                 const ArrayCopyAnalysisBase &a,
                                 const OperationUseMapT &m)
      : ArrayUpdateConversionBase{ctx, a, m} {}

  llvm::LogicalResult
  matchAndRewrite(ArrayUpdateOp update,
                  mlir::PatternRewriter &rewriter) const override {
    auto loc = update.getLoc();
    // Store the merged value into the element address computed by
    // materializeAssignment.
    auto assignElement = [&](mlir::Value coor) {
      auto input = update.getMerge();
      if (auto inEleTy = dyn_cast_ptrEleTy(input.getType())) {
        // By-reference element assignment is not implemented for this op.
        emitFatalError(loc, "array_update on references not supported");
      } else {
        rewriter.create<fir::StoreOp>(loc, input, coor);
      }
    };
    auto lhsEltRefType = toRefType(update.getMerge().getType());
    auto [_, lhsLoadResult] = materializeAssignment(
        loc, rewriter, update, assignElement, lhsEltRefType);
    // The array_update result is the (conceptually updated) array value,
    // which after conversion is just the array_load result.
    update.replaceAllUsesWith(lhsLoadResult);
    rewriter.replaceOp(update, lhsLoadResult);
    return mlir::success();
  }
};

/// Rewrite fir.array_modify to the address of the lhs element. No element
/// store is emitted here: lowering already materialized the assignment
/// through the returned element reference.
class ArrayModifyConversion : public ArrayUpdateConversionBase<ArrayModifyOp> {
public:
  explicit ArrayModifyConversion(mlir::MLIRContext *ctx,
                                 const ArrayCopyAnalysisBase &a,
                                 const OperationUseMapT &m)
      : ArrayUpdateConversionBase{ctx, a, m} {}

  llvm::LogicalResult
  matchAndRewrite(ArrayModifyOp modify,
                  mlir::PatternRewriter &rewriter) const override {
    auto loc = modify.getLoc();
    // The first result of array_modify already has the element reference
    // type.
    auto lhsEltRefType = modify.getResult(0).getType();
    // Assignment already materialized by lowering using lhs element address,
    // so the per-element callback has nothing to do.
    auto noopAssign = [](mlir::Value) {};
    auto [lhsEltCoor, lhsLoadResult] =
        materializeAssignment(loc, rewriter, modify, noopAssign, lhsEltRefType);
    modify.replaceAllUsesWith(mlir::ValueRange{lhsEltCoor, lhsLoadResult});
    rewriter.replaceOp(modify, mlir::ValueRange{lhsEltCoor, lhsLoadResult});
    return mlir::success();
  }
};

/// Rewrite fir.array_fetch to direct addressing of the element: a coordinate
/// computation, followed by a fir.load when the fetch produces a value
/// rather than a reference.
class ArrayFetchConversion : public mlir::OpRewritePattern<ArrayFetchOp> {
public:
  explicit ArrayFetchConversion(mlir::MLIRContext *ctx,
                                const OperationUseMapT &m)
      : OpRewritePattern{ctx}, useMap{m} {}

  llvm::LogicalResult
  matchAndRewrite(ArrayFetchOp fetch,
                  mlir::PatternRewriter &rewriter) const override {
    auto *op = fetch.getOperation();
    rewriter.setInsertionPoint(op);
    auto load = mlir::cast<ArrayLoadOp>(useMap.lookup(op));
    auto loc = fetch.getLoc();
    auto coor = genCoorOp(
        rewriter, loc, getEleTy(load.getType()), toRefType(fetch.getType()),
        load.getMemref(), load.getShape(), load.getSlice(), fetch.getIndices(),
        load, fetch->hasAttr(factory::attrFortranArrayOffsets()));
    if (isa_ref_type(fetch.getType())) {
      // The fetch is in the reference domain: the coordinate is the result.
      rewriter.replaceOp(fetch, coor);
      return mlir::success();
    }
    // Value domain: load the element through the coordinate.
    rewriter.replaceOpWithNewOp<fir::LoadOp>(fetch, coor);
    return mlir::success();
  }

private:
  // Maps each array_fetch to its defining array_load.
  const OperationUseMapT &useMap;
};

/// An array_access op is like an array_fetch op, except that it does not imply
/// a load op. (It operates in the reference domain.)
class ArrayAccessConversion : public ArrayUpdateConversionBase<ArrayAccessOp> {
public:
  explicit ArrayAccessConversion(mlir::MLIRContext *ctx,
                                 const ArrayCopyAnalysisBase &a,
                                 const OperationUseMapT &m)
      : ArrayUpdateConversionBase{ctx, a, m} {}

  llvm::LogicalResult
  matchAndRewrite(ArrayAccessOp access,
                  mlir::PatternRewriter &rewriter) const override {
    auto *op = access.getOperation();
    auto loc = access.getLoc();
    if (analysis.inAmendAccessSet(op)) {
      // This array_access is associated with an array_amend and there is a
      // conflict. Make a copy to store into.
      auto result = referenceToClone(loc, rewriter, access);
      access.replaceAllUsesWith(result);
      rewriter.replaceOp(access, result);
      return mlir::success();
    }
    // No conflict: address the element of the original array directly.
    rewriter.setInsertionPoint(op);
    auto load = mlir::cast<ArrayLoadOp>(useMap.lookup(op));
    auto coor = genCoorOp(
        rewriter, loc, getEleTy(load.getType()), toRefType(access.getType()),
        load.getMemref(), load.getShape(), load.getSlice(), access.getIndices(),
        load, access->hasAttr(factory::attrFortranArrayOffsets()));
    rewriter.replaceOp(access, coor);
    return mlir::success();
  }
};

/// An array_amend op is a marker to record which array access is being used to
/// update an array value. After this pass runs, an array_amend has no
/// semantics. We rewrite these to undefined values here to remove them while
/// preserving SSA form.
class ArrayAmendConversion : public mlir::OpRewritePattern<ArrayAmendOp> {
public:
  explicit ArrayAmendConversion(mlir::MLIRContext *ctx)
      : OpRewritePattern{ctx} {}

  llvm::LogicalResult
  matchAndRewrite(ArrayAmendOp amend,
                  mlir::PatternRewriter &rewriter) const override {
    rewriter.setInsertionPoint(amend.getOperation());
    // Replace the marker with an undef of the same array value type.
    rewriter.replaceOpWithNewOp<UndefOp>(amend, amend.getType());
    return mlir::success();
  }
};

/// Driver for the array-value-copy pass. Runs the conflict analysis on the
/// function, then converts out of array value form in two phases: first the
/// access/update ops, then the load/store bracket ops.
class ArrayValueCopyConverter
    : public fir::impl::ArrayValueCopyBase<ArrayValueCopyConverter> {
public:
  ArrayValueCopyConverter() = default;
  ArrayValueCopyConverter(const fir::ArrayValueCopyOptions &options)
      : Base(options) {}

  void runOnOperation() override {
    auto func = getOperation();
    LLVM_DEBUG(llvm::dbgs() << "\n\narray-value-copy pass on function '"
                            << func.getName() << "'\n");
    auto *context = &getContext();

    // Perform the conflict analysis.
    // The optimized variant is selected by the pass option.
    const ArrayCopyAnalysisBase *analysis;
    if (optimizeConflicts)
      analysis = &getAnalysis<ArrayCopyAnalysisOptimized>();
    else
      analysis = &getAnalysis<ArrayCopyAnalysis>();

    const auto &useMap = analysis->getUseMap();

    // Phase 1: rewrite the element access/update ops. The load/store ops
    // must survive this phase because the patterns navigate through them.
    mlir::RewritePatternSet patterns1(context);
    patterns1.insert<ArrayFetchConversion>(context, useMap);
    patterns1.insert<ArrayUpdateConversion>(context, *analysis, useMap);
    patterns1.insert<ArrayModifyConversion>(context, *analysis, useMap);
    patterns1.insert<ArrayAccessConversion>(context, *analysis, useMap);
    patterns1.insert<ArrayAmendConversion>(context);
    mlir::ConversionTarget target(*context);
    target
        .addLegalDialect<FIROpsDialect, mlir::scf::SCFDialect,
                         mlir::arith::ArithDialect, mlir::func::FuncDialect>();
    target.addIllegalOp<ArrayAccessOp, ArrayAmendOp, ArrayFetchOp,
                        ArrayUpdateOp, ArrayModifyOp>();
    // Rewrite the array fetch and array update ops.
    if (mlir::failed(
            mlir::applyPartialConversion(func, target, std::move(patterns1)))) {
      mlir::emitError(mlir::UnknownLoc::get(context),
                      "failure in array-value-copy pass, phase 1");
      signalPassFailure();
    }

    // Phase 2: the array_load/array_merge_store ops are now dead; replace
    // loads with undef and erase the stores.
    mlir::RewritePatternSet patterns2(context);
    patterns2.insert<ArrayLoadConversion>(context);
    patterns2.insert<ArrayMergeStoreConversion>(context);
    target.addIllegalOp<ArrayLoadOp, ArrayMergeStoreOp>();
    if (mlir::failed(
            mlir::applyPartialConversion(func, target, std::move(patterns2)))) {
      mlir::emitError(mlir::UnknownLoc::get(context),
                      "failure in array-value-copy pass, phase 2");
      signalPassFailure();
    }
  }
};
} // namespace

/// Create an instance of the array-value-copy pass configured with
/// \p options.
std::unique_ptr<mlir::Pass>
fir::createArrayValueCopyPass(fir::ArrayValueCopyOptions options) {
  auto pass = std::make_unique<ArrayValueCopyConverter>(options);
  return pass;
}