ARMLowOverheadLoops.cpp | Explore in Territory

//===-- ARMLowOverheadLoops.cpp - CodeGen Low-overhead Loops ---*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// Finalize v8.1-m low-overhead loops by converting the associated pseudo
/// instructions into machine operations.
/// The expectation is that the loop contains three pseudo instructions:
/// - t2*LoopStart - placed in the preheader or pre-preheader. The do-loop
///   form should be in the preheader, whereas the while form should be in the
///   preheaders only predecessor.
/// - t2LoopDec - placed within in the loop body.
/// - t2LoopEnd - the loop latch terminator.
///
/// In addition to this, we also look for the presence of the VCTP instruction,
/// which determines whether we can generated the tail-predicated low-overhead
/// loop form.
///
/// Assumptions and Dependencies:
/// Low-overhead loops are constructed and executed using a setup instruction:
/// DLS, WLS, DLSTP or WLSTP and an instruction that loops back: LE or LETP.
/// WLS(TP) and LE(TP) are branching instructions with a (large) limited range
/// but fixed polarity: WLS can only branch forwards and LE can only branch
/// backwards. These restrictions mean that this pass is dependent upon block
/// layout and block sizes, which is why it's the last pass to run. The same is
/// true for ConstantIslands, but this pass does not increase the size of the
/// basic blocks, nor does it change the CFG. Instructions are mainly removed
/// during the transform and pseudo instructions are replaced by real ones. In
/// some cases, when we have to revert to a 'normal' loop, we have to introduce
/// multiple instructions for a single pseudo (see RevertWhile and
/// RevertLoopEnd). To handle this situation, t2WhileLoopStartLR and t2LoopEnd
/// are defined to be as large as this maximum sequence of replacement
/// instructions.
///
/// A note on VPR.P0 (the lane mask):
/// VPT, VCMP, VPNOT and VCTP won't overwrite VPR.P0 when they update it in a
/// "VPT Active" context (which includes low-overhead loops and vpt blocks).
/// They will simply "and" the result of their calculation with the current
/// value of VPR.P0. You can think of it like this:
/// \verbatim
/// if VPT active:    ; Between a DLSTP/LETP, or for predicated instrs
///   VPR.P0 &= Value
/// else
///   VPR.P0 = Value
/// \endverbatim
/// When we're inside the low-overhead loop (between DLSTP and LETP), we always
/// fall in the "VPT active" case, so we can consider that all VPR writes by
/// one of those instruction is actually a "and".
//===----------------------------------------------------------------------===//

#include "ARM.h"
#include "ARMBaseInstrInfo.h"
#include "ARMBaseRegisterInfo.h"
#include "ARMBasicBlockInfo.h"
#include "ARMSubtarget.h"
#include "MVETailPredUtils.h"
#include "Thumb2InstrInfo.h"
#include "llvm/ADT/SetOperations.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/CodeGen/LivePhysRegs.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachineLoopUtils.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/ReachingDefAnalysis.h"
#include "llvm/MC/MCInstrDesc.h"

usingnamespacellvm;

#define DEBUG_TYPE …
#define ARM_LOW_OVERHEAD_LOOPS_NAME …

static cl::opt<bool>
DisableTailPredication("arm-loloops-disable-tailpred", cl::Hidden,
    cl::desc("Disable tail-predication in the ARM LowOverheadLoop pass"),
    cl::init(false));

static cl::opt<bool>
    DisableOmitDLS("arm-disable-omit-dls", cl::Hidden,
                   cl::desc("Disable omitting 'dls lr, lr' instructions"),
                   cl::init(false));

static bool isVectorPredicated(MachineInstr *MI) { … }

static bool isVectorPredicate(MachineInstr *MI) { … }

static bool hasVPRUse(MachineInstr &MI) { … }

static bool isDomainMVE(MachineInstr *MI) { … }

static int getVecSize(const MachineInstr &MI) { … }

static bool shouldInspect(MachineInstr &MI) { … }

static bool isHorizontalReduction(const MachineInstr &MI) { … }

namespace {

  InstSet;

  class PostOrderLoopTraversal { … };

  class VPTBlock { … };

  // Represent the current state of the VPR and hold all instances which
  // represent a VPT block, which is a list of instructions that begins with a
  // VPT/VPST and has a maximum of four proceeding instructions. All
  // instructions within the block are predicated upon the vpr and we allow
  // instructions to define the vpr within in the block too.
  class VPTState { … };

  struct LowOverheadLoop { … };

  class ARMLowOverheadLoops : public MachineFunctionPass { … };
}

char ARMLowOverheadLoops::ID = …;

INITIALIZE_PASS(…)

static bool TryRemove(MachineInstr *MI, ReachingDefAnalysis &RDA,
                      InstSet &ToRemove, InstSet &Ignore) { … }

bool LowOverheadLoop::ValidateTailPredicate() { … }

static bool isRegInClass(const MachineOperand &MO,
                         const TargetRegisterClass *Class) { … }

// MVE 'narrowing' operate on half a lane, reading from half and writing
// to half, which are referred to has the top and bottom half. The other
// half retains its previous value.
static bool retainsPreviousHalfElement(const MachineInstr &MI) { … }

// Some MVE instructions read from the top/bottom halves of their operand(s)
// and generate a vector result with result elements that are double the
// width of the input.
static bool producesDoubleWidthResult(const MachineInstr &MI) { … }

// Can this instruction generate a non-zero result when given only zeroed
// operands? This allows us to know that, given operands with false bytes
// zeroed by masked loads, that the result will also contain zeros in those
// bytes.
static bool canGenerateNonZeros(const MachineInstr &MI) { … }

// Look at its register uses to see if it only can only receive zeros
// into its false lanes which would then produce zeros. Also check that
// the output register is also defined by an FalseLanesZero instruction
// so that if tail-predication happens, the lanes that aren't updated will
// still be zeros.
static bool producesFalseLanesZero(MachineInstr &MI,
                                   const TargetRegisterClass *QPRs,
                                   const ReachingDefAnalysis &RDA,
                                   InstSet &FalseLanesZero) { … }

bool LowOverheadLoop::ValidateLiveOuts() { … }

void LowOverheadLoop::Validate(ARMBasicBlockUtils *BBUtils) { … }

bool LowOverheadLoop::AddVCTP(MachineInstr *MI) { … }

static bool ValidateMVEStore(MachineInstr *MI, MachineLoop *ML) { … }

bool LowOverheadLoop::ValidateMVEInst(MachineInstr *MI) { … }

bool ARMLowOverheadLoops::runOnMachineFunction(MachineFunction &mf) { … }

bool ARMLowOverheadLoops::ProcessLoop(MachineLoop *ML) { … }

// WhileLoopStart holds the exit block, so produce a cmp lr, 0 and then a
// beq that branches to the exit branch.
// TODO: We could also try to generate a cbz if the value in LR is also in
// another low register.
void ARMLowOverheadLoops::RevertWhile(MachineInstr *MI) const { … }

void ARMLowOverheadLoops::RevertDo(MachineInstr *MI) const { … }

bool ARMLowOverheadLoops::RevertLoopDec(MachineInstr *MI) const { … }

// Generate a subs, or sub and cmp, and a branch instead of an LE.
void ARMLowOverheadLoops::RevertLoopEnd(MachineInstr *MI, bool SkipCmp) const { … }

// Generate a subs, or sub and cmp, and a branch instead of an LE.
void ARMLowOverheadLoops::RevertLoopEndDec(MachineInstr *MI) const { … }

// Perform dead code elimation on the loop iteration count setup expression.
// If we are tail-predicating, the number of elements to be processed is the
// operand of the VCTP instruction in the vector body, see getCount(), which is
// register $r3 in this example:
//
//   $lr = big-itercount-expression
//   ..
//   $lr = t2DoLoopStart renamable $lr
//   vector.body:
//     ..
//     $vpr = MVE_VCTP32 renamable $r3
//     renamable $lr = t2LoopDec killed renamable $lr, 1
//     t2LoopEnd renamable $lr, %vector.body
//     tB %end
//
// What we would like achieve here is to replace the do-loop start pseudo
// instruction t2DoLoopStart with:
//
//    $lr = MVE_DLSTP_32 killed renamable $r3
//
// Thus, $r3 which defines the number of elements, is written to $lr,
// and then we want to delete the whole chain that used to define $lr,
// see the comment below how this chain could look like.
//
void ARMLowOverheadLoops::IterationCountDCE(LowOverheadLoop &LoLoop) { … }

MachineInstr* ARMLowOverheadLoops::ExpandLoopStart(LowOverheadLoop &LoLoop) { … }

void ARMLowOverheadLoops::ConvertVPTBlocks(LowOverheadLoop &LoLoop) { … }

void ARMLowOverheadLoops::Expand(LowOverheadLoop &LoLoop) { … }

bool ARMLowOverheadLoops::RevertNonLoops() { … }

FunctionPass *llvm::createARMLowOverheadLoopsPass() { … }
llvm/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp