//===-- ARMLowOverheadLoops.cpp - CodeGen Low-overhead Loops ---*- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /// \file /// Finalize v8.1-m low-overhead loops by converting the associated pseudo /// instructions into machine operations. /// The expectation is that the loop contains three pseudo instructions: /// - t2*LoopStart - placed in the preheader or pre-preheader. The do-loop /// form should be in the preheader, whereas the while form should be in the /// preheader's only predecessor. /// - t2LoopDec - placed within the loop body. /// - t2LoopEnd - the loop latch terminator. /// /// In addition to this, we also look for the presence of the VCTP instruction, /// which determines whether we can generate the tail-predicated low-overhead /// loop form. /// /// Assumptions and Dependencies: /// Low-overhead loops are constructed and executed using a setup instruction: /// DLS, WLS, DLSTP or WLSTP and an instruction that loops back: LE or LETP. /// WLS(TP) and LE(TP) are branching instructions with a (large) limited range /// but fixed polarity: WLS can only branch forwards and LE can only branch /// backwards. These restrictions mean that this pass is dependent upon block /// layout and block sizes, which is why it's the last pass to run. The same is /// true for ConstantIslands, but this pass does not increase the size of the /// basic blocks, nor does it change the CFG. Instructions are mainly removed /// during the transform and pseudo instructions are replaced by real ones. In /// some cases, when we have to revert to a 'normal' loop, we have to introduce /// multiple instructions for a single pseudo (see RevertWhile and /// RevertLoopEnd). 
To handle this situation, t2WhileLoopStartLR and t2LoopEnd /// are defined to be as large as this maximum sequence of replacement /// instructions. /// /// A note on VPR.P0 (the lane mask): /// VPT, VCMP, VPNOT and VCTP won't overwrite VPR.P0 when they update it in a /// "VPT Active" context (which includes low-overhead loops and vpt blocks). /// They will simply "and" the result of their calculation with the current /// value of VPR.P0. You can think of it like this: /// \verbatim /// if VPT active: ; Between a DLSTP/LETP, or for predicated instrs /// VPR.P0 &= Value /// else /// VPR.P0 = Value /// \endverbatim /// When we're inside the low-overhead loop (between DLSTP and LETP), we always /// fall in the "VPT active" case, so we can consider that any VPR write by /// one of those instructions is actually an "and". //===----------------------------------------------------------------------===// #include "ARM.h" #include "ARMBaseInstrInfo.h" #include "ARMBaseRegisterInfo.h" #include "ARMBasicBlockInfo.h" #include "ARMSubtarget.h" #include "MVETailPredUtils.h" #include "Thumb2InstrInfo.h" #include "llvm/ADT/SetOperations.h" #include "llvm/ADT/SetVector.h" #include "llvm/CodeGen/LivePhysRegs.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/MachineLoopUtils.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/ReachingDefAnalysis.h" #include "llvm/MC/MCInstrDesc.h" usingnamespacellvm; #define DEBUG_TYPE … #define ARM_LOW_OVERHEAD_LOOPS_NAME … static cl::opt<bool> DisableTailPredication("arm-loloops-disable-tailpred", cl::Hidden, cl::desc("Disable tail-predication in the ARM LowOverheadLoop pass"), cl::init(false)); static cl::opt<bool> DisableOmitDLS("arm-disable-omit-dls", cl::Hidden, cl::desc("Disable omitting 'dls lr, lr' instructions"), cl::init(false)); static bool isVectorPredicated(MachineInstr *MI) { … } 
static bool isVectorPredicate(MachineInstr *MI) { … } static bool hasVPRUse(MachineInstr &MI) { … } static bool isDomainMVE(MachineInstr *MI) { … } static int getVecSize(const MachineInstr &MI) { … } static bool shouldInspect(MachineInstr &MI) { … } static bool isHorizontalReduction(const MachineInstr &MI) { … } namespace { InstSet; class PostOrderLoopTraversal { … }; class VPTBlock { … }; // Represent the current state of the VPR and hold all instances which // represent a VPT block, which is a list of instructions that begins with a // VPT/VPST and has a maximum of four following instructions. All // instructions within the block are predicated upon the vpr and we allow // instructions to define the vpr within the block too. class VPTState { … }; struct LowOverheadLoop { … }; class ARMLowOverheadLoops : public MachineFunctionPass { … }; } char ARMLowOverheadLoops::ID = …; INITIALIZE_PASS(…) static bool TryRemove(MachineInstr *MI, ReachingDefAnalysis &RDA, InstSet &ToRemove, InstSet &Ignore) { … } bool LowOverheadLoop::ValidateTailPredicate() { … } static bool isRegInClass(const MachineOperand &MO, const TargetRegisterClass *Class) { … } // MVE 'narrowing' instructions operate on half a lane, reading from half and writing // to half, which are referred to as the top and bottom halves. The other // half retains its previous value. static bool retainsPreviousHalfElement(const MachineInstr &MI) { … } // Some MVE instructions read from the top/bottom halves of their operand(s) // and generate a vector result with result elements that are double the // width of the input. static bool producesDoubleWidthResult(const MachineInstr &MI) { … } // Can this instruction generate a non-zero result when given only zeroed // operands? This allows us to know that, given operands with false bytes // zeroed by masked loads, the result will also contain zeros in those // bytes. 
static bool canGenerateNonZeros(const MachineInstr &MI) { … } // Look at its register uses to see if it can only receive zeros // into its false lanes which would then produce zeros. Also check that // the output register is also defined by a FalseLanesZero instruction // so that if tail-predication happens, the lanes that aren't updated will // still be zeros. static bool producesFalseLanesZero(MachineInstr &MI, const TargetRegisterClass *QPRs, const ReachingDefAnalysis &RDA, InstSet &FalseLanesZero) { … } bool LowOverheadLoop::ValidateLiveOuts() { … } void LowOverheadLoop::Validate(ARMBasicBlockUtils *BBUtils) { … } bool LowOverheadLoop::AddVCTP(MachineInstr *MI) { … } static bool ValidateMVEStore(MachineInstr *MI, MachineLoop *ML) { … } bool LowOverheadLoop::ValidateMVEInst(MachineInstr *MI) { … } bool ARMLowOverheadLoops::runOnMachineFunction(MachineFunction &mf) { … } bool ARMLowOverheadLoops::ProcessLoop(MachineLoop *ML) { … } // WhileLoopStart holds the exit block, so produce a cmp lr, 0 and then a // beq that branches to the exit branch. // TODO: We could also try to generate a cbz if the value in LR is also in // another low register. void ARMLowOverheadLoops::RevertWhile(MachineInstr *MI) const { … } void ARMLowOverheadLoops::RevertDo(MachineInstr *MI) const { … } bool ARMLowOverheadLoops::RevertLoopDec(MachineInstr *MI) const { … } // Generate a subs, or sub and cmp, and a branch instead of an LE. void ARMLowOverheadLoops::RevertLoopEnd(MachineInstr *MI, bool SkipCmp) const { … } // Generate a subs, or sub and cmp, and a branch instead of an LE. void ARMLowOverheadLoops::RevertLoopEndDec(MachineInstr *MI) const { … } // Perform dead code elimination on the loop iteration count setup expression. // If we are tail-predicating, the number of elements to be processed is the // operand of the VCTP instruction in the vector body, see getCount(), which is // register $r3 in this example: // // $lr = big-itercount-expression // .. 
// $lr = t2DoLoopStart renamable $lr // vector.body: // .. // $vpr = MVE_VCTP32 renamable $r3 // renamable $lr = t2LoopDec killed renamable $lr, 1 // t2LoopEnd renamable $lr, %vector.body // tB %end // // What we would like to achieve here is to replace the do-loop start pseudo // instruction t2DoLoopStart with: // // $lr = MVE_DLSTP_32 killed renamable $r3 // // Thus, $r3, which defines the number of elements, is written to $lr, // and then we want to delete the whole chain that used to define $lr, // see the comment below for what this chain could look like. // void ARMLowOverheadLoops::IterationCountDCE(LowOverheadLoop &LoLoop) { … } MachineInstr* ARMLowOverheadLoops::ExpandLoopStart(LowOverheadLoop &LoLoop) { … } void ARMLowOverheadLoops::ConvertVPTBlocks(LowOverheadLoop &LoLoop) { … } void ARMLowOverheadLoops::Expand(LowOverheadLoop &LoLoop) { … } bool ARMLowOverheadLoops::RevertNonLoops() { … } FunctionPass *llvm::createARMLowOverheadLoopsPass() { … }