//===-- MVETPAndVPTOptimisationsPass.cpp ----------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file This pass does a few optimisations related to Tail predicated loops
/// and MVE VPT blocks before register allocation is performed. For VPT blocks
/// the goal is to maximize the sizes of the blocks that will be created by the
/// MVE VPT Block Insertion pass (which runs after register allocation). For
/// tail predicated loops we transform the loop into something that will
/// hopefully make the backend ARMLowOverheadLoops pass's job easier.
///
//===----------------------------------------------------------------------===//

#include "ARM.h"
#include "ARMSubtarget.h"
#include "MCTargetDesc/ARMBaseInfo.h"
#include "MVETailPredUtils.h"
#include "Thumb2InstrInfo.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/InitializePasses.h"
#include "llvm/Support/Debug.h"
#include <cassert>

using namespace llvm;

#define DEBUG_TYPE …

static cl::opt<bool>
    MergeEndDec("arm-enable-merge-loopenddec", cl::Hidden,
                cl::desc("Enable merging Loop End and Dec instructions."),
                cl::init(true));

static cl::opt<bool>
    SetLRPredicate("arm-set-lr-predicate", cl::Hidden,
                   cl::desc("Enable setting lr as a predicate in tail predication regions."),
                   cl::init(true));

namespace {
class MVETPAndVPTOptimisations : public MachineFunctionPass { … };

char MVETPAndVPTOptimisations::ID = …;

} // end anonymous namespace

INITIALIZE_PASS_BEGIN(MVETPAndVPTOptimisations, DEBUG_TYPE,
                      "ARM MVE TailPred and VPT Optimisations pass", false,
                      false)
INITIALIZE_PASS_DEPENDENCY(MachineLoopInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
INITIALIZE_PASS_END(MVETPAndVPTOptimisations, DEBUG_TYPE,
                    "ARM MVE TailPred and VPT Optimisations pass", false, false)

static MachineInstr *LookThroughCOPY(MachineInstr *MI,
                                     MachineRegisterInfo *MRI) { … }

// Given a loop ML, this attempts to find the t2LoopEnd, t2LoopDec and
// corresponding PHI that make up a low overhead loop. Only handles 'do' loops
// at the moment, returning a t2DoLoopStart in LoopStart.
static bool findLoopComponents(MachineLoop *ML, MachineRegisterInfo *MRI,
                               MachineInstr *&LoopStart, MachineInstr *&LoopPhi,
                               MachineInstr *&LoopDec,
                               MachineInstr *&LoopEnd) { … }

static void RevertWhileLoopSetup(MachineInstr *MI,
                                 const TargetInstrInfo *TII) { … }

// The Hardware Loop insertion and ISel Lowering produce the pseudos for the
// start of a while loop:
//   %a:gprlr = t2WhileLoopSetup %Cnt
//   t2WhileLoopStart %a, %BB
// We want to convert those to a single instruction which, like t2LoopEndDec
// and t2DoLoopStartTP, is both a terminator and produces a value:
//   %a:gprlr = t2WhileLoopStartLR %Cnt, %BB
//
// Otherwise, if we can't, we revert the loop. t2WhileLoopSetup and
// t2WhileLoopStart are not valid past regalloc.
bool MVETPAndVPTOptimisations::LowerWhileLoopStart(MachineLoop *ML) { … }
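
// A minimal sketch (hypothetical helper, not used by this pass) of the fusion
// described above, assuming t2WhileLoopStartLR takes the operands shown in the
// comment (loop count and branch target). The real LowerWhileLoopStart also
// checks that LR is usable in the loop and reverts the loop when fusion is not
// possible.
[[maybe_unused]] static MachineInstr *
fuseWhileLoopStartSketch(MachineInstr *Setup, MachineInstr *Start,
                         const TargetInstrInfo *TII) {
  MachineBasicBlock *MBB = Start->getParent();
  // Build %a:gprlr = t2WhileLoopStartLR %Cnt, %BB in front of the old
  // t2WhileLoopStart terminator, reusing the operands of the two pseudos.
  MachineInstr *NewStart =
      BuildMI(*MBB, *Start, Start->getDebugLoc(),
              TII->get(ARM::t2WhileLoopStartLR))
          .add(Setup->getOperand(0))  // def: the gprlr loop counter
          .add(Setup->getOperand(1))  // the trip count input
          .add(Start->getOperand(1)); // the branch target block
  Setup->eraseFromParent();
  Start->eraseFromParent();
  return NewStart;
}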
// Return true if this instruction is invalid in a low overhead loop, usually
// because it clobbers LR.
static bool IsInvalidTPInstruction(MachineInstr &MI) { … }

// Starting from PreHeader, search for invalid instructions back until the
// LoopStart block is reached. If invalid instructions are found, the loop
// start is reverted from a WhileLoopStart to a DoLoopStart on the same loop.
// Will return the new DLS LoopStart if updated.
MachineInstr *MVETPAndVPTOptimisations::CheckForLRUseInPredecessors(
    MachineBasicBlock *PreHeader, MachineInstr *LoopStart) { … }

// This function converts loops with t2LoopDec and t2LoopEnd instructions into
// a single t2LoopEndDec instruction. To do that it needs to make sure that LR
// will be valid to be used for the low overhead loop, which means nothing else
// is using LR (especially calls) and there are no superfluous copies in the
// loop. The t2LoopEndDec is a branching terminator that produces a value (the
// decrement) around the loop edge, which means we need to be careful that it
// will be valid to allocate without any spilling.
bool MVETPAndVPTOptimisations::MergeLoopEnd(MachineLoop *ML) { … }

// Convert t2DoLoopStart to t2DoLoopStartTP if the loop contains VCTP
// instructions. This keeps the VCTP count reg operand on the t2DoLoopStartTP
// instruction, making the backend ARMLowOverheadLoops pass's job of finding
// the VCTP operand much simpler.
bool MVETPAndVPTOptimisations::ConvertTailPredLoop(MachineLoop *ML,
                                                   MachineDominatorTree *DT) { … }

// Returns true if Opcode is any VCMP Opcode.
static bool IsVCMP(unsigned Opcode) { … }

// Returns true if a VCMP with this Opcode can have its operands swapped.
// There are two kinds of VCMP that can't have their operands swapped: Float
// VCMPs, and VCMPr instructions (since the r is always on the right).
static bool CanHaveSwappedOperands(unsigned Opcode) { … }

// Returns the CondCode of a VCMP Instruction.
static ARMCC::CondCodes GetCondCode(MachineInstr &Instr) { … }

// Returns true if Cond is equivalent to a VPNOT instruction on the result of
// Prev. Cond and Prev must be VCMPs.
static bool IsVPNOTEquivalent(MachineInstr &Cond, MachineInstr &Prev) { … }

// Returns true if Instr writes to VCCR.
static bool IsWritingToVCCR(MachineInstr &Instr) { … }

// Transforms
//    <Instr that uses %A ('User' Operand)>
// Into
//    %K = VPNOT %Target
//    <Instr that uses %K ('User' Operand)>
// And returns the newly inserted VPNOT.
// This optimization is done in the hopes of preventing spills/reloads of VPR by
// reducing the number of VCCR values with overlapping lifetimes.
MachineInstr &MVETPAndVPTOptimisations::ReplaceRegisterUseWithVPNOT(
    MachineBasicBlock &MBB, MachineInstr &Instr, MachineOperand &User,
    Register Target) { … }

// Moves a VPNOT before its first user if an instruction that uses Reg is found
// in-between the VPNOT and its user.
// Returns true if there is at least one user of the VPNOT in the block.
static bool MoveVPNOTBeforeFirstUser(MachineBasicBlock &MBB,
                                     MachineBasicBlock::iterator Iter,
                                     Register Reg) { … }

// This optimisation attempts to reduce the number of overlapping lifetimes of
// VCCR values by replacing uses of old VCCR values with VPNOTs. For example,
// this replaces
//    %A:vccr = (something)
//    %B:vccr = VPNOT %A
//    %Foo = (some op that uses %B)
//    %Bar = (some op that uses %A)
// With
//    %A:vccr = (something)
//    %B:vccr = VPNOT %A
//    %Foo = (some op that uses %B)
//    %TMP2:vccr = VPNOT %B
//    %Bar = (some op that uses %TMP2)
bool MVETPAndVPTOptimisations::ReduceOldVCCRValueUses(MachineBasicBlock &MBB) { … }
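
// A minimal sketch (hypothetical helper, not used by this pass) of the core
// step behind the VCCR rewrites above: materialise a VPNOT of a newer VCCR
// value so an older, equivalent value does not have to stay live. The real
// ReplaceRegisterUseWithVPNOT also rewrites the user's operand and maintains
// kill flags; addUnpredicatedMveVpredNOp supplies the default vpred operands.
[[maybe_unused]] static Register
insertVPNOTSketch(MachineBasicBlock &MBB, MachineInstr &InsertBefore,
                  Register Src, MachineRegisterInfo &MRI,
                  const TargetInstrInfo *TII) {
  Register Dst = MRI.createVirtualRegister(&ARM::VCCRRegClass);
  MachineInstrBuilder MIB =
      BuildMI(MBB, InsertBefore, InsertBefore.getDebugLoc(),
              TII->get(ARM::MVE_VPNOT), Dst)
          .addReg(Src);
  addUnpredicatedMveVpredNOp(MIB); // unpredicated: ARMVCC::None, no mask reg
  return Dst;
}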
// This optimisation replaces VCMPs with VPNOTs when they are equivalent.
bool MVETPAndVPTOptimisations::ReplaceVCMPsByVPNOTs(MachineBasicBlock &MBB) { … }

bool MVETPAndVPTOptimisations::ReplaceConstByVPNOTs(MachineBasicBlock &MBB,
                                                    MachineDominatorTree *DT) { … }

// Replace VPSEL with a predicated VMOV in blocks with a VCTP. This is a
// somewhat blunt approximation to allow tail predication with vpsel
// instructions. We turn a vselect into a VPSEL in ISEL, but they have slightly
// different semantics under tail predication. Until that is modelled we just
// convert to a VMOVT (via a predicated VORR) instead.
bool MVETPAndVPTOptimisations::ConvertVPSEL(MachineBasicBlock &MBB) { … }

// Add a register allocation hint for t2DoLoopStart to hint it towards LR, as
// the instruction may be removable as a noop.
bool MVETPAndVPTOptimisations::HintDoLoopStartReg(MachineBasicBlock &MBB) { … }

bool MVETPAndVPTOptimisations::runOnMachineFunction(MachineFunction &Fn) { … }

/// createMVETPAndVPTOptimisationsPass
FunctionPass *llvm::createMVETPAndVPTOptimisationsPass() { … }
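
// Usage note (assumption, for illustration only): like other createXPass
// factories, this is consumed by the ARM target's pass configuration, which is
// expected to schedule the pass before register allocation, e.g.
//   addPass(createMVETPAndVPTOptimisationsPass());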