//===-- SILowerI1Copies.cpp - Lower I1 Copies -----------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // // This pass lowers all occurrences of i1 values (with a vreg_1 register class) // to lane masks (32 / 64-bit scalar registers). The pass assumes machine SSA // form and a wave-level control flow graph. // // Before this pass, values that are semantically i1 and are defined and used // within the same basic block are already represented as lane masks in scalar // registers. However, values that cross basic blocks are always transferred // between basic blocks in vreg_1 virtual registers and are lowered by this // pass. // // The only instructions that use or define vreg_1 virtual registers are COPY, // PHI, and IMPLICIT_DEF. // //===----------------------------------------------------------------------===// #include "SILowerI1Copies.h" #include "AMDGPU.h" #include "llvm/CodeGen/MachineSSAUpdater.h" #include "llvm/InitializePasses.h" #include "llvm/Target/CGPassBuilderOption.h" #define DEBUG_TYPE … usingnamespacellvm; static Register insertUndefLaneMask(MachineBasicBlock *MBB, MachineRegisterInfo *MRI, MachineRegisterInfo::VRegAttrs LaneMaskRegAttrs); namespace { class Vreg1LoweringHelper : public PhiLoweringHelper { … }; Vreg1LoweringHelper::Vreg1LoweringHelper(MachineFunction *MF, MachineDominatorTree *DT, MachinePostDominatorTree *PDT) : … { … } bool Vreg1LoweringHelper::cleanConstrainRegs(bool Changed) { … } /// Helper class that determines the relationship between incoming values of a /// phi in the control flow graph to determine where an incoming value can /// simply be taken as a scalar lane mask as-is, and where it needs to be /// merged with another, previously defined lane mask. 
/// /// The approach is as follows: /// - Determine all basic blocks which, starting from the incoming blocks, /// a wave may reach before entering the def block (the block containing the /// phi). /// - If an incoming block has no predecessors in this set, we can take the /// incoming value as a scalar lane mask as-is. /// -- A special case of this is when the def block has a self-loop. /// - Otherwise, the incoming value needs to be merged with a previously /// defined lane mask. /// - If there is a path into the set of reachable blocks that does _not_ go /// through an incoming block where we can take the scalar lane mask as-is, /// we need to invent an available value for the SSAUpdater. Choices are /// 0 and undef, with differing consequences for how to merge values etc. /// /// TODO: We could use region analysis to quickly skip over SESE regions during /// the traversal. /// class PhiIncomingAnalysis { … }; /// Helper class that detects loops which require us to lower an i1 COPY into /// bitwise manipulation. /// /// Unfortunately, we cannot use LoopInfo because LoopInfo does not distinguish /// between loops with the same header. Consider this example: /// /// A-+-+ /// | | | /// B-+ | /// | | /// C---+ /// /// A is the header of a loop containing A, B, and C as far as LoopInfo is /// concerned. However, an i1 COPY in B that is used in C must be lowered to /// bitwise operations to combine results from different loop iterations when /// B has a divergent branch (since by default we will compile this code such /// that threads in a wave are merged at the entry of C). /// /// The following rule is implemented to determine whether bitwise operations /// are required: use the bitwise lowering for a def in block B if a backward /// edge to B is reachable without going through the nearest common /// post-dominator of B and all uses of the def. 
/// /// TODO: This rule is conservative because it does not check whether the /// relevant branches are actually divergent. /// /// The class is designed to cache the CFG traversal so that it can be re-used /// for multiple defs within the same basic block. /// /// TODO: We could use region analysis to quickly skip over SESE regions during /// the traversal. /// class LoopFinder { … }; } // End anonymous namespace. Register llvm::createLaneMaskReg(MachineRegisterInfo *MRI, MachineRegisterInfo::VRegAttrs LaneMaskRegAttrs) { … } static Register insertUndefLaneMask(MachineBasicBlock *MBB, MachineRegisterInfo *MRI, MachineRegisterInfo::VRegAttrs LaneMaskRegAttrs) { … } #ifndef NDEBUG static bool isVRegCompatibleReg(const SIRegisterInfo &TRI, const MachineRegisterInfo &MRI, Register Reg) { unsigned Size = TRI.getRegSizeInBits(Reg, MRI); return Size == 1 || Size == 32; } #endif bool Vreg1LoweringHelper::lowerCopiesFromI1() { … } PhiLoweringHelper::PhiLoweringHelper(MachineFunction *MF, MachineDominatorTree *DT, MachinePostDominatorTree *PDT) : … { … } bool PhiLoweringHelper::lowerPhis() { … } bool Vreg1LoweringHelper::lowerCopiesToI1() { … } bool PhiLoweringHelper::isConstantLaneMask(Register Reg, bool &Val) const { … } static void instrDefsUsesSCC(const MachineInstr &MI, bool &Def, bool &Use) { … } /// Return a point at the end of the given \p MBB to insert SALU instructions /// for lane mask calculation. Take terminators and SCC into account. 
MachineBasicBlock::iterator
PhiLoweringHelper::getSaluInsertionAtEnd(MachineBasicBlock &MBB) const { … }

// VReg_1 -> SReg_32 or SReg_64
void Vreg1LoweringHelper::markAsLaneMask(Register DstReg) const { … }

// Collect the phi instructions this pass needs to lower into \p Vreg1Phis.
// (Body elided in this view.)
void Vreg1LoweringHelper::getCandidatesForLowering(
    SmallVectorImpl<MachineInstr *> &Vreg1Phis) const { … }

// Gather the (value, predecessor-block) incoming pairs of phi \p MI into
// \p Incomings. (Body elided in this view.)
void Vreg1LoweringHelper::collectIncomingValuesFromPhi(
    const MachineInstr *MI, SmallVectorImpl<Incoming> &Incomings) const { … }

void Vreg1LoweringHelper::replaceDstReg(Register NewReg, Register OldReg,
                                        MachineBasicBlock *MBB) { … }

// Emit the bitwise ops that merge the previous lane mask \p PrevReg with the
// current one \p CurReg into \p DstReg at insertion point \p I in \p MBB.
// (Body elided in this view.)
void Vreg1LoweringHelper::buildMergeLaneMasks(MachineBasicBlock &MBB,
                                              MachineBasicBlock::iterator I,
                                              const DebugLoc &DL,
                                              Register DstReg, Register PrevReg,
                                              Register CurReg) { … }

void Vreg1LoweringHelper::constrainAsLaneMask(Incoming &In) { … }

/// Lower all instructions that def or use vreg_1 registers.
///
/// In a first pass, we lower COPYs from vreg_1 to vector registers, as can
/// occur around inline assembly. We do this first, before vreg_1 registers
/// are changed to scalar mask registers.
///
/// Then we lower all defs of vreg_1 registers. Phi nodes are lowered before
/// all others, because phi lowering looks through copies and can therefore
/// often make copy lowering unnecessary.
// Shared driver used by both the new-PM pass and the legacy pass below.
// (Body elided in this view.)
static bool runFixI1Copies(MachineFunction &MF, MachineDominatorTree &MDT,
                           MachinePostDominatorTree &MPDT) { … }

// New pass-manager entry point.
PreservedAnalyses
SILowerI1CopiesPass::run(MachineFunction &MF,
                         MachineFunctionAnalysisManager &MFAM) { … }

// Legacy pass-manager wrapper around runFixI1Copies.
// NOTE: class body elided in this view of the file.
class SILowerI1CopiesLegacy : public MachineFunctionPass { … };

bool SILowerI1CopiesLegacy::runOnMachineFunction(MachineFunction &MF) { … }

// Legacy pass registration; the dependencies match the dominator analyses
// this pass queries.
INITIALIZE_PASS_BEGIN(SILowerI1CopiesLegacy, DEBUG_TYPE, "SI Lower i1 Copies",
                      false, false)
INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTreeWrapperPass)
INITIALIZE_PASS_END(SILowerI1CopiesLegacy, DEBUG_TYPE, "SI Lower i1 Copies",
                    false, false)

char SILowerI1CopiesLegacy::ID = …;

char &llvm::SILowerI1CopiesLegacyID = …;

FunctionPass *llvm::createSILowerI1CopiesLegacyPass() { … }