//===-- SILowerI1Copies.cpp - Lower I1 Copies -----------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // // This pass lowers all occurrences of i1 values (with a vreg_1 register class) // to lane masks (32 / 64-bit scalar registers). The pass assumes machine SSA // form and a wave-level control flow graph. // // Before this pass, values that are semantically i1 and are defined and used // within the same basic block are already represented as lane masks in scalar // registers. However, values that cross basic blocks are always transferred // between basic blocks in vreg_1 virtual registers and are lowered by this // pass. // // The only instructions that use or define vreg_1 virtual registers are COPY, // PHI, and IMPLICIT_DEF. // //===----------------------------------------------------------------------===// #include "SILowerI1Copies.h" #include "AMDGPU.h" #include "llvm/CodeGen/MachineSSAUpdater.h" #include "llvm/InitializePasses.h" #include "llvm/Target/CGPassBuilderOption.h" #define DEBUG_TYPE … usingnamespacellvm; static Register insertUndefLaneMask(MachineBasicBlock *MBB, MachineRegisterInfo *MRI, MachineRegisterInfo::VRegAttrs LaneMaskRegAttrs); namespace { class Vreg1LoweringHelper : public PhiLoweringHelper { … }; Vreg1LoweringHelper::Vreg1LoweringHelper(MachineFunction *MF, MachineDominatorTree *DT, MachinePostDominatorTree *PDT) : … { … } bool Vreg1LoweringHelper::cleanConstrainRegs(bool Changed) { … } /// Helper class that determines the relationship between incoming values of a /// phi in the control flow graph to determine where an incoming value can /// simply be taken as a scalar lane mask as-is, and where it needs to be /// merged with another, previously defined lane mask. 
/// /// The approach is as follows: /// - Determine all basic blocks which, starting from the incoming blocks, /// a wave may reach before entering the def block (the block containing the /// phi). /// - If an incoming block has no predecessors in this set, we can take the /// incoming value as a scalar lane mask as-is. /// -- A special case of this is when the def block has a self-loop. /// - Otherwise, the incoming value needs to be merged with a previously /// defined lane mask. /// - If there is a path into the set of reachable blocks that does _not_ go /// through an incoming block where we can take the scalar lane mask as-is, /// we need to invent an available value for the SSAUpdater. Choices are /// 0 and undef, with differing consequences for how to merge values etc. /// /// TODO: We could use region analysis to quickly skip over SESE regions during /// the traversal. /// class PhiIncomingAnalysis { … }; /// Helper class that detects loops which require us to lower an i1 COPY into /// bitwise manipulation. /// /// Unfortunately, we cannot use LoopInfo because LoopInfo does not distinguish /// between loops with the same header. Consider this example: /// /// A-+-+ /// | | | /// B-+ | /// | | /// C---+ /// /// A is the header of a loop containing A, B, and C as far as LoopInfo is /// concerned. However, an i1 COPY in B that is used in C must be lowered to /// bitwise operations to combine results from different loop iterations when /// B has a divergent branch (since by default we will compile this code such /// that threads in a wave are merged at the entry of C). /// /// The following rule is implemented to determine whether bitwise operations /// are required: use the bitwise lowering for a def in block B if a backward /// edge to B is reachable without going through the nearest common /// post-dominator of B and all uses of the def. 
/// /// TODO: This rule is conservative because it does not check whether the /// relevant branches are actually divergent. /// /// The class is designed to cache the CFG traversal so that it can be re-used /// for multiple defs within the same basic block. /// /// TODO: We could use region analysis to quickly skip over SESE regions during /// the traversal. /// class LoopFinder { … }; } // End anonymous namespace. Register llvm::createLaneMaskReg(MachineRegisterInfo *MRI, MachineRegisterInfo::VRegAttrs LaneMaskRegAttrs) { … } static Register insertUndefLaneMask(MachineBasicBlock *MBB, MachineRegisterInfo *MRI, MachineRegisterInfo::VRegAttrs LaneMaskRegAttrs) { … } #ifndef NDEBUG static bool isVRegCompatibleReg(const SIRegisterInfo &TRI, const MachineRegisterInfo &MRI, Register Reg) { unsigned Size = TRI.getRegSizeInBits(Reg, MRI); return Size == 1 || Size == 32; } #endif bool Vreg1LoweringHelper::lowerCopiesFromI1() { … } PhiLoweringHelper::PhiLoweringHelper(MachineFunction *MF, MachineDominatorTree *DT, MachinePostDominatorTree *PDT) : … { … } bool PhiLoweringHelper::lowerPhis() { … } bool Vreg1LoweringHelper::lowerCopiesToI1() { … } bool PhiLoweringHelper::isConstantLaneMask(Register Reg, bool &Val) const { … } static void instrDefsUsesSCC(const MachineInstr &MI, bool &Def, bool &Use) { … } /// Return a point at the end of the given \p MBB to insert SALU instructions /// for lane mask calculation. Take terminators and SCC into account. 
MachineBasicBlock::iterator
PhiLoweringHelper::getSaluInsertionAtEnd(MachineBasicBlock &MBB) const { … }

// VReg_1 -> SReg_32 or SReg_64
void Vreg1LoweringHelper::markAsLaneMask(Register DstReg) const { … }

// Collect the phi instructions this pass needs to lower into \p Vreg1Phis.
// (Body elided in this view.)
void Vreg1LoweringHelper::getCandidatesForLowering(
    SmallVectorImpl<MachineInstr *> &Vreg1Phis) const { … }

// Gather the (value, predecessor-block) incoming pairs of phi \p MI into
// \p Incomings. (Body elided in this view.)
void Vreg1LoweringHelper::collectIncomingValuesFromPhi(
    const MachineInstr *MI, SmallVectorImpl<Incoming> &Incomings) const { … }

void Vreg1LoweringHelper::replaceDstReg(Register NewReg, Register OldReg,
                                        MachineBasicBlock *MBB) { … }

// Emit the bitwise ops that merge the previous lane mask \p PrevReg with the
// current one \p CurReg into \p DstReg at insertion point \p I in \p MBB.
// (Body elided in this view.)
void Vreg1LoweringHelper::buildMergeLaneMasks(MachineBasicBlock &MBB,
                                              MachineBasicBlock::iterator I,
                                              const DebugLoc &DL,
                                              Register DstReg, Register PrevReg,
                                              Register CurReg) { … }

void Vreg1LoweringHelper::constrainAsLaneMask(Incoming &In) { … }

/// Lower all instructions that def or use vreg_1 registers.
///
/// In a first pass, we lower COPYs from vreg_1 to vector registers, as can
/// occur around inline assembly. We do this first, before vreg_1 registers
/// are changed to scalar mask registers.
///
/// Then we lower all defs of vreg_1 registers. Phi nodes are lowered before
/// all others, because phi lowering looks through copies and can therefore
/// often make copy lowering unnecessary.
// Shared driver used by both the new-PM pass and the legacy pass below.
// (Body elided in this view.)
static bool runFixI1Copies(MachineFunction &MF, MachineDominatorTree &MDT,
                           MachinePostDominatorTree &MPDT) { … }

// New pass-manager entry point.
PreservedAnalyses
SILowerI1CopiesPass::run(MachineFunction &MF,
                         MachineFunctionAnalysisManager &MFAM) { … }

// Legacy pass-manager wrapper around runFixI1Copies.
// NOTE: class body elided in this view of the file.
class SILowerI1CopiesLegacy : public MachineFunctionPass { … };

bool SILowerI1CopiesLegacy::runOnMachineFunction(MachineFunction &MF) { … }

// Legacy pass registration; the dependencies match the dominator analyses
// this pass queries.
INITIALIZE_PASS_BEGIN(SILowerI1CopiesLegacy, DEBUG_TYPE, "SI Lower i1 Copies",
                      false, false)
INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTreeWrapperPass)
INITIALIZE_PASS_END(SILowerI1CopiesLegacy, DEBUG_TYPE, "SI Lower i1 Copies",
                    false, false)

char SILowerI1CopiesLegacy::ID = …;

char &llvm::SILowerI1CopiesLegacyID = …;

FunctionPass *llvm::createSILowerI1CopiesLegacyPass() { … }