AMDGPUPromoteAlloca.cpp | Explore in Territory

//===-- AMDGPUPromoteAlloca.cpp - Promote Allocas -------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Eliminates allocas by either converting them into vectors or by migrating
// them to local address space.
//
// Two passes are exposed by this file:
//    - "promote-alloca-to-vector", which runs early in the pipeline and only
//      promotes to vector. Promotion to vector is almost always profitable
//      except when the alloca is too big and the promotion would result in
//      very high register pressure.
//    - "promote-alloca", which does both promotion to vector and LDS and runs
//      much later in the pipeline. This runs after SROA because promoting to
//      LDS is of course less profitable than getting rid of the alloca or
//      vectorizing it, thus we only want to do it when the only alternative is
//      lowering the alloca to stack.
//
// Note that both of them exist for the old and new PMs. The new PM passes are
// declared in AMDGPU.h and the legacy PM ones are declared here.s
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/Analysis/CaptureTracking.h"
#include "llvm/Analysis/InstSimplifyFolder.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/Utils/SSAUpdater.h"

#define DEBUG_TYPE …

usingnamespacellvm;

namespace {

static cl::opt<bool>
    DisablePromoteAllocaToVector("disable-promote-alloca-to-vector",
                                 cl::desc("Disable promote alloca to vector"),
                                 cl::init(false));

static cl::opt<bool>
    DisablePromoteAllocaToLDS("disable-promote-alloca-to-lds",
                              cl::desc("Disable promote alloca to LDS"),
                              cl::init(false));

static cl::opt<unsigned> PromoteAllocaToVectorLimit(
    "amdgpu-promote-alloca-to-vector-limit",
    cl::desc("Maximum byte size to consider promote alloca to vector"),
    cl::init(0));

static cl::opt<unsigned>
    LoopUserWeight("promote-alloca-vector-loop-user-weight",
                   cl::desc("The bonus weight of users of allocas within loop "
                            "when sorting profitable allocas"),
                   cl::init(4));

// Shared implementation which can do both promotion to vector and to LDS.
class AMDGPUPromoteAllocaImpl { … };

// FIXME: This can create globals so should be a module pass.
class AMDGPUPromoteAlloca : public FunctionPass { … };

class AMDGPUPromoteAllocaToVector : public FunctionPass { … };

unsigned getMaxVGPRs(const TargetMachine &TM, const Function &F) { … }

} // end anonymous namespace

char AMDGPUPromoteAlloca::ID = …;
char AMDGPUPromoteAllocaToVector::ID = …;

INITIALIZE_PASS_BEGIN(…)

INITIALIZE_PASS_BEGIN(AMDGPUPromoteAllocaToVector, DEBUG_TYPE "-to-vector",
                      "AMDGPU promote alloca to vector", false, false)
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
INITIALIZE_PASS_END(AMDGPUPromoteAllocaToVector, DEBUG_TYPE "-to-vector",
                    "AMDGPU promote alloca to vector", false, false)

char &llvm::AMDGPUPromoteAllocaID = …;
char &llvm::AMDGPUPromoteAllocaToVectorID = …;

PreservedAnalyses AMDGPUPromoteAllocaPass::run(Function &F,
                                               FunctionAnalysisManager &AM) { … }

PreservedAnalyses
AMDGPUPromoteAllocaToVectorPass::run(Function &F, FunctionAnalysisManager &AM) { … }

FunctionPass *llvm::createAMDGPUPromoteAlloca() { … }

FunctionPass *llvm::createAMDGPUPromoteAllocaToVector() { … }

static void collectAllocaUses(AllocaInst &Alloca,
                              SmallVectorImpl<Use *> &Uses) { … }

void AMDGPUPromoteAllocaImpl::sortAllocasToPromote(
    SmallVectorImpl<AllocaInst *> &Allocas) { … }

bool AMDGPUPromoteAllocaImpl::run(Function &F, bool PromoteToLDS) { … }

struct MemTransferInfo { … };

// Checks if the instruction I is a memset user of the alloca AI that we can
// deal with. Currently, only non-volatile memsets that affect the whole alloca
// are handled.
static bool isSupportedMemset(MemSetInst *I, AllocaInst *AI,
                              const DataLayout &DL) { … }

static Value *
calculateVectorIndex(Value *Ptr,
                     const std::map<GetElementPtrInst *, Value *> &GEPIdx) { … }

static Value *GEPToVectorIndex(GetElementPtrInst *GEP, AllocaInst *Alloca,
                               Type *VecElemTy, const DataLayout &DL) { … }

/// Promotes a single user of the alloca to a vector form.
///
/// \param Inst           Instruction to be promoted.
/// \param DL             Module Data Layout.
/// \param VectorTy       Vectorized Type.
/// \param VecStoreSize   Size of \p VectorTy in bytes.
/// \param ElementSize    Size of \p VectorTy element type in bytes.
/// \param TransferInfo   MemTransferInst info map.
/// \param GEPVectorIdx   GEP -> VectorIdx cache.
/// \param CurVal         Current value of the vector (e.g. last stored value)
/// \param[out]  DeferredLoads \p Inst is added to this vector if it can't
///              be promoted now. This happens when promoting requires \p
///              CurVal, but \p CurVal is nullptr.
/// \return the stored value if \p Inst would have written to the alloca, or
///         nullptr otherwise.
static Value *promoteAllocaUserToVector(
    Instruction *Inst, const DataLayout &DL, FixedVectorType *VectorTy,
    unsigned VecStoreSize, unsigned ElementSize,
    DenseMap<MemTransferInst *, MemTransferInfo> &TransferInfo,
    std::map<GetElementPtrInst *, Value *> &GEPVectorIdx, Value *CurVal,
    SmallVectorImpl<LoadInst *> &DeferredLoads) { … }

static bool isSupportedAccessType(FixedVectorType *VecTy, Type *AccessTy,
                                  const DataLayout &DL) { … }

/// Iterates over an instruction worklist that may contain multiple instructions
/// from the same basic block, but in a different order.
template <typename InstContainer>
static void forEachWorkListItem(const InstContainer &WorkList,
                                std::function<void(Instruction *)> Fn) { … }

// FIXME: Should try to pick the most likely to be profitable allocas first.
bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) { … }

std::pair<Value *, Value *>
AMDGPUPromoteAllocaImpl::getLocalSizeYZ(IRBuilder<> &Builder) { … }

Value *AMDGPUPromoteAllocaImpl::getWorkitemID(IRBuilder<> &Builder,
                                              unsigned N) { … }

static bool isCallPromotable(CallInst *CI) { … }

bool AMDGPUPromoteAllocaImpl::binaryOpIsDerivedFromSameAlloca(
    Value *BaseAlloca, Value *Val, Instruction *Inst, int OpIdx0,
    int OpIdx1) const { … }

bool AMDGPUPromoteAllocaImpl::collectUsesWithPtrTypes(
    Value *BaseAlloca, Value *Val, std::vector<Value *> &WorkList) const { … }

bool AMDGPUPromoteAllocaImpl::hasSufficientLocalMem(const Function &F) { … }

// FIXME: Should try to pick the most likely to be profitable allocas first.
bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToLDS(AllocaInst &I,
                                                    bool SufficientLDS) { … }
llvm/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp