llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp

//===-- AMDGPUCodeGenPrepare.cpp ------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass does misc. AMDGPU optimizations on IR before instruction
/// selection.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUTargetMachine.h"
#include "SIModeRegisterDefaults.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/UniformityAnalysis.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Transforms/Utils/IntegerDivision.h"
#include "llvm/Transforms/Utils/Local.h"

#define DEBUG_TYPE "amdgpu-codegenprepare"

using namespace llvm;
using namespace llvm::PatternMatch;

namespace {

static cl::opt<bool> WidenLoads(
  "amdgpu-codegenprepare-widen-constant-loads",
  cl::desc("Widen sub-dword constant address space loads in AMDGPUCodeGenPrepare"),
  cl::ReallyHidden,
  cl::init(false));

static cl::opt<bool> Widen16BitOps(
  "amdgpu-codegenprepare-widen-16-bit-ops",
  cl::desc("Widen uniform 16-bit instructions to 32-bit in AMDGPUCodeGenPrepare"),
  cl::ReallyHidden,
  cl::init(true));

static cl::opt<bool>
    BreakLargePHIs("amdgpu-codegenprepare-break-large-phis",
                   cl::desc("Break large PHI nodes for DAGISel"),
                   cl::ReallyHidden, cl::init(true));

static cl::opt<bool>
    ForceBreakLargePHIs("amdgpu-codegenprepare-force-break-large-phis",
                        cl::desc("For testing purposes, always break large "
                                 "PHIs even if it isn't profitable."),
                        cl::ReallyHidden, cl::init(false));

static cl::opt<unsigned> BreakLargePHIsThreshold(
    "amdgpu-codegenprepare-break-large-phis-threshold",
    cl::desc("Minimum type size in bits for breaking large PHI nodes"),
    cl::ReallyHidden, cl::init(32));

static cl::opt<bool> UseMul24Intrin(
  "amdgpu-codegenprepare-mul24",
  cl::desc("Introduce mul24 intrinsics in AMDGPUCodeGenPrepare"),
  cl::ReallyHidden,
  cl::init(true));

// Legalize 64-bit division by using the generic IR expansion.
static cl::opt<bool> ExpandDiv64InIR(
  "amdgpu-codegenprepare-expand-div64",
  cl::desc("Expand 64-bit division in AMDGPUCodeGenPrepare"),
  cl::ReallyHidden,
  cl::init(false));

// Leave all division operations as they are. This supersedes ExpandDiv64InIR
// and is used for testing the legalizer.
static cl::opt<bool> DisableIDivExpand(
  "amdgpu-codegenprepare-disable-idiv-expansion",
  cl::desc("Prevent expanding integer division in AMDGPUCodeGenPrepare"),
  cl::ReallyHidden,
  cl::init(false));

// Disable processing of fdiv so we can better test the backend implementations.
static cl::opt<bool> DisableFDivExpand(
  "amdgpu-codegenprepare-disable-fdiv-expansion",
  cl::desc("Prevent expanding floating point division in AMDGPUCodeGenPrepare"),
  cl::ReallyHidden,
  cl::init(false));

class AMDGPUCodeGenPrepareImpl
    : public InstVisitor<AMDGPUCodeGenPrepareImpl, bool> {};

class AMDGPUCodeGenPrepare : public FunctionPass {};

} // end anonymous namespace

bool AMDGPUCodeGenPrepareImpl::run(Function &F) {}

unsigned AMDGPUCodeGenPrepareImpl::getBaseElementBitWidth(const Type *T) const {}

Type *AMDGPUCodeGenPrepareImpl::getI32Ty(IRBuilder<> &B, const Type *T) const {}

bool AMDGPUCodeGenPrepareImpl::isSigned(const BinaryOperator &I) const {}

bool AMDGPUCodeGenPrepareImpl::isSigned(const SelectInst &I) const {}

bool AMDGPUCodeGenPrepareImpl::needsPromotionToI32(const Type *T) const {}

bool AMDGPUCodeGenPrepareImpl::isLegalFloatingTy(const Type *Ty) const {}

// Return true if the op promoted to i32 should have nsw set.
static bool promotedOpIsNSW(const Instruction &I) {}

// Return true if the op promoted to i32 should have nuw set.
static bool promotedOpIsNUW(const Instruction &I) {}

bool AMDGPUCodeGenPrepareImpl::canWidenScalarExtLoad(LoadInst &I) const {}

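// Illustrative sketch (not necessarily the exact IR this pass emits): with
// Widen16BitOps enabled, a uniform 16-bit operation such as
//   %r = add i16 %a, %b
// is promoted to 32 bits roughly as
//   %a32 = sext i16 %a to i32
//   %b32 = sext i16 %b to i32
//   %r32 = add nsw i32 %a32, %b32
//   %r   = trunc i32 %r32 to i16
// where sext vs. zext and the nsw/nuw flags follow isSigned(),
// promotedOpIsNSW() and promotedOpIsNUW() above.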
bool AMDGPUCodeGenPrepareImpl::promoteUniformOpToI32(BinaryOperator &I) const {}

bool AMDGPUCodeGenPrepareImpl::promoteUniformOpToI32(ICmpInst &I) const {}

bool AMDGPUCodeGenPrepareImpl::promoteUniformOpToI32(SelectInst &I) const {}

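// Illustrative sketch: a uniform 16-bit bitreverse can be widened by reversing
// in 32 bits and shifting the result back down, roughly
//   %r = call i16 @llvm.bitreverse.i16(i16 %x)
//   -->
//   %x32 = zext i16 %x to i32
//   %r32 = call i32 @llvm.bitreverse.i32(i32 %x32)
//   %sh  = lshr i32 %r32, 16
//   %r   = trunc i32 %sh to i16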
bool AMDGPUCodeGenPrepareImpl::promoteUniformBitreverseToI32(
    IntrinsicInst &I) const {}

unsigned AMDGPUCodeGenPrepareImpl::numBitsUnsigned(Value *Op) const {}

unsigned AMDGPUCodeGenPrepareImpl::numBitsSigned(Value *Op) const {}

static void extractValues(IRBuilder<> &Builder,
                          SmallVectorImpl<Value *> &Values, Value *V) {}

static Value *insertValues(IRBuilder<> &Builder,
                           Type *Ty,
                           SmallVectorImpl<Value *> &Values) {}

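// Illustrative sketch: when numBitsUnsigned() shows that both operands of a
// 32-bit multiply fit in 24 bits, the mul can use the 24-bit hardware path,
// roughly
//   %m = mul i32 %a, %b
//   -->
//   %m = call i32 @llvm.amdgcn.mul.u24(i32 %a, i32 %b)
// with @llvm.amdgcn.mul.i24 used for the signed case (numBitsSigned()).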
bool AMDGPUCodeGenPrepareImpl::replaceMulWithMul24(BinaryOperator &I) const {}

// Find a select instruction, looking through an optional cast. This is mostly
// to deal with cases where i16 selects were promoted to i32 here.
static SelectInst *findSelectThroughCast(Value *V, CastInst *&Cast) {}

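// Illustrative sketch: a binary operator whose other operand is a constant and
// whose select operand has constant arms can be folded into the select, e.g.
//   %s = select i1 %c, i32 4, i32 8
//   %r = mul i32 %s, 3
//   -->
//   %r = select i1 %c, i32 12, i32 24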
bool AMDGPUCodeGenPrepareImpl::foldBinOpIntoSelect(BinaryOperator &BO) const {}

std::pair<Value *, Value *>
AMDGPUCodeGenPrepareImpl::getFrexpResults(IRBuilder<> &Builder,
                                          Value *Src) const {}

/// Emit an expansion of 1.0 / Src good for 1ulp that supports denormals.
Value *AMDGPUCodeGenPrepareImpl::emitRcpIEEE1ULP(IRBuilder<> &Builder,
                                                 Value *Src,
                                                 bool IsNegative) const {}

/// Emit a 2ulp expansion for fdiv by using frexp for input scaling.
Value *AMDGPUCodeGenPrepareImpl::emitFrexpDiv(IRBuilder<> &Builder, Value *LHS,
                                              Value *RHS,
                                              FastMathFlags FMF) const {}

/// Emit a sqrt that handles denormals and is accurate to 2ulp.
Value *AMDGPUCodeGenPrepareImpl::emitSqrtIEEE2ULP(IRBuilder<> &Builder,
                                                  Value *Src,
                                                  FastMathFlags FMF) const {}

/// Emit an expansion of 1.0 / sqrt(Src) good for 1ulp that supports denormals.
static Value *emitRsqIEEE1ULP(IRBuilder<> &Builder, Value *Src,
                              bool IsNegative) {}

bool AMDGPUCodeGenPrepareImpl::canOptimizeWithRsq(const FPMathOperator *SqrtOp,
                                                  FastMathFlags DivFMF,
                                                  FastMathFlags SqrtFMF) const {}

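// Illustrative sketch: when the fast math flags on both the fdiv and the sqrt
// permit it, the pair
//   %s = call contract afn float @llvm.sqrt.f32(float %x)
//   %r = fdiv contract afn float 1.0, %s
// can collapse to a single hardware reciprocal square root, roughly
//   %r = call contract afn float @llvm.amdgcn.rsq.f32(float %x)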
Value *AMDGPUCodeGenPrepareImpl::optimizeWithRsq(
    IRBuilder<> &Builder, Value *Num, Value *Den, const FastMathFlags DivFMF,
    const FastMathFlags SqrtFMF, const Instruction *CtxI) const {}

// Optimize fdiv with rcp:
//
// 1/x -> rcp(x) when rcp is sufficiently accurate, or when an inaccurate rcp
//               is allowed with unsafe-fp-math or afn.
//
// a/b -> a*rcp(b) when arcp is allowed and 1.0 ULP of accuracy is sufficient.
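//
// Illustrative sketch (assuming 'afn' is present on the fdiv):
//   %r = fdiv afn float 1.0, %x
//   -->
//   %r = call afn float @llvm.amdgcn.rcp.f32(float %x)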
Value *
AMDGPUCodeGenPrepareImpl::optimizeWithRcp(IRBuilder<> &Builder, Value *Num,
                                          Value *Den, FastMathFlags FMF,
                                          const Instruction *CtxI) const {}

// Optimize with fdiv.fast:
//
// a/b -> fdiv.fast(a, b) when !fpmath >= 2.5ulp with denormals flushed.
//
// 1/x -> fdiv.fast(1,x)  when !fpmath >= 2.5ulp.
//
// NOTE: optimizeWithRcp should be tried first because rcp is the preference.
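//
// Illustrative sketch (assuming denormals are flushed and !fpmath allows
// >= 2.5 ulp):
//   %r = fdiv float %a, %b, !fpmath !0        ; !0 = !{float 2.500000e+00}
//   -->
//   %r = call float @llvm.amdgcn.fdiv.fast(float %a, float %b)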
Value *AMDGPUCodeGenPrepareImpl::optimizeWithFDivFast(
    IRBuilder<> &Builder, Value *Num, Value *Den, float ReqdAccuracy) const {}

Value *AMDGPUCodeGenPrepareImpl::visitFDivElement(
    IRBuilder<> &Builder, Value *Num, Value *Den, FastMathFlags DivFMF,
    FastMathFlags SqrtFMF, Value *RsqOp, const Instruction *FDivInst,
    float ReqdDivAccuracy) const {}

// The optimization is performed based on fpmath, fast math flags, and the
// denormal mode, replacing fdiv with either rcp or fdiv.fast.
//
// With rcp:
//   1/x -> rcp(x) when rcp is sufficiently accurate, or when an inaccurate
//                 rcp is allowed with unsafe-fp-math or afn.
//
//   a/b -> a*rcp(b) when an inaccurate rcp is allowed with unsafe-fp-math or
//          afn.
//
// With fdiv.fast:
//   a/b -> fdiv.fast(a, b) when !fpmath >= 2.5ulp with denormals flushed.
//
//   1/x -> fdiv.fast(1,x)  when !fpmath >= 2.5ulp.
//
// NOTE: rcp is the preference in cases that both are legal.
bool AMDGPUCodeGenPrepareImpl::visitFDiv(BinaryOperator &FDiv) {}

static bool hasUnsafeFPMath(const Function &F) {}

static std::pair<Value*, Value*> getMul64(IRBuilder<> &Builder,
                                          Value *LHS, Value *RHS) {}

static Value* getMulHu(IRBuilder<> &Builder, Value *LHS, Value *RHS) {}

/// Figure out how many bits are really needed for this division. \p AtLeast is
/// an optimization hint to bypass the second ComputeNumSignBits call if the
/// first one is insufficient. Returns -1 on failure.
int AMDGPUCodeGenPrepareImpl::getDivNumBits(BinaryOperator &I, Value *Num,
                                            Value *Den, unsigned AtLeast,
                                            bool IsSigned) const {}

// The fractional part of a float is enough to accurately represent up to
// a 24-bit signed integer.
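// (A float's 23-bit mantissa plus the implicit leading bit gives 24 bits of
// precision, so every integer with magnitude below 2^24 is exact in float.)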
Value *AMDGPUCodeGenPrepareImpl::expandDivRem24(IRBuilder<> &Builder,
                                                BinaryOperator &I, Value *Num,
                                                Value *Den, bool IsDiv,
                                                bool IsSigned) const {}

Value *AMDGPUCodeGenPrepareImpl::expandDivRem24Impl(
    IRBuilder<> &Builder, BinaryOperator &I, Value *Num, Value *Den,
    unsigned DivBits, bool IsDiv, bool IsSigned) const {}

// Try to recognize special cases for which the DAG will emit special, better
// expansions than the general expansion we do here.

// TODO: It would be better to just directly handle those optimizations here.
bool AMDGPUCodeGenPrepareImpl::divHasSpecialOptimization(BinaryOperator &I,
                                                         Value *Num,
                                                         Value *Den) const {}

static Value *getSign32(Value *V, IRBuilder<> &Builder, const DataLayout *DL) {}

Value *AMDGPUCodeGenPrepareImpl::expandDivRem32(IRBuilder<> &Builder,
                                                BinaryOperator &I, Value *X,
                                                Value *Y) const {}

Value *AMDGPUCodeGenPrepareImpl::shrinkDivRem64(IRBuilder<> &Builder,
                                                BinaryOperator &I, Value *Num,
                                                Value *Den) const {}

void AMDGPUCodeGenPrepareImpl::expandDivRem64(BinaryOperator &I) const {}

bool AMDGPUCodeGenPrepareImpl::visitBinaryOperator(BinaryOperator &I) {}

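// Illustrative sketch (WidenLoads): a sub-dword load from the constant address
// space with sufficient alignment, e.g.
//   %v = load i8, ptr addrspace(4) %p, align 4
// can be widened to a full dword load plus a truncate, roughly
//   %w = load i32, ptr addrspace(4) %p, align 4
//   %v = trunc i32 %w to i8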
bool AMDGPUCodeGenPrepareImpl::visitLoadInst(LoadInst &I) {}

bool AMDGPUCodeGenPrepareImpl::visitICmpInst(ICmpInst &I) {}

bool AMDGPUCodeGenPrepareImpl::visitSelectInst(SelectInst &I) {}

static bool areInSameBB(const Value *A, const Value *B) {}

// Helper for breaking large PHIs that returns true when an extractelement on V
// is likely to be folded away by the DAG combiner.
static bool isInterestingPHIIncomingValue(const Value *V) {}

static void collectPHINodes(const PHINode &I,
                            SmallPtrSet<const PHINode *, 8> &SeenPHIs) {}

bool AMDGPUCodeGenPrepareImpl::canBreakPHINode(const PHINode &I) {}

/// Helper class for "break large PHIs" (visitPHINode).
///
/// This represents a slice of a PHI's incoming value, which is made up of:
///   - The type of the slice (Ty)
///   - The index in the incoming value's vector where the slice starts (Idx)
///   - The number of elements in the slice (NumElts).
/// It also keeps track of the NewPHI node inserted for this particular slice.
///
/// Slice examples:
///   <4 x i64> -> Split into four i64 slices.
///     -> [i64, 0, 1], [i64, 1, 1], [i64, 2, 1], [i64, 3, 1]
///   <5 x i16> -> Split into 2 <2 x i16> slices + an i16 tail.
///     -> [<2 x i16>, 0, 2], [<2 x i16>, 2, 2], [i16, 4, 1]
class VectorSlice {};

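// Illustrative sketch of the rewrite for a <4 x i64> PHI: each slice gets its
// own PHI, incoming values are split in the predecessor blocks, and the
// original value is rebuilt after the PHI, roughly
//   %v = phi <4 x i64> [ %a, %bb0 ], [ %b, %bb1 ]
//   -->
//   %v0 = phi i64 [ %a0, %bb0 ], [ %b0, %bb1 ]   ; %a0/%b0 extracted in preds
//   ...                                          ; likewise %v1..%v3
//   %v  = reassembled from %v0..%v3 with insertelement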
bool AMDGPUCodeGenPrepareImpl::visitPHINode(PHINode &I) {}

/// \param V  Value to check
/// \param DL DataLayout
/// \param TM TargetMachine (TODO: remove once DL contains nullptr values)
/// \param AS Target Address Space
/// \return true if \p V cannot be the null value of \p AS, false otherwise.
static bool isPtrKnownNeverNull(const Value *V, const DataLayout &DL,
                                const AMDGPUTargetMachine &TM, unsigned AS) {}

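// A plausible sketch (assuming the rewrite targets the
// @llvm.amdgcn.addrspacecast.nonnull intrinsic): when isPtrKnownNeverNull()
// proves the source of an addrspacecast is non-null, the cast can be lowered
// through the nonnull intrinsic so the backend may omit its null check.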
bool AMDGPUCodeGenPrepareImpl::visitAddrSpaceCastInst(AddrSpaceCastInst &I) {}

bool AMDGPUCodeGenPrepareImpl::visitIntrinsicInst(IntrinsicInst &I) {}

bool AMDGPUCodeGenPrepareImpl::visitBitreverseIntrinsicInst(IntrinsicInst &I) {}

/// Match the non-nan fract pattern:
///   minnum(fsub(x, floor(x)), nextafter(1.0, -1.0))
///
/// Only matched if fract is a useful instruction for the subtarget. This does
/// not account for nan handling; the instruction has a nan check on the input
/// value.
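///
/// Illustrative sketch of the matched IR (f32 case):
///   %fl = call float @llvm.floor.f32(float %x)
///   %d  = fsub float %x, %fl
///   %r  = call float @llvm.minnum.f32(float %d, float 0x3FEFFFFFE0000000)
/// which applyFractPat() rewrites to a call to @llvm.amdgcn.fract.f32.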
Value *AMDGPUCodeGenPrepareImpl::matchFractPat(IntrinsicInst &I) {}

Value *AMDGPUCodeGenPrepareImpl::applyFractPat(IRBuilder<> &Builder,
                                               Value *FractArg) {}

bool AMDGPUCodeGenPrepareImpl::visitMinNum(IntrinsicInst &I) {}

static bool isOneOrNegOne(const Value *Val) {}

// Expand llvm.sqrt.f32 calls with !fpmath metadata in a semi-fast way.
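// Illustrative sketch (assuming the ~1 ulp accuracy of the hardware sqrt is
// acceptable per !fpmath and denormal inputs need no special handling):
//   %r = call float @llvm.sqrt.f32(float %x), !fpmath !0   ; !0 = !{float 1.0}
//   -->
//   %r = call float @llvm.amdgcn.sqrt.f32(float %x)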
bool AMDGPUCodeGenPrepareImpl::visitSqrt(IntrinsicInst &Sqrt) {}

bool AMDGPUCodeGenPrepare::doInitialization(Module &M) {}

bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) {}

PreservedAnalyses AMDGPUCodeGenPreparePass::run(Function &F,
                                                FunctionAnalysisManager &FAM) {}

INITIALIZE_PASS_BEGIN(AMDGPUCodeGenPrepare, DEBUG_TYPE,
                      "AMDGPU IR optimizations", false, false)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(UniformityInfoWrapperPass)
INITIALIZE_PASS_END(AMDGPUCodeGenPrepare, DEBUG_TYPE, "AMDGPU IR optimizations",
                    false, false)

char AMDGPUCodeGenPrepare::ID = 0;

FunctionPass *llvm::createAMDGPUCodeGenPreparePass() {}