//===------- VectorCombine.cpp - Optimize partial vector operations -------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass optimizes scalar/vector interactions using target cost models. The
// transforms implemented here may not fit in traditional loop-based or SLP
// vectorization passes.
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Vectorize/VectorCombine.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/Loads.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include <numeric>
#include <queue>

#define DEBUG_TYPE …
#include "llvm/Transforms/Utils/InstructionWorklist.h"

using namespace llvm;
using namespace llvm::PatternMatch;

STATISTIC(NumVecLoad, "Number of vector loads formed");
STATISTIC(NumVecCmp, "Number of vector compares formed");
STATISTIC(NumVecBO, "Number of vector binops formed");
STATISTIC(NumVecCmpBO, "Number of vector compare + binop formed");
STATISTIC(NumShufOfBitcast, "Number of shuffles moved after bitcast");
STATISTIC(NumScalarBO, "Number of scalar binops formed");
STATISTIC(NumScalarCmp, "Number of scalar compares formed");

static cl::opt<bool> DisableVectorCombine(
    "disable-vector-combine", cl::init(false), cl::Hidden,
    cl::desc("Disable all vector combine transforms"));

static cl::opt<bool> DisableBinopExtractShuffle(
    "disable-binop-extract-shuffle", cl::init(false), cl::Hidden,
    cl::desc("Disable binop extract to shuffle transforms"));

static cl::opt<unsigned> MaxInstrsToScan(
    "vector-combine-max-scan-instrs", cl::init(30), cl::Hidden,
    cl::desc("Max number of instructions to scan for vector combining."));

static const unsigned InvalidIndex = …;

namespace {
class VectorCombine { … };
} // namespace

/// Return the source operand of a potentially bitcasted value. If there is no
/// bitcast, return the input value itself.
static Value *peekThroughBitcasts(Value *V) { … }

static bool canWidenLoad(LoadInst *Load, const TargetTransformInfo &TTI) { … }

bool VectorCombine::vectorizeLoadInsert(Instruction &I) { … }

/// If we are loading a vector and then inserting it into a larger vector with
/// undefined elements, try to load the larger vector and eliminate the insert.
/// This removes a shuffle in IR and may allow combining of other loaded values.
bool VectorCombine::widenSubvectorLoad(Instruction &I) { … }

/// Determine which, if any, of the inputs should be replaced by a shuffle
/// followed by extract from a different index.
ExtractElementInst *VectorCombine::getShuffleExtract(
    ExtractElementInst *Ext0, ExtractElementInst *Ext1,
    unsigned PreferredExtractIndex = InvalidIndex) const { … }
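// Illustrative sketch (hypothetical IR, not taken from this file): the kind of
// rewrite the extract/extract combining below (isExtractExtractCheap through
// foldExtExtBinop) aims for, assuming the cost model decides that one vector
// binop plus a single extract is cheaper than two extracts plus a scalar
// binop:
//
//   %e0 = extractelement <4 x i32> %v0, i32 1
//   %e1 = extractelement <4 x i32> %v1, i32 1
//   %r  = add i32 %e0, %e1
// ==>
//   %vadd = add <4 x i32> %v0, %v1
//   %r    = extractelement <4 x i32> %vadd, i32 1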
/// Compare the relative costs of 2 extracts followed by scalar operation vs.
/// vector operation(s) followed by extract. Return true if the existing
/// instructions are cheaper than a vector alternative. Otherwise, return false
/// and if one of the extracts should be transformed to a shufflevector, set
/// \p ConvertToShuffle to that extract instruction.
bool VectorCombine::isExtractExtractCheap(ExtractElementInst *Ext0,
                                          ExtractElementInst *Ext1,
                                          const Instruction &I,
                                          ExtractElementInst *&ConvertToShuffle,
                                          unsigned PreferredExtractIndex) { … }

/// Create a shuffle that translates (shifts) 1 element from the input vector
/// to a new element location.
static Value *createShiftShuffle(Value *Vec, unsigned OldIndex,
                                 unsigned NewIndex, IRBuilder<> &Builder) { … }

/// Given an extract element instruction with constant index operand, shuffle
/// the source vector (shift the scalar element) to a NewIndex for extraction.
/// Return null if the input can be constant folded, so that we are not
/// creating unnecessary instructions.
static ExtractElementInst *translateExtract(ExtractElementInst *ExtElt,
                                            unsigned NewIndex,
                                            IRBuilder<> &Builder) { … }

/// Try to reduce extract element costs by converting scalar compares to vector
/// compares followed by extract.
/// cmp (ext0 V0, C), (ext1 V1, C)
void VectorCombine::foldExtExtCmp(ExtractElementInst *Ext0,
                                  ExtractElementInst *Ext1, Instruction &I) { … }

/// Try to reduce extract element costs by converting scalar binops to vector
/// binops followed by extract.
/// bo (ext0 V0, C), (ext1 V1, C)
void VectorCombine::foldExtExtBinop(ExtractElementInst *Ext0,
                                    ExtractElementInst *Ext1, Instruction &I) { … }

/// Match an instruction with extracted vector operands.
bool VectorCombine::foldExtractExtract(Instruction &I) { … }

/// Try to replace an extract + scalar fneg + insert with a vector fneg +
/// shuffle.
bool VectorCombine::foldInsExtFNeg(Instruction &I) { … }

/// If this is a bitcast of a shuffle, try to bitcast the source vector to the
/// destination type followed by shuffle. This can enable further transforms by
/// moving bitcasts or shuffles together.
bool VectorCombine::foldBitcastShuffle(Instruction &I) { … }

/// VP Intrinsics whose vector operands are both splat values may be simplified
/// into the scalar version of the operation and the result splatted. This
/// can lead to scalarization down the line.
bool VectorCombine::scalarizeVPIntrinsic(Instruction &I) { … }

/// Match a vector binop or compare instruction with at least one inserted
/// scalar operand and convert to scalar binop/cmp followed by insertelement.
bool VectorCombine::scalarizeBinopOrCmp(Instruction &I) { … }

/// Try to combine a scalar binop + 2 scalar compares of extracted elements of
/// a vector into vector operations followed by extract. Note: The SLP pass
/// may miss this pattern because of implementation problems.
bool VectorCombine::foldExtractedCmps(Instruction &I) { … }

// Check if memory loc modified between two instrs in the same BB
static bool isMemModifiedBetween(BasicBlock::iterator Begin,
                                 BasicBlock::iterator End,
                                 const MemoryLocation &Loc, AAResults &AA) { … }

namespace {
/// Helper class to indicate whether a vector index can be safely scalarized
/// and if a freeze needs to be inserted.
class ScalarizationResult { … };
} // namespace
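// Illustrative sketch (hypothetical IR, not taken from this pass's tests): the
// kind of scalarization scalarizeLoadExtract below performs, assuming the
// index is known to be in bounds, no intervening instruction may write to %p,
// and the cost model agrees:
//
//   %v = load <4 x i32>, ptr %p
//   %e = extractelement <4 x i32> %v, i64 2
// ==>
//   %pe = getelementptr inbounds i32, ptr %p, i64 2
//   %e  = load i32, ptr %pe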
/// Check if it is legal to scalarize a memory access to \p VecTy at index \p
/// Idx. \p Idx must access a valid vector element.
static ScalarizationResult canScalarizeAccess(VectorType *VecTy, Value *Idx,
                                              Instruction *CtxI,
                                              AssumptionCache &AC,
                                              const DominatorTree &DT) { … }

/// The memory operation on a vector of \p ScalarType had alignment of
/// \p VectorAlignment. Compute the maximal, but conservatively correct,
/// alignment that will be valid for the memory operation on a single scalar
/// element of the same type with index \p Idx.
static Align computeAlignmentAfterScalarization(Align VectorAlignment,
                                                Type *ScalarType, Value *Idx,
                                                const DataLayout &DL) { … }

// Combine patterns like:
//   %0 = load <4 x i32>, <4 x i32>* %a
//   %1 = insertelement <4 x i32> %0, i32 %b, i32 1
//   store <4 x i32> %1, <4 x i32>* %a
// to:
//   %0 = bitcast <4 x i32>* %a to i32*
//   %1 = getelementptr inbounds i32, i32* %0, i64 1
//   store i32 %b, i32* %1
bool VectorCombine::foldSingleElementStore(Instruction &I) { … }

/// Try to scalarize vector loads feeding extractelement instructions.
bool VectorCombine::scalarizeLoadExtract(Instruction &I) { … }

/// Try to convert "shuffle (binop), (binop)" into "binop (shuffle), (shuffle)".
bool VectorCombine::foldShuffleOfBinops(Instruction &I) { … }

/// Try to convert "shuffle (castop), (castop)" with a shared castop operand
/// into "castop (shuffle)".
bool VectorCombine::foldShuffleOfCastops(Instruction &I) { … }

/// Try to convert "shuffle (shuffle x, undef), (shuffle y, undef)"
/// into "shuffle x, y".
bool VectorCombine::foldShuffleOfShuffles(Instruction &I) { … }

/// Try to convert
/// "shuffle (intrinsic), (intrinsic)" into "intrinsic (shuffle), (shuffle)".
bool VectorCombine::foldShuffleOfIntrinsics(Instruction &I) { … }

using InstLane = std::pair<Use *, int>;

static InstLane lookThroughShuffles(Use *U, int Lane) { … }

static SmallVector<InstLane>
generateInstLaneVectorFromOperand(ArrayRef<InstLane> Item, int Op) { … }

/// Detect concat of multiple values into a vector
static bool isFreeConcat(ArrayRef<InstLane> Item,
                         const TargetTransformInfo &TTI) { … }

static Value *generateNewInstTree(ArrayRef<InstLane> Item, FixedVectorType *Ty,
                                  const SmallPtrSet<Use *, 4> &IdentityLeafs,
                                  const SmallPtrSet<Use *, 4> &SplatLeafs,
                                  const SmallPtrSet<Use *, 4> &ConcatLeafs,
                                  IRBuilder<> &Builder) { … }

// Starting from a shuffle, look up through operands tracking the shuffled index
// of each lane. If we can simplify away the shuffles to identities then
// do so.
bool VectorCombine::foldShuffleToIdentity(Instruction &I) { … }

/// Given a commutative reduction, the order of the input lanes does not alter
/// the results. We can use this to remove certain shuffles feeding the
/// reduction, removing the need to shuffle at all.
bool VectorCombine::foldShuffleFromReductions(Instruction &I) { … }

/// Determine if it's more efficient to fold:
/// reduce(trunc(x)) -> trunc(reduce(x)).
/// reduce(sext(x)) -> sext(reduce(x)).
/// reduce(zext(x)) -> zext(reduce(x)).
bool VectorCombine::foldCastFromReductions(Instruction &I) { … }
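// Illustrative sketch (hypothetical IR): one instance of the
// reduce(trunc(x)) -> trunc(reduce(x)) fold above. For an integer add
// reduction the truncation and the reduction commute, so choosing between the
// two forms is purely a cost-model question:
//
//   %t = trunc <8 x i32> %x to <8 x i16>
//   %r = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %t)
// ==>
//   %w = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %x)
//   %r = trunc i32 %w to i16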
/// This method looks for groups of shuffles acting on binops, of the form:
///  %x = shuffle ...
///  %y = shuffle ...
///  %a = binop %x, %y
///  %b = binop %x, %y
/// shuffle %a, %b, selectmask
/// We may, especially if the shuffle is wider than legal, be able to convert
/// the shuffle to a form where only parts of a and b need to be computed. On
/// architectures with no obvious "select" shuffle, this can reduce the total
/// number of operations if the target reports them as cheaper.
bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) { … }

/// Check if instruction depends on ZExt and this ZExt can be moved after the
/// instruction. Move ZExt if it is profitable. For example:
///     logic(zext(x),y) -> zext(logic(x,trunc(y)))
///     lshr(zext(x),y) -> zext(lshr(x,trunc(y)))
/// Cost model calculations take into account if zext(x) has other users and
/// whether it can be propagated through them too.
bool VectorCombine::shrinkType(llvm::Instruction &I) { … }

/// This is the entry point for all transforms. Pass manager differences are
/// handled in the callers of this function.
bool VectorCombine::run() { … }

PreservedAnalyses VectorCombinePass::run(Function &F,
                                         FunctionAnalysisManager &FAM) { … }
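// The transforms above can be exercised in isolation with the new pass
// manager, e.g. "opt -passes=vector-combine -S", and switched off wholesale
// with the -disable-vector-combine flag declared near the top of this file.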