llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp

//===- LoadStoreVectorizer.cpp - GPU Load & Store Vectorizer --------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass merges loads/stores to/from sequential memory addresses into vector
// loads/stores.  Although there's nothing GPU-specific in here, this pass is
// motivated by the microarchitectural quirks of NVIDIA and AMD GPUs.
//
// (For simplicity below we talk about loads only, but everything also applies
// to stores.)
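//
// As a purely illustrative sketch (not taken from a real test case), four
// i32 loads from consecutive slots of the same pointer:
//
//   %l0 = load i32, ptr %p
//   %p1 = getelementptr i32, ptr %p, i64 1
//   %l1 = load i32, ptr %p1
//   %p2 = getelementptr i32, ptr %p, i64 2
//   %l2 = load i32, ptr %p2
//   %p3 = getelementptr i32, ptr %p, i64 3
//   %l3 = load i32, ptr %p3
//
// can, when the aliasing and alignment checks described below succeed, be
// rewritten as one wide load followed by element extracts:
//
//   %v  = load <4 x i32>, ptr %p
//   %l0 = extractelement <4 x i32> %v, i32 0
//   %l1 = extractelement <4 x i32> %v, i32 1
//   %l2 = extractelement <4 x i32> %v, i32 2
//   %l3 = extractelement <4 x i32> %v, i32 3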
//
// This pass is intended to be run late in the pipeline, after other
// vectorization opportunities have been exploited.  So the assumption here is
// that immediately following our new vector load we'll need to extract out the
// individual elements of the load, so we can operate on them individually.
//
// On CPUs this transformation is usually not beneficial, because extracting the
// elements of a vector register is expensive on most architectures.  It's
// usually better just to load each element individually into its own scalar
// register.
//
// However, NVIDIA and AMD GPUs don't have proper vector registers.  Instead, a
// "vector load" loads directly into a series of scalar registers.  In effect,
// extracting the elements of the vector is free.  It's therefore always
// beneficial to vectorize a sequence of loads on these architectures.
//
// Vectorizing (perhaps a better name might be "coalescing") loads can have
// large performance impacts on GPU kernels, and opportunities for vectorizing
// are common in GPU code.  This pass tries very hard to find such
// opportunities; its runtime is quadratic in the number of loads in a BB.
//
// Some CPU architectures, such as ARM, have instructions that load into
// multiple scalar registers, similar to a GPU vectorized load.  In theory ARM
// could use this pass (with some modifications), but currently it implements
// its own pass to do something similar to what we do here.
//
// Overview of the algorithm and terminology in this pass:
//
//  - Break up each basic block into pseudo-BBs, composed of instructions which
//    are guaranteed to transfer control to their successors.
//  - Within a single pseudo-BB, find all loads, and group them into
//    "equivalence classes" according to getUnderlyingObject() and loaded
//    element size.  Do the same for stores.
//  - For each equivalence class, greedily build "chains".  Each chain has a
//    leader instruction, and every other member of the chain has a known
//    constant offset from the first instr in the chain.
//  - Break up chains so that they contain only contiguous accesses of legal
//    size with no intervening may-alias instrs.
//  - Convert each chain to vector instructions.
//
// The O(n^2) behavior of this pass comes from initially building the chains.
// In the worst case we have to compare each new instruction to all of those
// that came before. To limit this, we only calculate the offset to the leaders
// of the N most recently-used chains.
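//
// As a hypothetical walk-through of those steps: suppose a pseudo-BB contains
// i32 loads at byte offsets 0, 4, 16, and 20 from the same underlying object.
// All four land in one equivalence class, and greedy chain building yields a
// single chain whose members have OffsetFromLeader values {0, 4, 16, 20}.
// Splitting by contiguity then produces two chains, {0, 4} and {16, 20}, and
// each may become an 8-byte vector access if the alignment and aliasing
// checks succeed.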

#include "llvm/Transforms/Vectorize/LoadStoreVectorizer.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/Sequence.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/MemoryLocation.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/ConstantRange.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GetElementPtrTypeIterator.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Alignment.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/ModRef.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/Local.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstdlib>
#include <iterator>
#include <numeric>
#include <optional>
#include <tuple>
#include <type_traits>
#include <utility>
#include <vector>

using namespace llvm;

#define DEBUG_TYPE "load-store-vectorizer"

STATISTIC(NumVectorInstructions, "Number of vector accesses generated");
STATISTIC(NumScalarsVectorized, "Number of scalar accesses vectorized");

namespace {

// Equivalence class key, the initial tuple by which we group loads/stores.
// Loads/stores with different EqClassKeys are never merged.
//
// (We could in theory remove element-size from this tuple.  We'd just need
// to fix up the vector packing/unpacking code.)
using EqClassKey =
    std::tuple<const Value * /* result of getUnderlyingObject() */,
               unsigned /* AddrSpace */,
               unsigned /* Load/Store element size bits */,
               char /* IsLoad; char b/c bool can't be a DenseMap key */>;
[[maybe_unused]] llvm::raw_ostream &operator<<(llvm::raw_ostream &OS,
                                               const EqClassKey &K) {}
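
// Illustrative example of the grouping rule above: two i32 loads whose
// pointers both trace back via getUnderlyingObject() to the same alloca end
// up with the same EqClassKey, whereas an i32 load and an i64 load from that
// alloca do not, because their loaded element sizes differ.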

// A Chain is a set of instructions such that:
//  - All instructions have the same equivalence class, so in particular all are
//    loads, or all are stores.
//  - We know the address accessed by the i'th chain elem relative to the
//    chain's leader instruction, which is the first instr of the chain in BB
//    order.
//
// Chains have two canonical orderings:
//  - BB order, sorted by Instr->comesBefore.
//  - Offset order, sorted by OffsetFromLeader.
// This pass switches back and forth between these orders.
struct ChainElem {
  Instruction *Inst;
  // Byte offset of this access's address from the chain leader's address.
  APInt OffsetFromLeader;
};
using Chain = SmallVector<ChainElem, 1>;
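
// Illustrative example of the two orderings: if a chain's loads appear in the
// BB as L0, L1, L2 but access offsets 0, 8, and 4 from the leader, then BB
// order is [L0, L1, L2] while offset order is [L0, L2, L1].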

void sortChainInBBOrder(Chain &C) {}

void sortChainInOffsetOrder(Chain &C) {}

[[maybe_unused]] void dumpChain(ArrayRef<ChainElem> C) {}

using EquivalenceClassMap =
    MapVector<EqClassKey, SmallVector<Instruction *, 8>>;

// FIXME: Assuming stack alignment of 4 is always good enough
constexpr unsigned StackAdjustedAlignment = 4;

Instruction *propagateMetadata(Instruction *I, const Chain &C) {}

bool isInvariantLoad(const Instruction *I) {}

/// Reorders the instructions that I depends on (the instructions defining its
/// operands), to ensure they dominate I.
void reorder(Instruction *I) {}
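
// For example (a hypothetical scenario): if a new vector load is emitted
// earlier in the block than one of the scalar loads it replaces, the GEP that
// computed that scalar load's address may now be defined after its new user;
// reorder() moves such operand definitions (recursively) above the new
// instruction so they dominate it.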

class Vectorizer {};

class LoadStoreVectorizerLegacyPass : public FunctionPass {};

} // end anonymous namespace

char LoadStoreVectorizerLegacyPass::ID = 0;

INITIALIZE_PASS_BEGIN(LoadStoreVectorizerLegacyPass, DEBUG_TYPE,
                      "Vectorize load and store instructions", false, false)
INITIALIZE_PASS_DEPENDENCY(SCEVAAWrapperPass)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_END(LoadStoreVectorizerLegacyPass, DEBUG_TYPE,
                    "Vectorize load and store instructions", false, false)

Pass *llvm::createLoadStoreVectorizerPass() {}

bool LoadStoreVectorizerLegacyPass::runOnFunction(Function &F) {}

PreservedAnalyses LoadStoreVectorizerPass::run(Function &F,
                                               FunctionAnalysisManager &AM) {}

bool Vectorizer::run() {}

bool Vectorizer::runOnPseudoBB(BasicBlock::iterator Begin,
                               BasicBlock::iterator End) {}

bool Vectorizer::runOnEquivalenceClass(const EqClassKey &EqClassKey,
                                       ArrayRef<Instruction *> EqClass) {}

bool Vectorizer::runOnChain(Chain &C) {}

std::vector<Chain> Vectorizer::splitChainByMayAliasInstrs(Chain &C) {}

std::vector<Chain> Vectorizer::splitChainByContiguity(Chain &C) {}

Type *Vectorizer::getChainElemTy(const Chain &C) {}

std::vector<Chain> Vectorizer::splitChainByAlignment(Chain &C) {}

bool Vectorizer::vectorizeChain(Chain &C) {}

template <bool IsLoadChain>
bool Vectorizer::isSafeToMove(
    Instruction *ChainElem, Instruction *ChainBegin,
    const DenseMap<Instruction *, APInt /*OffsetFromLeader*/> &ChainOffsets) {}

static bool checkNoWrapFlags(Instruction *I, bool Signed) {}

static bool checkIfSafeAddSequence(const APInt &IdxDiff, Instruction *AddOpA,
                                   unsigned MatchingOpIdxA, Instruction *AddOpB,
                                   unsigned MatchingOpIdxB, bool Signed) {}

std::optional<APInt> Vectorizer::getConstantOffsetComplexAddrs(
    Value *PtrA, Value *PtrB, Instruction *ContextInst, unsigned Depth) {}

std::optional<APInt> Vectorizer::getConstantOffsetSelects(
    Value *PtrA, Value *PtrB, Instruction *ContextInst, unsigned Depth) {}

EquivalenceClassMap
Vectorizer::collectEquivalenceClasses(BasicBlock::iterator Begin,
                                      BasicBlock::iterator End) {}

std::vector<Chain> Vectorizer::gatherChains(ArrayRef<Instruction *> Instrs) {}

std::optional<APInt> Vectorizer::getConstantOffset(Value *PtrA, Value *PtrB,
                                                   Instruction *ContextInst,
                                                   unsigned Depth) {}