llvm/llvm/lib/Target/PowerPC/PPCLoopInstrFormPrep.cpp

//===------ PPCLoopInstrFormPrep.cpp - Loop Instr Form Prep Pass ----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements a pass to prepare loops for ppc preferred addressing
// modes, leveraging different instruction form. (eg: DS/DQ form, D/DS form with
// update)
// Additional PHIs are created for loop induction variables used by load/store
// instructions so that preferred addressing modes can be used.
//
// 1: DS/DQ form preparation, prepare the load/store instructions so that they
//    can satisfy the DS/DQ form displacement requirements.
//    Generically, this means transforming loops like this:
//    for (int i = 0; i < n; ++i) {
//      unsigned long x1 = *(unsigned long *)(p + i + 5);
//      unsigned long x2 = *(unsigned long *)(p + i + 9);
//    }
//
//    to look like this:
//
//    unsigned NewP = p + 5;
//    for (int i = 0; i < n; ++i) {
//      unsigned long x1 = *(unsigned long *)(i + NewP);
//      unsigned long x2 = *(unsigned long *)(i + NewP + 4);
//    }
//
// 2: D/DS form with update preparation, prepare the load/store instructions so
//    that we can use update form to do pre-increment.
//    Generically, this means transforming loops like this:
//    for (int i = 0; i < n; ++i)
//      array[i] = c;
//
//    to look like this:
//
//    T *p = array[-1];
//    for (int i = 0; i < n; ++i)
//      *++p = c;
//
// 3: common multiple chains for the load/stores with same offsets in the loop,
//    so that we can reuse the offsets and reduce the register pressure in the
//    loop. This transformation can also increase the loop ILP as now each chain
//    uses its own loop induction add/addi. But this will increase the number of
//    add/addi in the loop.
//
//    Generically, this means transforming loops like this:
//
//    char *p;
//    A1 = p + base1
//    A2 = p + base1 + offset
//    B1 = p + base2
//    B2 = p + base2 + offset
//
//    for (int i = 0; i < n; i++)
//      unsigned long x1 = *(unsigned long *)(A1 + i);
//      unsigned long x2 = *(unsigned long *)(A2 + i)
//      unsigned long x3 = *(unsigned long *)(B1 + i);
//      unsigned long x4 = *(unsigned long *)(B2 + i);
//    }
//
//    to look like this:
//
//    A1_new = p + base1 // chain 1
//    B1_new = p + base2 // chain 2, now inside the loop, common offset is
//                       // reused.
//
//    for (long long i = 0; i < n; i+=count) {
//      unsigned long x1 = *(unsigned long *)(A1_new + i);
//      unsigned long x2 = *(unsigned long *)((A1_new + i) + offset);
//      unsigned long x3 = *(unsigned long *)(B1_new + i);
//      unsigned long x4 = *(unsigned long *)((B1_new + i) + offset);
//    }
//===----------------------------------------------------------------------===//

#include "PPC.h"
#include "PPCSubtarget.h"
#include "PPCTargetMachine.h"
#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/IntrinsicsPowerPC.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include <cassert>
#include <cmath>
#include <iterator>
#include <utility>

#define DEBUG_TYPE

usingnamespacellvm;

static cl::opt<unsigned>
    MaxVarsPrep("ppc-formprep-max-vars", cl::Hidden, cl::init(24),
                cl::desc("Potential common base number threshold per function "
                         "for PPC loop prep"));

static cl::opt<bool> PreferUpdateForm("ppc-formprep-prefer-update",
                                 cl::init(true), cl::Hidden,
  cl::desc("prefer update form when ds form is also a update form"));

static cl::opt<bool> EnableUpdateFormForNonConstInc(
    "ppc-formprep-update-nonconst-inc", cl::init(false), cl::Hidden,
    cl::desc("prepare update form when the load/store increment is a loop "
             "invariant non-const value."));

static cl::opt<bool> EnableChainCommoning(
    "ppc-formprep-chain-commoning", cl::init(false), cl::Hidden,
    cl::desc("Enable chain commoning in PPC loop prepare pass."));

// Sum of following 3 per loop thresholds for all loops can not be larger
// than MaxVarsPrep.
// now the thresholds for each kind prep are exterimental values on Power9.
static cl::opt<unsigned> MaxVarsUpdateForm("ppc-preinc-prep-max-vars",
                                 cl::Hidden, cl::init(3),
  cl::desc("Potential PHI threshold per loop for PPC loop prep of update "
           "form"));

static cl::opt<unsigned> MaxVarsDSForm("ppc-dsprep-max-vars",
                                 cl::Hidden, cl::init(3),
  cl::desc("Potential PHI threshold per loop for PPC loop prep of DS form"));

static cl::opt<unsigned> MaxVarsDQForm("ppc-dqprep-max-vars",
                                 cl::Hidden, cl::init(8),
  cl::desc("Potential PHI threshold per loop for PPC loop prep of DQ form"));

// Commoning chain will reduce the register pressure, so we don't consider about
// the PHI nodes number.
// But commoning chain will increase the addi/add number in the loop and also
// increase loop ILP. Maximum chain number should be same with hardware
// IssueWidth, because we won't benefit from ILP if the parallel chains number
// is bigger than IssueWidth. We assume there are 2 chains in one bucket, so
// there would be 4 buckets at most on P9(IssueWidth is 8).
static cl::opt<unsigned> MaxVarsChainCommon(
    "ppc-chaincommon-max-vars", cl::Hidden, cl::init(4),
    cl::desc("Bucket number per loop for PPC loop chain common"));

// If would not be profitable if the common base has only one load/store, ISEL
// should already be able to choose best load/store form based on offset for
// single load/store. Set minimal profitable value default to 2 and make it as
// an option.
static cl::opt<unsigned> DispFormPrepMinThreshold("ppc-dispprep-min-threshold",
                                    cl::Hidden, cl::init(2),
  cl::desc("Minimal common base load/store instructions triggering DS/DQ form "
           "preparation"));

static cl::opt<unsigned> ChainCommonPrepMinThreshold(
    "ppc-chaincommon-min-threshold", cl::Hidden, cl::init(4),
    cl::desc("Minimal common base load/store instructions triggering chain "
             "commoning preparation. Must be not smaller than 4"));

STATISTIC(PHINodeAlreadyExistsUpdate, "PHI node already in pre-increment form");
STATISTIC(PHINodeAlreadyExistsDS, "PHI node already in DS form");
STATISTIC(PHINodeAlreadyExistsDQ, "PHI node already in DQ form");
STATISTIC(DSFormChainRewritten, "Num of DS form chain rewritten");
STATISTIC(DQFormChainRewritten, "Num of DQ form chain rewritten");
STATISTIC(UpdFormChainRewritten, "Num of update form chain rewritten");
STATISTIC(ChainCommoningRewritten, "Num of commoning chains");

namespace {
  struct BucketElement {};

  struct Bucket {};

  // "UpdateForm" is not a real PPC instruction form, it stands for dform
  // load/store with update like ldu/stdu, or Prefetch intrinsic.
  // For DS form instructions, their displacements must be multiple of 4.
  // For DQ form instructions, their displacements must be multiple of 16.
  enum PrepForm {};

  class PPCLoopInstrFormPrep : public FunctionPass {};

} // end anonymous namespace

char PPCLoopInstrFormPrep::ID =;
static const char *name =;
INITIALIZE_PASS_BEGIN(PPCLoopInstrFormPrep, DEBUG_TYPE, name, false, false)
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
INITIALIZE_PASS_END(PPCLoopInstrFormPrep, DEBUG_TYPE, name, false, false)

static constexpr StringRef PHINodeNameSuffix    =;
static constexpr StringRef CastNodeNameSuffix   =;
static constexpr StringRef GEPNodeIncNameSuffix =;
static constexpr StringRef GEPNodeOffNameSuffix =;

FunctionPass *llvm::createPPCLoopInstrFormPrepPass(PPCTargetMachine &TM) {}

static bool IsPtrInBounds(Value *BasePtr) {}

static std::string getInstrName(const Value *I, StringRef Suffix) {}

static Value *getPointerOperandAndType(Value *MemI,
                                       Type **PtrElementType = nullptr) {}

bool PPCLoopInstrFormPrep::runOnFunction(Function &F) {}

// Finding the minimal(chain_number + reusable_offset_number) is a complicated
// algorithmic problem.
// For now, the algorithm used here is simply adjusted to handle the case for
// manually unrolling cases.
// FIXME: use a more powerful algorithm to find minimal sum of chain_number and
// reusable_offset_number for one base with multiple offsets.
bool PPCLoopInstrFormPrep::prepareBasesForCommoningChains(Bucket &CBucket) {}

bool PPCLoopInstrFormPrep::chainCommoning(Loop *L,
                                          SmallVector<Bucket, 16> &Buckets) {}

bool PPCLoopInstrFormPrep::rewriteLoadStoresForCommoningChains(
    Loop *L, Bucket &Bucket, SmallSet<BasicBlock *, 16> &BBChanged) {}

// Rewrite the new base according to BasePtrSCEV.
// bb.loop.preheader:
//   %newstart = ...
// bb.loop.body:
//   %phinode = phi [ %newstart, %bb.loop.preheader ], [ %add, %bb.loop.body ]
//   ...
//   %add = getelementptr %phinode, %inc
//
// First returned instruciton is %phinode (or a type cast to %phinode), caller
// needs this value to rewrite other load/stores in the same chain.
// Second returned instruction is %add, caller needs this value to rewrite other
// load/stores in the same chain.
std::pair<Instruction *, Instruction *>
PPCLoopInstrFormPrep::rewriteForBase(Loop *L, const SCEVAddRecExpr *BasePtrSCEV,
                                     Instruction *BaseMemI, bool CanPreInc,
                                     PrepForm Form, SCEVExpander &SCEVE,
                                     SmallPtrSet<Value *, 16> &DeletedPtrs) {}

Instruction *PPCLoopInstrFormPrep::rewriteForBucketElement(
    std::pair<Instruction *, Instruction *> Base, const BucketElement &Element,
    Value *OffToBase, SmallPtrSet<Value *, 16> &DeletedPtrs) {}

void PPCLoopInstrFormPrep::addOneCandidate(
    Instruction *MemI, const SCEV *LSCEV, SmallVector<Bucket, 16> &Buckets,
    std::function<bool(const SCEV *)> isValidDiff, unsigned MaxCandidateNum) {}

SmallVector<Bucket, 16> PPCLoopInstrFormPrep::collectCandidates(
    Loop *L,
    std::function<bool(const Instruction *, Value *, const Type *)>
        isValidCandidate,
    std::function<bool(const SCEV *)> isValidDiff, unsigned MaxCandidateNum) {}

bool PPCLoopInstrFormPrep::prepareBaseForDispFormChain(Bucket &BucketChain,
                                                       PrepForm Form) {}

// FIXME: implement a more clever base choosing policy.
// Currently we always choose an exist load/store offset. This maybe lead to
// suboptimal code sequences. For example, for one DS chain with offsets
// {-32769, 2003, 2007, 2011}, we choose -32769 as base offset, and left disp
// for load/stores are {0, 34772, 34776, 34780}. Though each offset now is a
// multipler of 4, it cannot be represented by sint16.
bool PPCLoopInstrFormPrep::prepareBaseForUpdateFormChain(Bucket &BucketChain) {}

bool PPCLoopInstrFormPrep::rewriteLoadStores(
    Loop *L, Bucket &BucketChain, SmallSet<BasicBlock *, 16> &BBChanged,
    PrepForm Form) {}

bool PPCLoopInstrFormPrep::updateFormPrep(Loop *L,
                                       SmallVector<Bucket, 16> &Buckets) {}

bool PPCLoopInstrFormPrep::dispFormPrep(Loop *L,
                                        SmallVector<Bucket, 16> &Buckets,
                                        PrepForm Form) {}

// Find the loop invariant increment node for SCEV BasePtrIncSCEV.
// bb.loop.preheader:
//   %start = ...
// bb.loop.body:
//   %phinode = phi [ %start, %bb.loop.preheader ], [ %add, %bb.loop.body ]
//   ...
//   %add = add %phinode, %inc  ; %inc is what we want to get.
//
Value *PPCLoopInstrFormPrep::getNodeForInc(Loop *L, Instruction *MemI,
                                           const SCEV *BasePtrIncSCEV) {}

// In order to prepare for the preferred instruction form, a PHI is added.
// This function will check to see if that PHI already exists and will return
// true if it found an existing PHI with the matched start and increment as the
// one we wanted to create.
bool PPCLoopInstrFormPrep::alreadyPrepared(Loop *L, Instruction *MemI,
                                           const SCEV *BasePtrStartSCEV,
                                           const SCEV *BasePtrIncSCEV,
                                           PrepForm Form) {}

bool PPCLoopInstrFormPrep::runOnLoop(Loop *L) {}