#include "llvm/Transforms/IPO/OpenMPOpt.h"
#include "llvm/ADT/EnumeratedArray.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/CallGraph.h"
#include "llvm/Analysis/MemoryLocation.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Frontend/OpenMP/OMPConstants.h"
#include "llvm/Frontend/OpenMP/OMPDeviceConstants.h"
#include "llvm/Frontend/OpenMP/OMPIRBuilder.h"
#include "llvm/IR/Assumptions.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsNVPTX.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Transforms/IPO/Attributor.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/CallGraphUpdater.h"
#include <algorithm>
#include <limits>
#include <optional>
#include <string>
usingnamespacellvm;
usingnamespaceomp;
#define DEBUG_TYPE …
/// Hidden boolean option `openmp-opt-disable` (default: false). Per its
/// description, setting it disables the OpenMP-specific optimizations.
static cl::opt<bool> DisableOpenMPOptimizations(
"openmp-opt-disable", cl::desc("Disable OpenMP specific optimizations."),
cl::Hidden, cl::init(false));
/// Hidden boolean option `openmp-opt-enable-merging` (default: false). Opt-in
/// switch for the OpenMP region merging optimization named in its description.
static cl::opt<bool> EnableParallelRegionMerging(
"openmp-opt-enable-merging",
cl::desc("Enable the OpenMP region merging optimization."), cl::Hidden,
cl::init(false));
/// Hidden boolean option `openmp-opt-disable-internalization` (default:
/// false). Per its description, setting it disables function internalization.
static cl::opt<bool>
DisableInternalization("openmp-opt-disable-internalization",
cl::desc("Disable function internalization."),
cl::Hidden, cl::init(false));
/// Hidden boolean option `openmp-deduce-icv-values` (default: false). It
/// carries no description string.
/// NOTE(review): the name suggests it gates deduction of ICV values (compare
/// the AAICVTracker attributes in this file) -- confirm at the use site.
static cl::opt<bool> DeduceICVValues("openmp-deduce-icv-values",
cl::init(false), cl::Hidden);
/// Hidden boolean option `openmp-print-icv-values` (default: false). It
/// carries no description string.
/// NOTE(review): presumably a debugging aid that prints ICV values -- confirm
/// at the use site.
static cl::opt<bool> PrintICVValues("openmp-print-icv-values", cl::init(false),
cl::Hidden);
/// Hidden boolean option `openmp-print-gpu-kernels` (default: false). It
/// carries no description string.
/// NOTE(review): presumably a debugging aid that prints the identified GPU
/// kernels -- confirm at the use site.
static cl::opt<bool> PrintOpenMPKernels("openmp-print-gpu-kernels",
cl::init(false), cl::Hidden);
/// Hidden boolean option `openmp-hide-memory-transfer-latency` (default:
/// false). Its description marks the feature as work in progress: an attempt
/// to hide the latency of host-to-device memory transfers.
static cl::opt<bool> HideMemoryTransferLatency(
"openmp-hide-memory-transfer-latency",
cl::desc("[WIP] Tries to hide the latency of host to device memory"
" transfers"),
cl::Hidden, cl::init(false));
/// Hidden boolean option `openmp-opt-disable-deglobalization` (default:
/// false). Per its description, setting it disables the optimizations that
/// involve deglobalization.
static cl::opt<bool> DisableOpenMPOptDeglobalization(
"openmp-opt-disable-deglobalization",
cl::desc("Disable OpenMP optimizations involving deglobalization."),
cl::Hidden, cl::init(false));
/// Hidden boolean option `openmp-opt-disable-spmdization` (default: false).
/// Per its description, setting it disables the optimizations that involve
/// SPMD-ization.
static cl::opt<bool> DisableOpenMPOptSPMDization(
"openmp-opt-disable-spmdization",
cl::desc("Disable OpenMP optimizations involving SPMD-ization."),
cl::Hidden, cl::init(false));
/// Hidden boolean option `openmp-opt-disable-folding` (default: false). Per
/// its description, setting it disables the optimizations that involve
/// folding.
static cl::opt<bool> DisableOpenMPOptFolding(
"openmp-opt-disable-folding",
cl::desc("Disable OpenMP optimizations involving folding."), cl::Hidden,
cl::init(false));
/// Hidden boolean option `openmp-opt-disable-state-machine-rewrite` (default:
/// false). Per its description, setting it disables the optimizations that
/// replace the state machine.
static cl::opt<bool> DisableOpenMPOptStateMachineRewrite(
"openmp-opt-disable-state-machine-rewrite",
cl::desc("Disable OpenMP optimizations that replace the state machine."),
cl::Hidden, cl::init(false));
/// Hidden boolean option `openmp-opt-disable-barrier-elimination` (default:
/// false). Per its description, setting it disables the optimizations that
/// eliminate barriers.
static cl::opt<bool> DisableOpenMPOptBarrierElimination(
"openmp-opt-disable-barrier-elimination",
cl::desc("Disable OpenMP optimizations that eliminate barriers."),
cl::Hidden, cl::init(false));
/// Hidden boolean option `openmp-opt-print-module-after` (default: false).
/// Per its description, requests a print of the current module after the
/// OpenMP optimizations.
static cl::opt<bool> PrintModuleAfterOptimizations(
"openmp-opt-print-module-after",
cl::desc("Print the current module after OpenMP optimizations."),
cl::Hidden, cl::init(false));
/// Hidden boolean option `openmp-opt-print-module-before` (default: false).
/// Per its description, requests a print of the current module before the
/// OpenMP optimizations.
static cl::opt<bool> PrintModuleBeforeOptimizations(
"openmp-opt-print-module-before",
cl::desc("Print the current module before OpenMP optimizations."),
cl::Hidden, cl::init(false));
/// Hidden boolean option `openmp-opt-inline-device` (default: false). Per its
/// description, requests inlining of all applicable functions on the device.
static cl::opt<bool> AlwaysInlineDeviceFunctions(
"openmp-opt-inline-device",
cl::desc("Inline all applicable functions on the device."), cl::Hidden,
cl::init(false));
/// Hidden boolean option `openmp-opt-verbose-remarks` (default: false). Per
/// its description, setting it enables more verbose remarks.
static cl::opt<bool>
EnableVerboseRemarks("openmp-opt-verbose-remarks",
cl::desc("Enables more verbose remarks."), cl::Hidden,
cl::init(false));
/// Hidden unsigned option `openmp-opt-max-iterations` (default: 256). Per its
/// description, the maximal number of attributor iterations.
static cl::opt<unsigned>
SetFixpointIterations("openmp-opt-max-iterations", cl::Hidden,
cl::desc("Maximal number of attributor iterations."),
cl::init(256));
/// Hidden unsigned option `openmp-opt-shared-limit`. Per its description, the
/// maximum amount of shared memory to use. The default is the largest
/// `unsigned` value, so the limit never binds unless the option is given.
/// NOTE(review): the unit is not stated here (bytes, judging by the
/// NumBytesMovedToSharedMemory statistic) -- confirm at the use site.
static cl::opt<unsigned>
SharedMemoryLimit("openmp-opt-shared-limit", cl::Hidden,
cl::desc("Maximum amount of shared memory to use."),
cl::init(std::numeric_limits<unsigned>::max()));
// Statistics for this pass. Each STATISTIC(Name, Desc) entry introduces a
// counter called Name; Desc is the human-readable text associated with it.
// This group covers runtime-call deduplication, deleted parallel regions,
// identified runtime functions and their uses, and kernel counts (OpenMP
// target-region kernels, other kernels, and kernels run in SPMD mode).
STATISTIC(NumOpenMPRuntimeCallsDeduplicated,
"Number of OpenMP runtime calls deduplicated");
STATISTIC(NumOpenMPParallelRegionsDeleted,
"Number of OpenMP parallel regions deleted");
STATISTIC(NumOpenMPRuntimeFunctionsIdentified,
"Number of OpenMP runtime functions identified");
STATISTIC(NumOpenMPRuntimeFunctionUsesIdentified,
"Number of OpenMP runtime function uses identified");
STATISTIC(NumOpenMPTargetRegionKernels,
"Number of OpenMP target region entry points (=kernels) identified");
STATISTIC(NumNonOpenMPTargetRegionKernels,
"Number of non-OpenMP target region kernels identified");
STATISTIC(NumOpenMPTargetRegionKernelsSPMD,
"Number of OpenMP target region entry points (=kernels) executed in "
"SPMD-mode instead of generic-mode");
// Counts kernels that run in generic mode with no state machine at all (as
// opposed to the customized-state-machine counters declared next to it).
STATISTIC(NumOpenMPTargetRegionKernelsWithoutStateMachine,
"Number of OpenMP target region entry points (=kernels) executed in "
"generic-mode without a state machine");
// Further pass statistics; the description string of each entry states what
// is counted: generic-mode kernels using a customized state machine (with and
// without fallback), parallel regions replaced by an ID in GPU state
// machines, merged parallel regions, memory moved to shared memory, and
// eliminated redundant barriers.
STATISTIC(NumOpenMPTargetRegionKernelsCustomStateMachineWithFallback,
"Number of OpenMP target region entry points (=kernels) executed in "
"generic-mode with customized state machines with fallback");
STATISTIC(NumOpenMPTargetRegionKernelsCustomStateMachineWithoutFallback,
"Number of OpenMP target region entry points (=kernels) executed in "
"generic-mode with customized state machines without fallback");
STATISTIC(
NumOpenMPParallelRegionsReplacedInGPUStateMachine,
"Number of OpenMP parallel regions replaced with ID in GPU state machines");
STATISTIC(NumOpenMPParallelRegionsMerged,
"Number of OpenMP parallel regions merged");
STATISTIC(NumBytesMovedToSharedMemory,
"Amount of memory pushed to shared memory");
STATISTIC(NumBarriersEliminated, "Number of redundant barriers eliminated");
// Compile-time string made of DEBUG_TYPE enclosed in square brackets. It is
// only declared when NDEBUG is not defined, so code that uses it must be
// compiled out under NDEBUG as well.
#ifndef NDEBUG
static constexpr const char *TAG = "[" DEBUG_TYPE "]";
#endif
KernelInfo
namespace {
// Forward declarations: both structs are defined further down in this file,
// but are named here so that earlier code can refer to them.
struct AAHeapToShared;
struct AAICVTracker;
struct OMPInformationCache : public InformationCache { … };
template <typename Ty, bool InsertInvalidates = true>
struct BooleanStateWithSetVector : public BooleanState { … };
BooleanStateWithPtrSetVector;
struct KernelInfoState : AbstractState { … };
struct OffloadArray { … };
struct OpenMPOpt { … };
Kernel OpenMPOpt::getUniqueKernelFor(Function &F) { … }
bool OpenMPOpt::rewriteDeviceCodeStateMachine() { … }
struct AAICVTracker : public StateWrapper<BooleanState, AbstractAttribute> { … };
struct AAICVTrackerFunction : public AAICVTracker { … };
struct AAICVTrackerFunctionReturned : AAICVTracker { … };
struct AAICVTrackerCallSite : AAICVTracker { … };
struct AAICVTrackerCallSiteReturned : AAICVTracker { … };
static bool hasFunctionEndAsUniqueSuccessor(const BasicBlock *BB) { … }
struct AAExecutionDomainFunction : public AAExecutionDomain { … };
void AAExecutionDomainFunction::mergeInPredecessorBarriersAndAssumptions(
Attributor &A, ExecutionDomainTy &ED, const ExecutionDomainTy &PredED) { … }
bool AAExecutionDomainFunction::mergeInPredecessor(
Attributor &A, ExecutionDomainTy &ED, const ExecutionDomainTy &PredED,
bool InitialEdgeOnly) { … }
bool AAExecutionDomainFunction::handleCallees(Attributor &A,
ExecutionDomainTy &EntryBBED) { … }
ChangeStatus AAExecutionDomainFunction::updateImpl(Attributor &A) { … }
struct AAHeapToShared : public StateWrapper<BooleanState, AbstractAttribute> { … };
struct AAHeapToSharedFunction : public AAHeapToShared { … };
struct AAKernelInfo : public StateWrapper<KernelInfoState, AbstractAttribute> { … };
struct AAKernelInfoFunction : AAKernelInfo { … };
struct AAKernelInfoCallSite : AAKernelInfo { … };
struct AAFoldRuntimeCall
: public StateWrapper<BooleanState, AbstractAttribute> { … };
struct AAFoldRuntimeCallCallSiteReturned : AAFoldRuntimeCall { … };
}
void OpenMPOpt::registerFoldRuntimeCall(RuntimeFunction RF) { … }
void OpenMPOpt::registerAAs(bool IsModulePass) { … }
void OpenMPOpt::registerAAsForFunction(Attributor &A, const Function &F) { … }
const char AAICVTracker::ID = …;
const char AAKernelInfo::ID = …;
const char AAExecutionDomain::ID = …;
const char AAHeapToShared::ID = …;
const char AAFoldRuntimeCall::ID = …;
AAICVTracker &AAICVTracker::createForPosition(const IRPosition &IRP,
Attributor &A) { … }
AAExecutionDomain &AAExecutionDomain::createForPosition(const IRPosition &IRP,
Attributor &A) { … }
AAHeapToShared &AAHeapToShared::createForPosition(const IRPosition &IRP,
Attributor &A) { … }
AAKernelInfo &AAKernelInfo::createForPosition(const IRPosition &IRP,
Attributor &A) { … }
AAFoldRuntimeCall &AAFoldRuntimeCall::createForPosition(const IRPosition &IRP,
Attributor &A) { … }
PreservedAnalyses OpenMPOptPass::run(Module &M, ModuleAnalysisManager &AM) { … }
PreservedAnalyses OpenMPOptCGSCCPass::run(LazyCallGraph::SCC &C,
CGSCCAnalysisManager &AM,
LazyCallGraph &CG,
CGSCCUpdateResult &UR) { … }
bool llvm::omp::isOpenMPKernel(Function &Fn) { … }
KernelSet llvm::omp::getDeviceKernels(Module &M) { … }
bool llvm::omp::containsOpenMP(Module &M) { … }
bool llvm::omp::isOpenMPDevice(Module &M) { … }