llvm/llvm/lib/Transforms/IPO/OpenMPOpt.cpp

//===-- IPO/OpenMPOpt.cpp - Collection of OpenMP specific optimizations ---===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// OpenMP specific optimizations:
//
// - Deduplication of runtime calls, e.g., omp_get_thread_num.
// - Replacing globalized device memory with stack memory.
// - Replacing globalized device memory with shared memory.
// - Parallel region merging.
// - Transforming generic-mode device kernels to SPMD mode.
// - Specializing the state machine for generic-mode device kernels.
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/IPO/OpenMPOpt.h"

#include "llvm/ADT/EnumeratedArray.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/CallGraph.h"
#include "llvm/Analysis/MemoryLocation.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Frontend/OpenMP/OMPConstants.h"
#include "llvm/Frontend/OpenMP/OMPDeviceConstants.h"
#include "llvm/Frontend/OpenMP/OMPIRBuilder.h"
#include "llvm/IR/Assumptions.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsNVPTX.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Transforms/IPO/Attributor.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/CallGraphUpdater.h"

#include <algorithm>
#include <optional>
#include <string>

usingnamespacellvm;
usingnamespaceomp;

#define DEBUG_TYPE

static cl::opt<bool> DisableOpenMPOptimizations(
    "openmp-opt-disable", cl::desc("Disable OpenMP specific optimizations."),
    cl::Hidden, cl::init(false));

static cl::opt<bool> EnableParallelRegionMerging(
    "openmp-opt-enable-merging",
    cl::desc("Enable the OpenMP region merging optimization."), cl::Hidden,
    cl::init(false));

static cl::opt<bool>
    DisableInternalization("openmp-opt-disable-internalization",
                           cl::desc("Disable function internalization."),
                           cl::Hidden, cl::init(false));

static cl::opt<bool> DeduceICVValues("openmp-deduce-icv-values",
                                     cl::init(false), cl::Hidden);
static cl::opt<bool> PrintICVValues("openmp-print-icv-values", cl::init(false),
                                    cl::Hidden);
static cl::opt<bool> PrintOpenMPKernels("openmp-print-gpu-kernels",
                                        cl::init(false), cl::Hidden);

static cl::opt<bool> HideMemoryTransferLatency(
    "openmp-hide-memory-transfer-latency",
    cl::desc("[WIP] Tries to hide the latency of host to device memory"
             " transfers"),
    cl::Hidden, cl::init(false));

static cl::opt<bool> DisableOpenMPOptDeglobalization(
    "openmp-opt-disable-deglobalization",
    cl::desc("Disable OpenMP optimizations involving deglobalization."),
    cl::Hidden, cl::init(false));

static cl::opt<bool> DisableOpenMPOptSPMDization(
    "openmp-opt-disable-spmdization",
    cl::desc("Disable OpenMP optimizations involving SPMD-ization."),
    cl::Hidden, cl::init(false));

static cl::opt<bool> DisableOpenMPOptFolding(
    "openmp-opt-disable-folding",
    cl::desc("Disable OpenMP optimizations involving folding."), cl::Hidden,
    cl::init(false));

static cl::opt<bool> DisableOpenMPOptStateMachineRewrite(
    "openmp-opt-disable-state-machine-rewrite",
    cl::desc("Disable OpenMP optimizations that replace the state machine."),
    cl::Hidden, cl::init(false));

static cl::opt<bool> DisableOpenMPOptBarrierElimination(
    "openmp-opt-disable-barrier-elimination",
    cl::desc("Disable OpenMP optimizations that eliminate barriers."),
    cl::Hidden, cl::init(false));

static cl::opt<bool> PrintModuleAfterOptimizations(
    "openmp-opt-print-module-after",
    cl::desc("Print the current module after OpenMP optimizations."),
    cl::Hidden, cl::init(false));

static cl::opt<bool> PrintModuleBeforeOptimizations(
    "openmp-opt-print-module-before",
    cl::desc("Print the current module before OpenMP optimizations."),
    cl::Hidden, cl::init(false));

static cl::opt<bool> AlwaysInlineDeviceFunctions(
    "openmp-opt-inline-device",
    cl::desc("Inline all applicible functions on the device."), cl::Hidden,
    cl::init(false));

static cl::opt<bool>
    EnableVerboseRemarks("openmp-opt-verbose-remarks",
                         cl::desc("Enables more verbose remarks."), cl::Hidden,
                         cl::init(false));

static cl::opt<unsigned>
    SetFixpointIterations("openmp-opt-max-iterations", cl::Hidden,
                          cl::desc("Maximal number of attributor iterations."),
                          cl::init(256));

static cl::opt<unsigned>
    SharedMemoryLimit("openmp-opt-shared-limit", cl::Hidden,
                      cl::desc("Maximum amount of shared memory to use."),
                      cl::init(std::numeric_limits<unsigned>::max()));

STATISTIC(NumOpenMPRuntimeCallsDeduplicated,
          "Number of OpenMP runtime calls deduplicated");
STATISTIC(NumOpenMPParallelRegionsDeleted,
          "Number of OpenMP parallel regions deleted");
STATISTIC(NumOpenMPRuntimeFunctionsIdentified,
          "Number of OpenMP runtime functions identified");
STATISTIC(NumOpenMPRuntimeFunctionUsesIdentified,
          "Number of OpenMP runtime function uses identified");
STATISTIC(NumOpenMPTargetRegionKernels,
          "Number of OpenMP target region entry points (=kernels) identified");
STATISTIC(NumNonOpenMPTargetRegionKernels,
          "Number of non-OpenMP target region kernels identified");
STATISTIC(NumOpenMPTargetRegionKernelsSPMD,
          "Number of OpenMP target region entry points (=kernels) executed in "
          "SPMD-mode instead of generic-mode");
STATISTIC(NumOpenMPTargetRegionKernelsWithoutStateMachine,
          "Number of OpenMP target region entry points (=kernels) executed in "
          "generic-mode without a state machines");
STATISTIC(NumOpenMPTargetRegionKernelsCustomStateMachineWithFallback,
          "Number of OpenMP target region entry points (=kernels) executed in "
          "generic-mode with customized state machines with fallback");
STATISTIC(NumOpenMPTargetRegionKernelsCustomStateMachineWithoutFallback,
          "Number of OpenMP target region entry points (=kernels) executed in "
          "generic-mode with customized state machines without fallback");
STATISTIC(
    NumOpenMPParallelRegionsReplacedInGPUStateMachine,
    "Number of OpenMP parallel regions replaced with ID in GPU state machines");
STATISTIC(NumOpenMPParallelRegionsMerged,
          "Number of OpenMP parallel regions merged");
STATISTIC(NumBytesMovedToSharedMemory,
          "Amount of memory pushed to shared memory");
STATISTIC(NumBarriersEliminated, "Number of redundant barriers eliminated");

#if !defined(NDEBUG)
static constexpr auto TAG = "[" DEBUG_TYPE "]";
#endif

KernelInfo // namespace KernelInfo

namespace {

struct AAHeapToShared;

struct AAICVTracker;

/// OpenMP specific information. For now, stores RFIs and ICVs also needed for
/// Attributor runs.
struct OMPInformationCache : public InformationCache {};

template <typename Ty, bool InsertInvalidates = true>
struct BooleanStateWithSetVector : public BooleanState {};

BooleanStateWithPtrSetVector;

struct KernelInfoState : AbstractState {};

/// Used to map the values physically (in the IR) stored in an offload
/// array, to a vector in memory.
struct OffloadArray {};

struct OpenMPOpt {};

Kernel OpenMPOpt::getUniqueKernelFor(Function &F) {}

bool OpenMPOpt::rewriteDeviceCodeStateMachine() {}

/// Abstract Attribute for tracking ICV values.
struct AAICVTracker : public StateWrapper<BooleanState, AbstractAttribute> {};

struct AAICVTrackerFunction : public AAICVTracker {};

struct AAICVTrackerFunctionReturned : AAICVTracker {};

struct AAICVTrackerCallSite : AAICVTracker {};

struct AAICVTrackerCallSiteReturned : AAICVTracker {};

/// Determines if \p BB exits the function unconditionally itself or reaches a
/// block that does through only unique successors.
static bool hasFunctionEndAsUniqueSuccessor(const BasicBlock *BB) {}

struct AAExecutionDomainFunction : public AAExecutionDomain {};

void AAExecutionDomainFunction::mergeInPredecessorBarriersAndAssumptions(
    Attributor &A, ExecutionDomainTy &ED, const ExecutionDomainTy &PredED) {}

bool AAExecutionDomainFunction::mergeInPredecessor(
    Attributor &A, ExecutionDomainTy &ED, const ExecutionDomainTy &PredED,
    bool InitialEdgeOnly) {}

bool AAExecutionDomainFunction::handleCallees(Attributor &A,
                                              ExecutionDomainTy &EntryBBED) {}

ChangeStatus AAExecutionDomainFunction::updateImpl(Attributor &A) {}

/// Try to replace memory allocation calls called by a single thread with a
/// static buffer of shared memory.
struct AAHeapToShared : public StateWrapper<BooleanState, AbstractAttribute> {};

struct AAHeapToSharedFunction : public AAHeapToShared {};

struct AAKernelInfo : public StateWrapper<KernelInfoState, AbstractAttribute> {};

/// The function kernel info abstract attribute, basically, what can we say
/// about a function with regards to the KernelInfoState.
struct AAKernelInfoFunction : AAKernelInfo {};

/// The call site kernel info abstract attribute, basically, what can we say
/// about a call site with regards to the KernelInfoState. For now this simply
/// forwards the information from the callee.
struct AAKernelInfoCallSite : AAKernelInfo {};

struct AAFoldRuntimeCall
    : public StateWrapper<BooleanState, AbstractAttribute> {};

struct AAFoldRuntimeCallCallSiteReturned : AAFoldRuntimeCall {};

} // namespace

/// Register folding callsite
void OpenMPOpt::registerFoldRuntimeCall(RuntimeFunction RF) {}

void OpenMPOpt::registerAAs(bool IsModulePass) {}

void OpenMPOpt::registerAAsForFunction(Attributor &A, const Function &F) {}

const char AAICVTracker::ID =;
const char AAKernelInfo::ID =;
const char AAExecutionDomain::ID =;
const char AAHeapToShared::ID =;
const char AAFoldRuntimeCall::ID =;

AAICVTracker &AAICVTracker::createForPosition(const IRPosition &IRP,
                                              Attributor &A) {}

AAExecutionDomain &AAExecutionDomain::createForPosition(const IRPosition &IRP,
                                                        Attributor &A) {}

AAHeapToShared &AAHeapToShared::createForPosition(const IRPosition &IRP,
                                                  Attributor &A) {}

AAKernelInfo &AAKernelInfo::createForPosition(const IRPosition &IRP,
                                              Attributor &A) {}

AAFoldRuntimeCall &AAFoldRuntimeCall::createForPosition(const IRPosition &IRP,
                                                        Attributor &A) {}

PreservedAnalyses OpenMPOptPass::run(Module &M, ModuleAnalysisManager &AM) {}

PreservedAnalyses OpenMPOptCGSCCPass::run(LazyCallGraph::SCC &C,
                                          CGSCCAnalysisManager &AM,
                                          LazyCallGraph &CG,
                                          CGSCCUpdateResult &UR) {}

bool llvm::omp::isOpenMPKernel(Function &Fn) {}

KernelSet llvm::omp::getDeviceKernels(Module &M) {}

bool llvm::omp::containsOpenMP(Module &M) {}

bool llvm::omp::isOpenMPDevice(Module &M) {}