//===---- CGOpenMPRuntimeGPU.cpp - Interface to OpenMP GPU Runtimes ----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This provides a generalized class for OpenMP runtime code generation
// specialized by GPU targets NVPTX and AMDGCN.
//
//===----------------------------------------------------------------------===//

#include "CGOpenMPRuntimeGPU.h"
#include "CodeGenFunction.h"
#include "clang/AST/Attr.h"
#include "clang/AST/DeclOpenMP.h"
#include "clang/AST/OpenMPClause.h"
#include "clang/AST/StmtOpenMP.h"
#include "clang/AST/StmtVisitor.h"
#include "clang/Basic/Cuda.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/Frontend/OpenMP/OMPGridValues.h"
#include "llvm/Support/MathExtras.h"

using namespace clang;
using namespace CodeGen;
using namespace llvm::omp;

namespace {

/// Pre(post)-action for different OpenMP constructs specialized for NVPTX.
class NVPTXActionTy final : public PrePostActionTy { … };

/// A class to track the execution mode when codegening directives within
/// a target region. The appropriate mode (SPMD|NON-SPMD) is set on entry
/// to the target region and used by containing directives such as 'parallel'
/// to emit optimized code.
class ExecutionRuntimeModesRAII { … };

static const ValueDecl *getPrivateItem(const Expr *RefExpr) { … }

static RecordDecl *buildRecordForGlobalizedVars(
    ASTContext &C, ArrayRef<const ValueDecl *> EscapedDecls,
    ArrayRef<const ValueDecl *> EscapedDeclsForTeams,
    llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> &MappedDeclsFields,
    int BufSize) { … }

/// Get the list of variables that can escape their declaration context.
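/// As a rough illustration (simplified; the precise rules live in the visitor
/// below), a variable escapes when code that may execute on other threads
/// refers to it, e.g.:
///
///   int x = 0;
///   #pragma omp parallel
///   { x += 1; }   // 'x' is referenced from inside the parallel region, so it
///                 // escapes its declaration context and must be globalized
///                 // rather than kept in thread-private stack storage.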
class CheckVarsEscapingDeclContext final
    : public ConstStmtVisitor<CheckVarsEscapingDeclContext> { … };

} // anonymous namespace

CGOpenMPRuntimeGPU::ExecutionMode CGOpenMPRuntimeGPU::getExecutionMode() const { … }

CGOpenMPRuntimeGPU::DataSharingMode CGOpenMPRuntimeGPU::getDataSharingMode() const { … }

/// Check for inner (nested) SPMD construct, if any.
static bool hasNestedSPMDDirective(ASTContext &Ctx, const OMPExecutableDirective &D) { … }

static bool supportsSPMDExecutionMode(ASTContext &Ctx, const OMPExecutableDirective &D) { … }

void CGOpenMPRuntimeGPU::emitNonSPMDKernel(const OMPExecutableDirective &D, StringRef ParentName, llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID, bool IsOffloadEntry, const RegionCodeGenTy &CodeGen) { … }

void CGOpenMPRuntimeGPU::emitKernelInit(const OMPExecutableDirective &D, CodeGenFunction &CGF, EntryFunctionState &EST, bool IsSPMD) { … }

void CGOpenMPRuntimeGPU::emitKernelDeinit(CodeGenFunction &CGF, EntryFunctionState &EST, bool IsSPMD) { … }

void CGOpenMPRuntimeGPU::emitSPMDKernel(const OMPExecutableDirective &D, StringRef ParentName, llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID, bool IsOffloadEntry, const RegionCodeGenTy &CodeGen) { … }

void CGOpenMPRuntimeGPU::emitTargetOutlinedFunction(const OMPExecutableDirective &D, StringRef ParentName, llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID, bool IsOffloadEntry, const RegionCodeGenTy &CodeGen) { … }

CGOpenMPRuntimeGPU::CGOpenMPRuntimeGPU(CodeGenModule &CGM) : … { … }

void CGOpenMPRuntimeGPU::emitProcBindClause(CodeGenFunction &CGF, ProcBindKind ProcBind, SourceLocation Loc) { … }

void CGOpenMPRuntimeGPU::emitNumThreadsClause(CodeGenFunction &CGF, llvm::Value *NumThreads, SourceLocation Loc) { … }

void CGOpenMPRuntimeGPU::emitNumTeamsClause(CodeGenFunction &CGF, const Expr *NumTeams, const Expr *ThreadLimit, SourceLocation Loc) { … }

llvm::Function *CGOpenMPRuntimeGPU::emitParallelOutlinedFunction(CodeGenFunction &CGF, const OMPExecutableDirective &D, const VarDecl *ThreadIDVar, OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) { … }

/// Get list of lastprivate variables from the teams distribute ... or
/// teams {distribute ...} directives.
static void getDistributeLastprivateVars(ASTContext &Ctx, const OMPExecutableDirective &D, llvm::SmallVectorImpl<const ValueDecl *> &Vars) { … }

/// Get list of reduction variables from the teams ... directives.
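/// For instance (an illustrative case only), given
///
///   #pragma omp target teams reduction(+:sum)
///
/// 'sum' would be reported as a teams reduction variable.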
static void getTeamsReductionVars(ASTContext &Ctx, const OMPExecutableDirective &D, llvm::SmallVectorImpl<const ValueDecl *> &Vars) { … }

llvm::Function *CGOpenMPRuntimeGPU::emitTeamsOutlinedFunction(CodeGenFunction &CGF, const OMPExecutableDirective &D, const VarDecl *ThreadIDVar, OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) { … }

void CGOpenMPRuntimeGPU::emitGenericVarsProlog(CodeGenFunction &CGF, SourceLocation Loc) { … }

bool CGOpenMPRuntimeGPU::isDelayedVariableLengthDecl(CodeGenFunction &CGF, const VarDecl *VD) const { … }

std::pair<llvm::Value *, llvm::Value *> CGOpenMPRuntimeGPU::getKmpcAllocShared(CodeGenFunction &CGF, const VarDecl *VD) { … }

void CGOpenMPRuntimeGPU::getKmpcFreeShared(CodeGenFunction &CGF, const std::pair<llvm::Value *, llvm::Value *> &AddrSizePair) { … }

void CGOpenMPRuntimeGPU::emitGenericVarsEpilog(CodeGenFunction &CGF) { … }

void CGOpenMPRuntimeGPU::emitTeamsCall(CodeGenFunction &CGF, const OMPExecutableDirective &D, SourceLocation Loc, llvm::Function *OutlinedFn, ArrayRef<llvm::Value *> CapturedVars) { … }

void CGOpenMPRuntimeGPU::emitParallelCall(CodeGenFunction &CGF, SourceLocation Loc, llvm::Function *OutlinedFn, ArrayRef<llvm::Value *> CapturedVars, const Expr *IfCond, llvm::Value *NumThreads) { … }

void CGOpenMPRuntimeGPU::syncCTAThreads(CodeGenFunction &CGF) { … }

void CGOpenMPRuntimeGPU::emitBarrierCall(CodeGenFunction &CGF, SourceLocation Loc, OpenMPDirectiveKind Kind, bool, bool) { … }

void CGOpenMPRuntimeGPU::emitCriticalRegion(CodeGenFunction &CGF, StringRef CriticalName, const RegionCodeGenTy &CriticalOpGen, SourceLocation Loc, const Expr *Hint) { … }

/// Cast value to the specified type.
static llvm::Value *castValueToType(CodeGenFunction &CGF, llvm::Value *Val, QualType ValTy, QualType CastTy, SourceLocation Loc) { … }

///
/// Design of OpenMP reductions on the GPU
///
/// Consider a typical OpenMP program with one or more reduction
/// clauses:
///
/// float foo;
/// double bar;
/// #pragma omp target teams distribute parallel for \
///             reduction(+:foo) reduction(*:bar)
/// for (int i = 0; i < N; i++) {
///   foo += A[i]; bar *= B[i];
/// }
///
/// where 'foo' and 'bar' are reduced across all OpenMP threads in
/// all teams. In our OpenMP implementation on the NVPTX device an
/// OpenMP team is mapped to a CUDA threadblock and OpenMP threads
/// within a team are mapped to CUDA threads within a threadblock.
/// Our goal is to efficiently aggregate values across all OpenMP
/// threads such that:
///
///   - the compiler and runtime are logically concise, and
///   - the reduction is performed efficiently in a hierarchical
///     manner as follows: within OpenMP threads in the same warp,
///     across warps in a threadblock, and finally across teams on
///     the NVPTX device.
///
/// Introduction to Decoupling
///
/// We would like to decouple the compiler and the runtime so that the
/// latter is ignorant of the reduction variables (number, data types)
/// and the reduction operators. This allows a simpler interface
/// and implementation while still attaining good performance.
///
/// Pseudocode for the aforementioned OpenMP program generated by the
/// compiler is as follows:
///
/// 1. Create private copies of reduction variables on each OpenMP
///    thread: 'foo_private', 'bar_private'
/// 2. Each OpenMP thread reduces the chunk of 'A' and 'B' assigned
///    to it and writes the result in 'foo_private' and 'bar_private'
///    respectively.
/// 3. Call the OpenMP runtime on the GPU to reduce within a team
///    and store the result on the team master:
///
///     __kmpc_nvptx_parallel_reduce_nowait_v2(...,
///        reduceData, shuffleReduceFn, interWarpCpyFn)
///
///     where:
///       struct ReduceData {
///         float *foo;
///         double *bar;
///       } reduceData
///       reduceData.foo = &foo_private
///       reduceData.bar = &bar_private
///
///     'shuffleReduceFn' and 'interWarpCpyFn' are pointers to two
///     auxiliary functions generated by the compiler that operate on
///     variables of type 'ReduceData'. They help the runtime perform
///     the algorithmic steps in a data-agnostic manner.
///
///     'shuffleReduceFn' is a pointer to a function that reduces data
///     of type 'ReduceData' across two OpenMP threads (lanes) in the
///     same warp. It takes the following arguments as input:
///
///     a. variable of type 'ReduceData' on the calling lane,
///     b. its lane_id,
///     c. an offset relative to the current lane_id to generate a
///        remote_lane_id. The remote lane contains the second
///        variable of type 'ReduceData' that is to be reduced.
///     d. an algorithm version parameter determining which reduction
///        algorithm to use.
///
///     'shuffleReduceFn' retrieves data from the remote lane using
///     efficient GPU shuffle intrinsics and reduces, using the
///     algorithm specified by the 4th parameter, the two operands
///     element-wise. The result is written to the first operand.
///
///     Different reduction algorithms are implemented in different
///     runtime functions, all calling 'shuffleReduceFn' to perform
///     the essential reduction step. Therefore, based on the 4th
///     parameter, this function behaves slightly differently to
///     cooperate with the runtime to ensure correctness under
///     different circumstances.
///
///     'InterWarpCpyFn' is a pointer to a function that transfers
///     reduced variables across warps. It tunnels, through CUDA
///     shared memory, the thread-private data of type 'ReduceData'
///     from lane 0 of each warp to a lane in the first warp.
/// 4. Call the OpenMP runtime on the GPU to reduce across teams.
///    The last team writes the global reduced value to memory.
///
///     ret = __kmpc_nvptx_teams_reduce_nowait(...,
///             reduceData, shuffleReduceFn, interWarpCpyFn,
///             scratchpadCopyFn, loadAndReduceFn)
///
///     'scratchpadCopyFn' is a helper that stores reduced
///     data from the team master to a scratchpad array in
///     global memory.
///
///     'loadAndReduceFn' is a helper that loads data from
///     the scratchpad array and reduces it with the input
///     operand.
///
///     These compiler generated functions hide address
///     calculation and alignment information from the runtime.
/// 5. if ret == 1:
///      The team master of the last team stores the reduced
///      result to the globals in memory.
///      foo += reduceData.foo; bar *= reduceData.bar
///
///
/// Warp Reduction Algorithms
///
/// On the warp level, we have three algorithms implemented in the
/// OpenMP runtime depending on the number of active lanes:
///
/// Full Warp Reduction
///
/// The reduce algorithm within a warp where all lanes are active
/// is implemented in the runtime as follows:
///
/// full_warp_reduce(void *reduce_data,
///                  kmp_ShuffleReductFctPtr ShuffleReduceFn) {
///   for (int offset = WARPSIZE/2; offset > 0; offset /= 2)
///     ShuffleReduceFn(reduce_data, 0, offset, 0);
/// }
///
/// The algorithm completes in log(2, WARPSIZE) steps.
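/// For example, with WARPSIZE == 32 the loop above makes five calls to
/// 'ShuffleReduceFn', with offsets 16, 8, 4, 2 and 1; after the last call
/// lane 0 holds the reduction of all 32 lanes.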
///
/// 'ShuffleReduceFn' is used here with lane_id set to 0 because the lane_id
/// is not used by this algorithm; this saves instructions, as lane_id need
/// not be retrieved from the corresponding special registers. The 4th
/// parameter, which represents the version of the algorithm being used, is
/// set to 0 to signify full warp reduction.
///
/// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
///
/// #reduce_elem refers to an element in the local lane's data structure
/// #remote_elem is retrieved from a remote lane
/// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
/// reduce_elem = reduce_elem REDUCE_OP remote_elem;
///
/// Contiguous Partial Warp Reduction
///
/// This reduce algorithm is used within a warp where only the first
/// 'n' (n <= WARPSIZE) lanes are active. It is typically used when the
/// number of OpenMP threads in a parallel region is not a multiple of
/// WARPSIZE. The algorithm is implemented in the runtime as follows:
///
/// void
/// contiguous_partial_reduce(void *reduce_data,
///                           kmp_ShuffleReductFctPtr ShuffleReduceFn,
///                           int size, int lane_id) {
///   int curr_size;
///   int offset;
///   curr_size = size;
///   offset = curr_size/2;
///   while (offset > 0) {
///     ShuffleReduceFn(reduce_data, lane_id, offset, 1);
///     curr_size = (curr_size+1)/2;
///     offset = curr_size/2;
///   }
/// }
///
/// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
///
/// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
/// if (lane_id < offset)
///   reduce_elem = reduce_elem REDUCE_OP remote_elem
/// else
///   reduce_elem = remote_elem
///
/// This algorithm assumes that the data to be reduced are located in a
/// contiguous subset of lanes starting from the first. When there is
/// an odd number of active lanes, the data in the last lane is not
/// aggregated with any other lane's data but is instead copied over.
///
/// Dispersed Partial Warp Reduction
///
/// This algorithm is used within a warp when any discontiguous subset of
/// lanes is active. It is used to implement the reduction operation
/// across lanes in an OpenMP simd region or in a nested parallel region.
///
/// void
/// dispersed_partial_reduce(void *reduce_data,
///                          kmp_ShuffleReductFctPtr ShuffleReduceFn) {
///   int size, remote_id;
///   int logical_lane_id = number_of_active_lanes_before_me() * 2;
///   do {
///     remote_id = next_active_lane_id_right_after_me();
///     # the above function returns 0 if no active lane
///     # is present right after the current lane.
///     size = number_of_active_lanes_in_this_warp();
///     logical_lane_id /= 2;
///     ShuffleReduceFn(reduce_data, logical_lane_id,
///                     remote_id-1-threadIdx.x, 2);
///   } while (logical_lane_id % 2 == 0 && size > 1);
/// }
///
/// There is no assumption made about the initial state of the reduction.
/// Any number of lanes (>=1) could be active at any position. The reduction
/// result is returned in the first active lane.
///
/// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
///
/// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
/// if (lane_id % 2 == 0 && offset > 0)
///   reduce_elem = reduce_elem REDUCE_OP remote_elem
/// else
///   reduce_elem = remote_elem
///
///
/// Intra-Team Reduction
///
/// This function, as implemented in the runtime call
/// '__kmpc_nvptx_parallel_reduce_nowait_v2', aggregates data across OpenMP
/// threads in a team. It first reduces within a warp using the
/// aforementioned algorithms. We then proceed to gather all such
/// reduced values at the first warp.
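/// Schematically (an illustrative sketch, not the literal runtime code;
/// 'warp_reduce', 'warp_id' and 'num_active_warps' are placeholder names),
/// the intra-team step looks like:
///
///   // 1. Each warp reduces its own lanes with one of the three algorithms
///   //    above; the warp-reduced value ends up on the warp's lane 0.
///   warp_reduce(reduce_data, ShuffleReduceFn);
///   // 2. Lane 0 of every warp copies its value into the first warp through
///   //    CUDA shared memory.
///   InterWarpCpyFn(reduce_data, num_active_warps);
///   // 3. The first warp reduces the gathered per-warp values, again using
///   //    ShuffleReduceFn.
///   if (warp_id == 0)
///     warp_reduce(reduce_data, ShuffleReduceFn);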
///
/// The runtime makes use of the function 'InterWarpCpyFn', which copies
/// data from each of the "warp masters" (the zeroth lane of each warp, where
/// warp-reduced data is held) to the zeroth warp. This step reduces (in
/// a mathematical sense) the problem of reduction across warp masters in
/// a block to the problem of warp reduction.
///
///
/// Inter-Team Reduction
///
/// Once a team has reduced its data to a single value, it is stored in
/// a global scratchpad array. Since each team has a distinct slot, this
/// can be done without locking.
///
/// The last team to write to the scratchpad array proceeds to reduce the
/// scratchpad array. One or more workers in the last team use the helper
/// 'loadAndReduceFn' to load and reduce values from the array, i.e.,
/// the k'th worker reduces every k'th element.
///
/// Finally, a call is made to '__kmpc_nvptx_parallel_reduce_nowait_v2' to
/// reduce across workers and compute a globally reduced value.
///
void CGOpenMPRuntimeGPU::emitReduction(CodeGenFunction &CGF, SourceLocation Loc, ArrayRef<const Expr *> Privates, ArrayRef<const Expr *> LHSExprs, ArrayRef<const Expr *> RHSExprs, ArrayRef<const Expr *> ReductionOps, ReductionOptionsTy Options) { … }

const VarDecl *CGOpenMPRuntimeGPU::translateParameter(const FieldDecl *FD, const VarDecl *NativeParam) const { … }

Address CGOpenMPRuntimeGPU::getParameterAddress(CodeGenFunction &CGF, const VarDecl *NativeParam, const VarDecl *TargetParam) const { … }

void CGOpenMPRuntimeGPU::emitOutlinedFunctionCall(CodeGenFunction &CGF, SourceLocation Loc, llvm::FunctionCallee OutlinedFn, ArrayRef<llvm::Value *> Args) const { … }

/// Emit a function that wraps the outlined parallel region
/// and controls the arguments which are passed to this function.
/// The wrapper ensures that the outlined function is called
/// with the correct arguments when data is shared.
llvm::Function *CGOpenMPRuntimeGPU::createParallelDataSharingWrapper(llvm::Function *OutlinedParallelFn, const OMPExecutableDirective &D) { … }

void CGOpenMPRuntimeGPU::emitFunctionProlog(CodeGenFunction &CGF, const Decl *D) { … }

Address CGOpenMPRuntimeGPU::getAddressOfLocalVariable(CodeGenFunction &CGF, const VarDecl *VD) { … }

void CGOpenMPRuntimeGPU::functionFinished(CodeGenFunction &CGF) { … }

void CGOpenMPRuntimeGPU::getDefaultDistScheduleAndChunk(CodeGenFunction &CGF, const OMPLoopDirective &S, OpenMPDistScheduleClauseKind &ScheduleKind, llvm::Value *&Chunk) const { … }

void CGOpenMPRuntimeGPU::getDefaultScheduleAndChunk(CodeGenFunction &CGF, const OMPLoopDirective &S, OpenMPScheduleClauseKind &ScheduleKind, const Expr *&ChunkExpr) const { … }

void CGOpenMPRuntimeGPU::adjustTargetSpecificDataForLambdas(CodeGenFunction &CGF, const OMPExecutableDirective &D) const { … }

bool CGOpenMPRuntimeGPU::hasAllocateAttributeForGlobalVar(const VarDecl *VD, LangAS &AS) { … }

// Get the current OffloadArch and ignore any unknown values.
static OffloadArch getOffloadArch(CodeGenModule &CGM) { … }

/// Check whether the target architecture supports unified addressing, which
/// is a restriction for the OpenMP requires clause "unified_shared_memory".
void CGOpenMPRuntimeGPU::processRequiresDirective(const OMPRequiresDecl *D) { … }

llvm::Value *CGOpenMPRuntimeGPU::getGPUNumThreads(CodeGenFunction &CGF) { … }

llvm::Value *CGOpenMPRuntimeGPU::getGPUThreadID(CodeGenFunction &CGF) { … }