//===- SCFToGPU.cpp - Convert an affine loop nest to a GPU kernel -------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This implements a straightforward conversion of a loop nest into a GPU
// kernel. The caller is expected to guarantee that the conversion is correct
// or to further transform the kernel to ensure correctness.
//
//===----------------------------------------------------------------------===//

#include "mlir/Conversion/SCFToGPU/SCFToGPU.h"

#include "mlir/Conversion/AffineToStandard/AffineToStandard.h"
#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/GPU/Transforms/ParallelLoopMapper.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Dialect/SCF/IR/SCF.h"
#include "mlir/IR/AffineExpr.h"
#include "mlir/IR/Builders.h"
#include "mlir/IR/IRMapping.h"
#include "mlir/Interfaces/SideEffectInterfaces.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Transforms/DialectConversion.h"
#include "mlir/Transforms/Passes.h"
#include "mlir/Transforms/RegionUtils.h"
#include "llvm/ADT/Sequence.h"
#include "llvm/Support/Debug.h"
#include <optional>

#define DEBUG_TYPE …

using namespace mlir;
using namespace mlir::affine;
using namespace mlir::scf;

// Name of the internal attribute used to mark visited operations during the
// conversion.
//
// NOTE: The conversion originally used the following legality criterion:
//   `!parallelOp->hasAttr(gpu::getMappingAttrName())`
// But the provided pattern might reject some cases based on a more detailed
// analysis of the `mapping` attribute.
// To avoid dialect conversion failure due to a non-converted illegal
// operation, we use this extra unit attribute as a marker that the operation
// was checked by the pattern and should be considered legal in subsequent
// legality checks. The `finalizeParallelLoopToGPUConversion` function cleans
// up these extra attributes and is supposed to be called after the dialect
// conversion.
//
// TODO: Implement a cleaner solution, factoring out the "matching" logic
// from the pattern and its callees into a separate function that can be called
// from both the pattern and the op legality check.
static constexpr StringLiteral kVisitedAttrName = …;

// Extract an indexed value from KernelDim3.
static Value getDim3Value(const gpu::KernelDim3 &dim3, unsigned pos) { … }

// Get the lower bound-related operands of a loop operation.
static Operation::operand_range getLowerBoundOperands(AffineForOp forOp) { … }

// Get the upper bound-related operands of a loop operation.
static Operation::operand_range getUpperBoundOperands(AffineForOp forOp) { … }

// Get a Value that corresponds to the loop step. If the step is an attribute,
// materialize a corresponding constant using the builder.
static Value getOrCreateStep(AffineForOp forOp, OpBuilder &builder) { … }

// Get a Value for the loop lower bound. If the value requires computation,
// materialize the instructions using the builder.
static Value getOrEmitLowerBound(AffineForOp forOp, OpBuilder &builder) { … }

// Get a Value for the loop upper bound. If the value requires computation,
// materialize the instructions using the builder.
static Value getOrEmitUpperBound(AffineForOp forOp, OpBuilder &builder) { … }
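// One plausible shape for the elided bound emitters above (a sketch, assuming
// they delegate to the AffineToStandard lowering helpers included at the top
// of this file; the actual bodies are elided in this listing):
//
//   static Value getOrEmitLowerBound(AffineForOp forOp, OpBuilder &builder) {
//     // Expands the affine lower-bound map into explicit IR above the loop.
//     return lowerAffineLowerBound(forOp, builder);
//   }
//
//   static Value getOrEmitUpperBound(AffineForOp forOp, OpBuilder &builder) {
//     return lowerAffineUpperBound(forOp, builder);
//   }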
// Check the structure of the loop nest:
// - there are enough loops to map to numDims;
// - the loops are perfectly nested;
// - the loop bounds can be computed above the outermost loop.
// This roughly corresponds to the "matcher" part of the pattern-based
// rewriting infrastructure.
static LogicalResult checkAffineLoopNestMappableImpl(AffineForOp forOp,
                                                     unsigned numDims) { … }

static LogicalResult checkAffineLoopNestMappable(AffineForOp forOp,
                                                 unsigned numBlockDims,
                                                 unsigned numThreadDims) { … }

namespace {
// Helper structure that holds common state of the loop to GPU kernel
// conversion.
struct AffineLoopToGpuConverter { … };
} // namespace

// Collect ranges, bounds, steps and induction variables in preparation for
// mapping a loop nest of depth "numLoops" rooted at "forOp" to a GPU kernel.
// This may fail if the IR for computing loop bounds cannot be constructed, for
// example if an affine loop uses semi-affine maps. Return the last loop to be
// mapped on success, std::nullopt on failure.
std::optional<AffineForOp>
AffineLoopToGpuConverter::collectBounds(AffineForOp forOp,
                                        unsigned numLoops) { … }

// Replace the loop nest rooted at "rootForOp" with a GPU launch operation.
// This expects "innermostForOp" to point to the last loop to be transformed
// to the kernel, and to have (numBlockDims + numThreadDims) perfectly nested
// loops between "rootForOp" and "innermostForOp".
void AffineLoopToGpuConverter::createLaunch(AffineForOp rootForOp,
                                            AffineForOp innermostForOp,
                                            unsigned numBlockDims,
                                            unsigned numThreadDims) { … }

// Generic loop to GPU kernel conversion function.
static LogicalResult convertAffineLoopNestToGPULaunch(
    AffineForOp forOp, unsigned numBlockDims, unsigned numThreadDims) { … }

LogicalResult mlir::convertAffineLoopNestToGPULaunch(AffineForOp forOp,
                                                     unsigned numBlockDims,
                                                     unsigned numThreadDims) { … }

namespace {
struct ParallelToGpuLaunchLowering : public OpRewritePattern<ParallelOp> { … };
} // namespace

/// Tries to derive a static upper bound from the defining operation of
/// `upperBound`.
static Value deriveStaticUpperBound(Value upperBound,
                                    PatternRewriter &rewriter) { … }

static bool isMappedToProcessor(gpu::Processor processor) { … }

static unsigned getLaunchOpArgumentNum(gpu::Processor processor) { … }
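// For orientation, a hedged sketch of the processor-to-launch-operand
// correspondence the two helpers above encode (an assumption about the elided
// bodies, based on the documented gpu::Processor encoding below, where
// BlockX..ThreadZ occupy values 0-5 and Sequential is 6): BlockX/Y/Z select
// the three grid-size operands of `gpu.launch` and ThreadX/Y/Z the three
// block-size operands; Sequential has no hardware-id operand.
//
//   static unsigned getLaunchOpArgumentNum(gpu::Processor processor) {
//     if (isMappedToProcessor(processor))
//       return static_cast<unsigned>(processor); // BlockX..ThreadZ -> 0..5.
//     llvm_unreachable("invalid processor");
//   }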
/// Modifies the current transformation state to capture the effect of the
/// given `scf.parallel` operation on index substitutions and the operations
/// to be inserted.
/// Specifically, if a dimension of a parallel loop is mapped to a hardware id,
/// this function will
/// - compute the loop index based on the hardware id and affine map from the
///   mapping and update `cloningMap` to substitute all uses.
/// - derive a new upper bound for the hardware id and augment the provided
///   `gpu.launch` operation accordingly.
/// - if the upper bound is imprecise, insert a conditional in the `gpu.launch`
///   and update the rewriter to insert into the conditional's body.
/// If the dimension is mapped to sequential,
/// - insert a for loop into the body and update the rewriter to insert into
///   the for loop's body.
/// - update the `cloningMap` to replace uses of the index with the index of
///   the new for loop.
/// In either case,
/// - append the instructions from the loop's body to the worklist, in reverse
///   order.
/// To note the end of the current scope in case a loop or conditional was
/// inserted, a sentinel (the `gpu.launch` operation) is inserted into the
/// worklist. This signals the processor of the worklist to pop the rewriter
/// one scope-level up.
static LogicalResult processParallelLoop(
    ParallelOp parallelOp, gpu::LaunchOp launchOp, IRMapping &cloningMap,
    SmallVectorImpl<Operation *> &worklist,
    DenseMap<gpu::Processor, Value> &bounds, PatternRewriter &rewriter) { … }

/// Lower a `scf.parallel` operation into a corresponding `gpu.launch`
/// operation.
///
/// This essentially transforms a loop nest into a corresponding SIMT function.
/// The conversion is driven by mapping annotations on the `scf.parallel`
/// operations. The mapping is provided via a `DictionaryAttribute` named
/// `mapping`, which has three entries:
/// - processor: the hardware id to map to. 0-2 are block dimensions, 3-5 are
///   thread dimensions and 6 is sequential.
/// - map : An affine map that is used to pre-process hardware ids before
///   substitution.
/// - bound : An affine map that is used to compute the bound of the hardware
///   id based on an upper bound of the number of iterations.
/// If the `scf.parallel` contains nested `scf.parallel` operations, those
/// need to be annotated as well. Structurally, the transformation works by
/// splicing all operations from nested `scf.parallel` operations into a single
/// sequence. Indices mapped to hardware ids are substituted with those ids,
/// whereas sequential mappings result in a sequential for-loop. To have more
/// flexibility when mapping code to hardware ids, the transform supports two
/// affine maps. The first `map` is used to compute the actual index for
/// substitution from the hardware id. The second `bound` is used to compute
/// the launch dimension for the hardware id from the number of iterations the
/// mapped loop is performing. Note that the number of iterations might be
/// imprecise if the corresponding loop-bounds are loop-dependent. In such
/// cases, the hardware id might iterate over additional indices. The
/// transformation caters for this by predicating the created sequence of
/// instructions on the actual loop bound. This only works if a static upper
/// bound for the dynamic loop bound can be derived, currently via analyzing
/// `affine.min` operations.
LogicalResult
ParallelToGpuLaunchLowering::matchAndRewrite(ParallelOp parallelOp,
                                             PatternRewriter &rewriter) const { … }

void mlir::populateParallelLoopToGPUPatterns(RewritePatternSet &patterns) { … }

void mlir::configureParallelLoopToGPULegality(ConversionTarget &target) { … }

void mlir::finalizeParallelLoopToGPUConversion(Operation *op) { … }
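// For reference, a minimal sketch of how a client pass might drive this
// conversion (a hypothetical pass, not part of this file), following the
// populate/configure/finalize contract described above: register the pattern,
// mark inspected ops legal via the kVisitedAttrName marker, run a partial
// dialect conversion, and strip the markers afterwards.
//
//   struct HypotheticalParallelLoopToGpuPass
//       : public PassWrapper<HypotheticalParallelLoopToGpuPass,
//                            OperationPass<>> {
//     void runOnOperation() override {
//       RewritePatternSet patterns(&getContext());
//       populateParallelLoopToGPUPatterns(patterns);
//       ConversionTarget target(getContext());
//       // Ops the legality callback does not reject remain legal.
//       target.markUnknownOpDynamicallyLegal(
//           [](Operation *) { return true; });
//       configureParallelLoopToGPULegality(target);
//       if (failed(applyPartialConversion(getOperation(), target,
//                                         std::move(patterns))))
//         signalPassFailure();
//       // Remove the kVisitedAttrName markers left behind by the pattern.
//       finalizeParallelLoopToGPUConversion(getOperation());
//     }
//   };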