//===- SparseGPUCodegen.cpp - Generates GPU code --------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This is a prototype GPU code generator for the sparsifier.
// The objective is to eventually use the right combination of
// direct code generation and library calls into vendor-specific
// highly optimized sparse libraries (e.g. cuSparse for CUDA).
//
//===----------------------------------------------------------------------===//

#include "Utils/CodegenUtils.h"
#include "Utils/LoopEmitter.h"

#include "mlir/Dialect/Bufferization/IR/Bufferization.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/Linalg/IR/Linalg.h"
#include "mlir/Dialect/Linalg/Utils/Utils.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Dialect/SCF/IR/SCF.h"
#include "mlir/Dialect/SparseTensor/IR/SparseTensor.h"
#include "mlir/Dialect/SparseTensor/IR/SparseTensorType.h"
#include "mlir/Dialect/SparseTensor/Transforms/Passes.h"
#include "mlir/IR/IRMapping.h"
#include "mlir/IR/Matchers.h"

using namespace mlir;
using namespace mlir::sparse_tensor;

namespace {

/// Sparse formats supported by cuSparse.
enum class CuSparseFormat { … };

//===----------------------------------------------------------------------===//
// Helper methods.
//===----------------------------------------------------------------------===//

/// Marks the given top module as a GPU container module.
static void markAsGPUContainer(ModuleOp topModule) { … }

/// Constructs a new GPU module (for GPU kernels) inside the given top module,
/// or returns an existing GPU module if one was built previously.
static gpu::GPUModuleOp genGPUModule(OpBuilder &builder, ModuleOp topModule) { … }

/// Constructs a new GPU kernel in the given GPU module.
static gpu::GPUFuncOp genGPUFunc(OpBuilder &builder, gpu::GPUModuleOp gpuModule,
                                 SmallVectorImpl<Value> &args) { … }

/// Constructs code to launch GPU kernel.
static Value genLaunchGPUFunc(OpBuilder &builder, gpu::GPUFuncOp gpuFunc,
                              SmallVectorImpl<Value> &args,
                              SmallVectorImpl<Value> &tokens,
                              unsigned numThreads) { … }
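
// For illustration only: a rough sketch of the IR shape produced by the
// outlining helpers above (module/kernel names, signatures, and launch
// dimensions are made up here; the actual values are computed by
// genGPUModule, genGPUFunc, and genLaunchGPUFunc):
//
//   gpu.module @sparse_kernels {
//     gpu.func @kernel0(%arg0: index, %arg1: memref<?xf64>) kernel {
//       ...
//       gpu.return
//     }
//   }
//   ...
//   %t1 = gpu.launch_func async [%t0] @sparse_kernels::@kernel0
//             blocks in (%gridX, %c1, %c1) threads in (%blockX, %c1, %c1)
//             args(%sz : index, %buf : memref<?xf64>)
//
// The top-level module must carry the gpu.container_module unit attribute
// (see markAsGPUContainer) for such launches to verify.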

/// Maps the provided ranked host buffer into the device address space.
/// Writes from the host are guaranteed to be visible to device kernels
/// that are launched afterwards. Writes from the device are guaranteed
/// to be visible on the host after synchronizing with the device kernel
/// completion. Needs to cast the buffer to an unranked buffer.
static Value genHostRegisterMemref(OpBuilder &builder, Location loc,
                                   Value mem) { … }

/// Unmaps the provided buffer, expecting the casted buffer.
static void genHostUnregisterMemref(OpBuilder &builder, Location loc,
                                    Value cast) { … }

/// Generates first wait in an asynchronous chain.
static Value genFirstWait(OpBuilder &builder, Location loc) { … }

/// Generates last, blocking wait in an asynchronous chain.
static void genBlockingWait(OpBuilder &builder, Location loc,
                            ValueRange operands) { … }

/// Allocates memory on the device.
/// TODO: A `host_shared` attribute could be used to indicate that
/// the buffer is visible to both host and device, but lowering
/// that feature does not seem to be fully supported yet.
static gpu::AllocOp genAllocMemRef(OpBuilder &builder, Location loc, Value mem,
                                   Value token) { … }

/// Allocates a typed buffer on the host with given size.
static Value genHostBuffer(OpBuilder &builder, Location loc, Type type,
                           Value size) { … }

/// Allocates a typed buffer on the device with given size.
static gpu::AllocOp genAllocBuffer(OpBuilder &builder, Location loc, Type type,
                                   Value size, Value token) { … }

/// Allocates a void buffer on the device with given size.
static gpu::AllocOp genAllocBuffer(OpBuilder &builder, Location loc, Value size,
                                   Value token) { … }

/// Deallocates memory from the device.
static Value genDeallocMemRef(OpBuilder &builder, Location loc, Value mem,
                              Value token) { … }

/// Copies memory between host and device (direction is implicit).
static Value genCopyMemRef(OpBuilder &builder, Location loc, Value dst,
                           Value src, Value token) { … }

/// Generates an alloc/copy pair.
static Value genAllocCopy(OpBuilder &builder, Location loc, Value b,
                          SmallVectorImpl<Value> &tokens) { … }

/// Generates a memref from tensor operation.
static Value genTensorToMemref(PatternRewriter &rewriter, Location loc,
                               Value tensor) { … }

/// Prepares the outlined arguments, passing scalars and buffers in. Here we
/// assume that the first buffer is the one allocated for output. We create
/// a set of properly chained asynchronous allocation/copy pairs to increase
/// overlap before launching the kernel.
static Value genParametersIn(OpBuilder &builder, Location loc,
                             SmallVectorImpl<Value> &scalars,
                             SmallVectorImpl<Value> &buffers,
                             SmallVectorImpl<Value> &args,
                             SmallVectorImpl<Value> &tokens,
                             bool useHostRegistrationForOut) { … }

/// Finalizes the outlined arguments. The output buffer is copied depending
/// on the kernel token and then deallocated. All other buffers are simply
/// deallocated. Then we wait for all operations to complete.
static void genParametersOut(OpBuilder &builder, Location loc, Value out,
                             Value kernelToken, SmallVectorImpl<Value> &scalars,
                             SmallVectorImpl<Value> &buffers,
                             SmallVectorImpl<Value> &args,
                             SmallVectorImpl<Value> &tokens) { … }

/// Constructs code for new GPU kernel.
static void genGPUCode(PatternRewriter &rewriter, gpu::GPUFuncOp gpuFunc,
                       scf::ParallelOp forallOp,
                       SmallVectorImpl<Value> &constants,
                       SmallVectorImpl<Value> &scalars,
                       SmallVectorImpl<Value> &buffers) { … }
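
// For illustration only: a simplified sketch of the asynchronous token chain
// built from the data movement helpers above when staging a buffer to the
// device, running a kernel, and copying the result back (value names are made
// up; the actual chains are assembled per argument by genParametersIn and
// genParametersOut):
//
//   %t0 = gpu.wait async                                      // genFirstWait
//   %d_buf, %t1 = gpu.alloc async [%t0] (%n) : memref<?xf64>  // genAllocMemRef
//   %t2 = gpu.memcpy async [%t1] %d_buf, %h_buf
//            : memref<?xf64>, memref<?xf64>                   // genCopyMemRef
//   ... kernel launch consumes %t2 and yields %t3 ...
//   %t4 = gpu.memcpy async [%t3] %h_buf, %d_buf
//            : memref<?xf64>, memref<?xf64>                   // copy result back
//   %t5 = gpu.dealloc async [%t4] %d_buf : memref<?xf64>      // genDeallocMemRef
//   gpu.wait [%t5]                                            // genBlockingWait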

//===----------------------------------------------------------------------===//
// Library helper methods.
//===----------------------------------------------------------------------===//

/// Helper to detect a + b with arguments taken from given block.
static bool matchAddOfArgs(Block *block, Value val) { … }

/// Helper to detect a * b with arguments taken from given block.
static bool matchMulOfArgs(Block *block, Value val) { … }

/// Helper to detect x = x + a * b.
static bool matchSumOfMultOfArgs(linalg::GenericOp op) { … }

/// Helper to detect c += spy(s) x (a * b).
static bool matchSumReductionOfMulUnary(linalg::GenericOp op) { … }

/// Test for dense tensor.
static bool isDenseTensor(Value v) { … }

/// Test for suitable positions/coordinates width.
static bool isAdmissibleMetaData(SparseTensorType &aTp) { … }

/// Test for sorted COO matrix with suitable metadata.
static bool isAdmissibleCOO(SparseTensorType &aTp) { … }

/// Test for CSR matrix with suitable metadata.
static bool isAdmissibleCSR(SparseTensorType &aTp) { … }

/// Test for CSC matrix with suitable metadata.
static bool isAdmissibleCSC(SparseTensorType &aTp) { … }

/// Test for BSR matrix with suitable metadata.
static bool isAdmissibleBSR(SparseTensorType &aTp) { … }

/// Test for 2:4 matrix with suitable metadata.
static bool isAdmissible24(SparseTensorType &aTp) { … }

/// Test for conversion into 2:4 matrix.
static bool isConversionInto24(Value v) { … }

/// Returns a suitable sparse format for the operation and given operand
/// types with cuSparse, or kNone if none is available.
static CuSparseFormat getCuSparseFormat(SparseTensorType aTp,
                                        SparseTensorType bTp,
                                        SparseTensorType cTp, bool enableRT,
                                        bool isMatVec) { … }

/// Generates the first positions/coordinates of a sparse matrix.
static Value genFirstPosOrCrds(OpBuilder &builder, Location loc, Value a,
                               CuSparseFormat format, bool enableRT) { … }

/// Generates the second coordinates of a sparse matrix.
static Value genSecondCrds(OpBuilder &builder, Location loc, Value a,
                           CuSparseFormat format, bool enableRT) { … }

/// Generates the sparse matrix handle.
static Operation *genSpMat(OpBuilder &builder, Location loc,
                           SparseTensorType &aTp, Type handleTp, Type tokenTp,
                           Value token, Value sz1, Value sz2, Value nseA,
                           Value rowA, Value colA, Value valA,
                           CuSparseFormat format, bool enableRT) { … }

/// Match and rewrite SpMV kernel.
static LogicalResult rewriteSpMV(PatternRewriter &rewriter,
                                 linalg::GenericOp op, bool enableRT) { … }

/// Match and rewrite SpMM kernel.
static LogicalResult rewriteSpMM(PatternRewriter &rewriter,
                                 linalg::GenericOp op, bool enableRT) { … }

/// Match and rewrite SpGEMM kernel.
static LogicalResult rewriteSpGEMM(PatternRewriter &rewriter,
                                   linalg::GenericOp op, bool enableRT) { … }

/// Match and rewrite 2:4 SpMM kernel.
static LogicalResult rewrite2To4SpMM(PatternRewriter &rewriter,
                                     linalg::GenericOp op) { … }

/// Match and rewrite SDDMM kernel.
static LogicalResult rewriteSDDMM(PatternRewriter &rewriter,
                                  linalg::GenericOp op, bool enableRT) { … }

//===----------------------------------------------------------------------===//
// Rewriting rules for direct code generation.
//===----------------------------------------------------------------------===//

/// Proof-of-concept rewriter. This rule generates a GPU implementation
/// for each outermost forall loop generated by the sparsifier.
/// TODO: right now works with parallelization-strategy=dense-outer-loop
///       but give this its own flags in the future
struct ForallRewriter : public OpRewritePattern<scf::ParallelOp> { … };

//===----------------------------------------------------------------------===//
// Rewriting rules for library recognition and code generation.
//===----------------------------------------------------------------------===//

/// Proof-of-concept rewriter. This rule recognizes certain math kernels
/// and replaces these with corresponding calls into a sparse library.
struct LinalgOpRewriter : public OpRewritePattern<linalg::GenericOp> { … };

} // namespace

//===----------------------------------------------------------------------===//
// Public methods for populating GPU rewriting rules.
//
// Currently two sets of rewriting rules are made available. The first set
// implements direct code generation, currently by means of converting the
// outermost parallel loop into GPU threads. The second set implements
// library recognition of a set of sparse operations. Eventually, the right
// combination of these two approaches has to be found.
//===----------------------------------------------------------------------===//

void mlir::populateSparseGPUCodegenPatterns(RewritePatternSet &patterns,
                                            unsigned numThreads) { … }

void mlir::populateSparseGPULibgenPatterns(RewritePatternSet &patterns,
                                           bool enableRT) { … }
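
// Usage sketch (hypothetical driver code, not part of this file): a pass body
// would typically collect one or both pattern sets above and run the greedy
// rewrite driver over the module, along the lines of
//
//   RewritePatternSet patterns(&getContext());
//   populateSparseGPUCodegenPatterns(patterns, /*numThreads=*/1024);
//   populateSparseGPULibgenPatterns(patterns, /*enableRT=*/true);
//   (void)applyPatternsAndFoldGreedily(getOperation(), std::move(patterns));
//
// The concrete pass wiring (option defaults, which set is enabled) is defined
// with the sparsifier pass definitions, not here.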