//===- SparseVectorization.cpp - Vectorization of sparsified loops --------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// A pass that converts loops generated by the sparsifier into a form that
// can exploit SIMD instructions of the target architecture. Note that this
// pass ensures the sparsifier can generate efficient SIMD (including ArmSVE
// support) with proper separation of concerns as far as sparsification and
// vectorization are concerned. However, this pass is not the final abstraction
// level we want, nor the general vectorizer we want either. It forms a good
// stepping stone for incremental future improvements, though.
//
//===----------------------------------------------------------------------===//

#include "Utils/CodegenUtils.h"
#include "Utils/LoopEmitter.h"

#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/Complex/IR/Complex.h"
#include "mlir/Dialect/Math/IR/Math.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Dialect/SCF/IR/SCF.h"
#include "mlir/Dialect/SparseTensor/Transforms/Passes.h"
#include "mlir/Dialect/Vector/IR/VectorOps.h"
#include "mlir/IR/Matchers.h"

using namespace mlir;
using namespace mlir::sparse_tensor;

namespace {

/// Target SIMD properties:
///   vectorLength: # packed data elements (viz. vector<16xf32> has length 16)
///   enableVLAVectorization: enables scalable vectors (viz. ArmSVE)
///   enableSIMDIndex32: uses 32-bit indices in gather/scatter for efficiency
struct VL { … };

/// Helper test for invariant value (defined outside given block).
static bool isInvariantValue(Value val, Block *block) { … }

/// Helper test for invariant argument (defined outside given block).
static bool isInvariantArg(BlockArgument arg, Block *block) { … }

/// Constructs vector type for element type.
static VectorType vectorType(VL vl, Type etp) { … }

/// Constructs vector type from a memref value.
static VectorType vectorType(VL vl, Value mem) { … }

/// Constructs vector iteration mask.
static Value genVectorMask(PatternRewriter &rewriter, Location loc, VL vl,
                           Value iv, Value lo, Value hi, Value step) { … }

/// Generates a vectorized invariant. Here we rely on subsequent loop
/// optimizations to hoist the invariant broadcast out of the vector loop.
static Value genVectorInvariantValue(PatternRewriter &rewriter, VL vl,
                                     Value val) { … }

/// Generates a vectorized load lhs = a[ind[lo:hi]] or lhs = a[lo:hi],
/// where 'lo' denotes the current index and 'hi = lo + vl - 1'. Note
/// that the sparsifier can only generate indirect loads in
/// the last index, i.e. back().
static Value genVectorLoad(PatternRewriter &rewriter, Location loc, VL vl,
                           Value mem, ArrayRef<Value> idxs, Value vmask) { … }

/// Generates a vectorized store a[ind[lo:hi]] = rhs or a[lo:hi] = rhs,
/// where 'lo' denotes the current index and 'hi = lo + vl - 1'. Note
/// that the sparsifier can only generate indirect stores in
/// the last index, i.e. back().
static void genVectorStore(PatternRewriter &rewriter, Location loc, Value mem,
                           ArrayRef<Value> idxs, Value vmask, Value rhs) { … }
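// The two helpers below are illustrative sketches only and NOT part of this
// pass: hedged renderings of the masking and load-lowering techniques that
// the comments above describe. The names genSketchMask/genSketchLoad, their
// signatures, and the zero pass-through choice are assumptions made for
// illustration; the real helpers are genVectorMask/genVectorLoad, whose
// bodies are elided here.

/// Illustrative sketch of loop masking: vector.create_mask clamps its
/// operand to the vector length, so the number of remaining iterations
/// 'hi - iv' directly yields the mask for a loop that steps by 'vl'.
[[maybe_unused]] static Value genSketchMask(PatternRewriter &rewriter,
                                            Location loc, VL vl, Value iv,
                                            Value hi) {
  VectorType mtp = vectorType(vl, rewriter.getI1Type());
  Value end = rewriter.create<arith::SubIOp>(loc, hi, iv);
  return rewriter.create<vector::CreateMaskOp>(loc, mtp, end);
}

/// Illustrative sketch of load lowering: a contiguous last index becomes a
/// unit-stride vector.maskedload, while an indirect last index (a vector of
/// coordinates, as produced for a[ind[lo:hi]]) becomes a vector.gather.
/// Masked-off lanes read as zero through the pass-through operand.
[[maybe_unused]] static Value genSketchLoad(PatternRewriter &rewriter,
                                            Location loc, VectorType vtp,
                                            Value mem, ArrayRef<Value> idxs,
                                            Value vmask) {
  // Zero vector used as pass-through value for disabled lanes.
  Value pass =
      rewriter.create<arith::ConstantOp>(loc, rewriter.getZeroAttr(vtp));
  if (llvm::isa<VectorType>(idxs.back().getType())) {
    // Indirect: gather along the innermost dimension; the vector of
    // coordinates replaces the scalar subscript in the last position.
    SmallVector<Value> scalarArgs(idxs.begin(), idxs.end());
    Value indexVec = idxs.back();
    scalarArgs.back() = rewriter.create<arith::ConstantIndexOp>(loc, 0);
    return rewriter.create<vector::GatherOp>(loc, vtp, mem, scalarArgs,
                                             indexVec, vmask, pass);
  }
  // Direct: unit-stride masked load.
  return rewriter.create<vector::MaskedLoadOp>(loc, vtp, mem, idxs, vmask,
                                               pass);
}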
/// Detects a vectorizable reduction operation and, on success, returns
/// its combining kind in `kind`.
static bool isVectorizableReduction(Value red, Value iter,
                                    vector::CombiningKind &kind) { … }

/// Generates an initial value for a vector reduction, following the scheme
/// given in Chapter 5 of "The Software Vectorization Handbook", where the
/// initial scalar value is correctly embedded in the vector reduction value,
/// and a straightforward horizontal reduction will complete the operation.
/// Value 'r' denotes the initial value of the reduction outside the loop.
static Value genVectorReducInit(PatternRewriter &rewriter, Location loc,
                                Value red, Value iter, Value r,
                                VectorType vtp) { … }

/// This method is called twice to analyze and rewrite the given subscripts.
/// The first call (!codegen) does the analysis. Then, on success, the second
/// call (codegen) yields the proper vector form in the output parameter
/// vector 'idxs'. This mechanism ensures that analysis and rewriting code
/// stay in sync. Note that the analysis part is simple because the sparsifier
/// only generates relatively simple subscript expressions.
///
/// See https://llvm.org/docs/GetElementPtr.html for some background on
/// the complications described below.
///
/// We need to generate a position/coordinate load from the sparse storage
/// scheme. Narrower data types need to be zero extended before casting
/// the value into the `index` type used for looping and indexing.
///
/// For the scalar case, subscripts simply zero extend narrower indices
/// into 64-bit values before casting to an index type without a performance
/// penalty. Indices that already are 64-bit, in theory, cannot express the
/// full range, since the LLVM backend defines addressing in terms of an
/// unsigned pointer/signed index pair.
static bool vectorizeSubscripts(PatternRewriter &rewriter, scf::ForOp forOp,
                                VL vl, ValueRange subs, bool codegen,
                                Value vmask, SmallVectorImpl<Value> &idxs) { … }

#define UNAOP …
#define TYPEDUNAOP …
#define BINOP …

/// This method is called twice to analyze and rewrite the given expression.
/// The first call (!codegen) does the analysis. Then, on success, the second
/// call (codegen) yields the proper vector form in the output parameter 'vexp'.
/// This mechanism ensures that analysis and rewriting code stay in sync. Note
/// that the analysis part is simple because the sparsifier only generates
/// relatively simple expressions inside the for-loops.
static bool vectorizeExpr(PatternRewriter &rewriter, scf::ForOp forOp, VL vl,
                          Value exp, bool codegen, Value vmask,
                          Value &vexp) { … }

#undef UNAOP
#undef TYPEDUNAOP
#undef BINOP

/// This method is called twice to analyze and rewrite the given for-loop.
/// The first call (!codegen) does the analysis. Then, on success, the second
/// call (codegen) rewrites the IR into vector form. This mechanism ensures
/// that analysis and rewriting code stay in sync.
static bool vectorizeStmt(PatternRewriter &rewriter, scf::ForOp forOp, VL vl,
                          bool codegen) { … }

/// Basic for-loop vectorizer.
struct ForOpRewriter : public OpRewritePattern<scf::ForOp> { … };

/// Reduction chain cleanup.
///   v = for { }
///   s = vsum(v)          v = for { }
///   u = expand(s)   ->   for (v) { }
///   for (u) { }
template <typename VectorOp>
struct ReducChainRewriter : public OpRewritePattern<VectorOp> { … };
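// Illustrative sketch only (the name sketchDriver is hypothetical, not part
// of this pass): the analyze/codegen double-call protocol that the comments
// above describe, as a rewrite pattern would drive it. The first call
// (codegen == false) merely inspects the loop, so the IR is left untouched
// when vectorization turns out to be impossible; only on success does the
// second call (codegen == true) rewrite, following the exact same code path
// that the analysis already validated.
[[maybe_unused]] static LogicalResult
sketchDriver(PatternRewriter &rewriter, scf::ForOp forOp, VL vl) {
  if (vectorizeStmt(rewriter, forOp, vl, /*codegen=*/false) &&
      vectorizeStmt(rewriter, forOp, vl, /*codegen=*/true))
    return success();
  return failure();
}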
} // namespace

//===----------------------------------------------------------------------===//
// Public method for populating vectorization rules.
//===----------------------------------------------------------------------===//

/// Populates the given patterns list with vectorization rules.
void mlir::populateSparseVectorizationPatterns(RewritePatternSet &patterns,
                                               unsigned vectorLength,
                                               bool enableVLAVectorization,
                                               bool enableSIMDIndex32) { … }
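// Usage sketch (hedged, illustrative only; the vector length and flags below
// are arbitrary): a client pass typically funnels these rules through the
// greedy pattern rewrite driver.
//
//   RewritePatternSet patterns(&getContext());
//   populateSparseVectorizationPatterns(patterns, /*vectorLength=*/8,
//                                       /*enableVLAVectorization=*/false,
//                                       /*enableSIMDIndex32=*/false);
//   (void)applyPatternsAndFoldGreedily(getOperation(), std::move(patterns));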