// llvm/mlir/lib/Dialect/SparseTensor/Transforms/SparseVectorization.cpp

//===- SparseVectorization.cpp - Vectorization of sparsified loops --------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// A pass that converts loops generated by the sparsifier into a form that
// can exploit SIMD instructions of the target architecture. Note that this pass
// ensures the sparsifier can generate efficient SIMD (including ArmSVE
// support) with proper separation of concerns as far as sparsification and
// vectorization is concerned. However, this pass is not the final abstraction
// level we want, and not the general vectorizer we want either. It forms a good
// stepping stone for incremental future improvements though.
//
//===----------------------------------------------------------------------===//

#include "Utils/CodegenUtils.h"
#include "Utils/LoopEmitter.h"

#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/Complex/IR/Complex.h"
#include "mlir/Dialect/Math/IR/Math.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Dialect/SCF/IR/SCF.h"
#include "mlir/Dialect/SparseTensor/Transforms/Passes.h"
#include "mlir/Dialect/Vector/IR/VectorOps.h"
#include "mlir/IR/Matchers.h"

usingnamespacemlir;
usingnamespacemlir::sparse_tensor;

namespace {

/// Target SIMD properties:
///   vectorLength: # packed data elements (viz. vector<16xf32> has length 16)
///   enableVLAVectorization: enables scalable vectors (viz. ARMSve)
///   enableSIMDIndex32: uses 32-bit indices in gather/scatter for efficiency
struct VL {
  // # packed data elements, e.g. vector<16xf32> has vectorLength 16.
  unsigned vectorLength;
  // Enables scalable (vector-length agnostic) vectors, e.g. for ArmSVE.
  bool enableVLAVectorization;
  // Uses 32-bit indices in gather/scatter operations for efficiency.
  bool enableSIMDIndex32;
};

/// Helper test for invariant value (defined outside given block).
/// A value is invariant when it is produced by an operation (i.e., is not a
/// block argument) and that operation lives outside `block`, so the value is
/// loop-invariant with respect to the loop body `block`.
static bool isInvariantValue(Value val, Block *block) {
  return val.getDefiningOp() && val.getDefiningOp()->getBlock() != block;
}

/// Helper test for invariant argument (defined outside given block).
/// A block argument is invariant when it belongs to a block other than
/// `block` (e.g., an argument of an enclosing region).
static bool isInvariantArg(BlockArgument arg, Block *block) {
  return arg.getOwner() != block;
}

/// Constructs vector type for element type.
/// NOTE(review): body is empty — the implementation appears to have been
/// stripped. Presumably it builds a VectorType of vl.vectorLength elements
/// of `etp` (scalable iff VLA vectorization is enabled) — TODO confirm and
/// restore; an empty body in a value-returning function is UB.
static VectorType vectorType(VL vl, Type etp) {}

/// Constructs vector type from a memref value.
/// NOTE(review): body is empty — presumably delegates to the Type overload
/// above using the memref's element type; appears stripped. TODO: restore
/// (empty body in a value-returning function is UB).
static VectorType vectorType(VL vl, Value mem) {}

/// Constructs vector iteration mask.
/// NOTE(review): body is empty — expected to emit an i1 vector mask that
/// covers the active lanes of the iteration [iv, hi) with the given step;
/// the implementation appears stripped. TODO: restore (empty body in a
/// value-returning function is UB).
static Value genVectorMask(PatternRewriter &rewriter, Location loc, VL vl,
                           Value iv, Value lo, Value hi, Value step) {}

/// Generates a vectorized invariant. Here we rely on subsequent loop
/// optimizations to hoist the invariant broadcast out of the vector loop.
/// NOTE(review): body is empty — presumably broadcasts the scalar `val`
/// into a vector of the configured length; appears stripped. TODO: restore
/// (empty body in a value-returning function is UB).
static Value genVectorInvariantValue(PatternRewriter &rewriter, VL vl,
                                     Value val) {}

/// Generates a vectorized load lhs = a[ind[lo:hi]] or lhs = a[lo:hi],
/// where 'lo' denotes the current index and 'hi = lo + vl - 1'. Note
/// that the sparsifier can only generate indirect loads in
/// the last index, i.e. back().
/// NOTE(review): body is empty — expected to emit a masked load (or a
/// gather when the last subscript is itself a vector); appears stripped.
/// TODO: restore (empty body in a value-returning function is UB).
static Value genVectorLoad(PatternRewriter &rewriter, Location loc, VL vl,
                           Value mem, ArrayRef<Value> idxs, Value vmask) {}

/// Generates a vectorized store a[ind[lo:hi]] = rhs or a[lo:hi] = rhs
/// where 'lo' denotes the current index and 'hi = lo + vl - 1'. Note
/// that the sparsifier can only generate indirect stores in
/// the last index, i.e. back().
/// NOTE(review): body is empty — expected to emit a masked store (or a
/// scatter when the last subscript is itself a vector); appears stripped.
/// TODO: restore the implementation.
static void genVectorStore(PatternRewriter &rewriter, Location loc, Value mem,
                           ArrayRef<Value> idxs, Value vmask, Value rhs) {}

/// Detects a vectorizable reduction operation and returns the
/// combining kind of reduction on success in `kind`.
/// NOTE(review): body is empty — expected to match `red` (the value fed back
/// through the loop-carried `iter`) against the supported arithmetic
/// reductions; appears stripped. TODO: restore (empty body in a
/// bool-returning function is UB).
static bool isVectorizableReduction(Value red, Value iter,
                                    vector::CombiningKind &kind) {}

/// Generates an initial value for a vector reduction, following the scheme
/// given in Chapter 5 of "The Software Vectorization Handbook", where the
/// initial scalar value is correctly embedded in the vector reduction value,
/// and a straightforward horizontal reduction will complete the operation.
/// Value 'r' denotes the initial value of the reduction outside the loop.
/// NOTE(review): body is empty — expected to embed `r` into a vector whose
/// remaining lanes hold the neutral element of the reduction; appears
/// stripped. TODO: restore (empty body in a value-returning function is UB).
static Value genVectorReducInit(PatternRewriter &rewriter, Location loc,
                                Value red, Value iter, Value r,
                                VectorType vtp) {}

/// This method is called twice to analyze and rewrite the given subscripts.
/// The first call (!codegen) does the analysis. Then, on success, the second
/// call (codegen) yields the proper vector form in the output parameter
/// vector 'idxs'. This mechanism ensures that analysis and rewriting code
/// stay in sync. Note that the analysis part is simple because the sparsifier
/// only generates relatively simple subscript expressions.
///
/// See https://llvm.org/docs/GetElementPtr.html for some background on
/// the complications described below.
///
/// We need to generate a position/coordinate load from the sparse storage
/// scheme.  Narrower data types need to be zero extended before casting
/// the value into the `index` type used for looping and indexing.
///
/// For the scalar case, subscripts simply zero extend narrower indices
/// into 64-bit values before casting to an index type without a performance
/// penalty. Indices that already are 64-bit, in theory, cannot express the
/// full range since the LLVM backend defines addressing in terms of an
/// unsigned pointer/signed index pair.
/// NOTE(review): body is empty — the analysis/codegen implementation appears
/// stripped. TODO: restore (empty body in a bool-returning function is UB).
static bool vectorizeSubscripts(PatternRewriter &rewriter, scf::ForOp forOp,
                                VL vl, ValueRange subs, bool codegen,
                                Value vmask, SmallVectorImpl<Value> &idxs) {}

// NOTE(review): the three local helper macros below expand to nothing — their
// bodies appear to have been stripped. They are expected to generate the
// unary / typed-unary / binary operation cases used by vectorizeExpr, and are
// #undef'd again right after that function. TODO: restore the macro bodies.
#define UNAOP

#define TYPEDUNAOP

#define BINOP

/// This method is called twice to analyze and rewrite the given expression.
/// The first call (!codegen) does the analysis. Then, on success, the second
/// call (codegen) yields the proper vector form in the output parameter 'vexp'.
/// This mechanism ensures that analysis and rewriting code stay in sync. Note
/// that the analysis part is simple because the sparsifier only generates
/// relatively simple expressions inside the for-loops.
/// NOTE(review): body is empty — the recursive expression vectorizer appears
/// stripped. TODO: restore (empty body in a bool-returning function is UB).
static bool vectorizeExpr(PatternRewriter &rewriter, scf::ForOp forOp, VL vl,
                          Value exp, bool codegen, Value vmask, Value &vexp) {}

// Undo the local codegen helper macros; they are only meant for use inside
// vectorizeExpr above.
#undef UNAOP
#undef TYPEDUNAOP
#undef BINOP

/// This method is called twice to analyze and rewrite the given for-loop.
/// The first call (!codegen) does the analysis. Then, on success, the second
/// call (codegen) rewrites the IR into vector form. This mechanism ensures
/// that analysis and rewriting code stay in sync.
/// NOTE(review): body is empty — the loop vectorization driver appears
/// stripped. TODO: restore (empty body in a bool-returning function is UB).
static bool vectorizeStmt(PatternRewriter &rewriter, scf::ForOp forOp, VL vl,
                          bool codegen) {}

/// Basic for-loop vectorizer.
/// NOTE(review): the pattern body is empty — expected to hold a constructor
/// taking the target SIMD configuration and a matchAndRewrite that performs
/// the analyze/codegen passes over scf.for loops; appears stripped.
/// TODO: restore.
struct ForOpRewriter : public OpRewritePattern<scf::ForOp> {};

/// Reduction chain cleanup.
///   v = for { }
///   s = vsum(v)               v = for { }
///   u = expand(s)       ->    for (v) { }
///   for (u) { }
/// NOTE(review): the pattern body is empty — expected to hold a
/// matchAndRewrite that folds the reduce/expand pair back into the
/// loop-carried vector, as sketched above; appears stripped. TODO: restore.
template <typename VectorOp>
struct ReducChainRewriter : public OpRewritePattern<VectorOp> {};

} // namespace

//===----------------------------------------------------------------------===//
// Public method for populating vectorization rules.
//===----------------------------------------------------------------------===//

/// Populates the given patterns list with vectorization rules.
/// NOTE(review): body is empty — expected to register the ForOpRewriter
/// (configured with vectorLength/enableVLAVectorization/enableSIMDIndex32)
/// and the ReducChainRewriter cleanups into `patterns`; appears stripped.
/// TODO: restore the implementation.
void mlir::populateSparseVectorizationPatterns(RewritePatternSet &patterns,
                                               unsigned vectorLength,
                                               bool enableVLAVectorization,
                                               bool enableSIMDIndex32) {}