GPUHeuristics.cpp | Explore in Territory

//===- GPUHeuristics.cpp - Heuristics Implementation for Transforms -------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "mlir/Dialect/Linalg/TransformOps/GPUHeuristics.h"

#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include <cmath>
#include <numeric>

usingnamespacemlir;

// Debug-logging helper macros for this file.
// NOTE(review): presumably DEBUG_TYPE names this file's category for
// LLVM_DEBUG / -debug-only, and DBGS() / LDBG(X) wrap llvm::dbgs(); the macro
// bodies are not visible here, so confirm against the definitions.
#define DEBUG_TYPE …
#define DBGS() …
#define LDBG(X) …

/// Returns an `Attribute` given the `MLIRContext` `ctx`.
/// NOTE(review): presumably the GPU mapping attribute for linear id 0 —
/// confirm against the definition.
static Attribute linearId0(MLIRContext *ctx) { … }
/// Returns an `Attribute` given the `MLIRContext` `ctx`.
/// NOTE(review): presumably the GPU mapping attribute for linear id 1 —
/// confirm against the definition.
static Attribute linearId1(MLIRContext *ctx) { … }
/// Returns an `Attribute` given the `MLIRContext` `ctx`.
/// NOTE(review): presumably the GPU mapping attribute for linear id 2 —
/// confirm against the definition.
static Attribute linearId2(MLIRContext *ctx) { … }

/// Constructs a CopyMappingInfo from:
///   - `ctx`: the MLIR context,
///   - `totalNumThreads`: the thread count available to the copy,
///   - `desiredBitAlignment`: the desired alignment, expressed in bits,
///   - `copySizes`: the sizes of the copy,
///   - `favorPredication`: whether predication is preferred,
///   - `elementalBitwidth`: the bitwidth of one element.
/// NOTE(review): parameter meanings above are read off the names; presumably
/// the constructor derives the thread mapping via `inferNumThreads` — confirm
/// against the definition and the header documentation.
transform::gpu::CopyMappingInfo::CopyMappingInfo(MLIRContext *ctx,
                                                 int totalNumThreads,
                                                 int64_t desiredBitAlignment,
                                                 ArrayRef<int64_t> copySizes,
                                                 bool favorPredication,
                                                 int64_t elementalBitwidth) { … }

/// Returns an element count computed from `desiredBitAlignment`,
/// `numContiguousElements` and `elementalBitwidth`.
/// NOTE(review): presumably the largest number of contiguous elements a single
/// transfer may move while honoring the desired bit alignment — confirm the
/// exact formula (and the minimum returned value) against the definition.
int64_t transform::gpu::CopyMappingInfo::maxContiguousElementsToTransfer(
    int64_t desiredBitAlignment, int64_t numContiguousElements,
    int64_t elementalBitwidth) { … }

/// Get the list of all factors that divide `val`, not just the prime factors.
/// NOTE(review): the ordering of the returned factors and the handling of
/// `val <= 0` are not specified here — confirm before relying on either.
static SmallVector<int64_t> getFactors(int64_t val) { … }

/// NOTE(review): presumably returns the product of all entries of `vals`;
/// the result for an empty list should be confirmed against the definition.
static int64_t product(ArrayRef<int64_t> vals) { … }

/// Extract `result` from `sizes` with the following constraints:
///   1. sizes[i] % result[i] == 0 for all i
///   2. product_of_threadsPerDim <= maxNumThreads
///   3. if `currentIndex` is sizes.size() - 1, then threadsPerDim[currentIndex]
///      must be sizes[currentIndex].
/// NOTE(review): `result` and `threadsPerDim` appear to name the same
/// per-dimension thread counts — confirm and unify the naming.
/// This is used to greedily extract the maximum number of threads usable for
/// mapping a copy of size `sizes`, while being bounded by `maxNumThreads` and
/// ensuring coalesced access along the most minor dimension.
/// Returns the number of threads used for each dimension in the range:
///   threadsPerDim[currentIndex .. sizes.end()]
// The implementation uses a dynamic programming approach to greedily extract
// the best combination under the constraints.
// TODO: Implementation details can be improved, but putting effort there is a
// tradeoff: `sizes` is expected to be of small rank and contain small values.
static SmallVector<int64_t> maximizeNumThreads(ArrayRef<int64_t> sizes,
                                               int64_t currentIndex,
                                               int64_t maxNumThreads) { … }

/// Infers the number of threads to use for a copy of `sizes` given a budget of
/// `totalNumThreads` and a `desiredVectorSize`, and reports the outcome as a
/// `Status`.
/// NOTE(review): presumably delegates to `inferNumThreadsImpl`, and
/// `favorPredication` selects between a predicated mapping and retrying with
/// other parameters — confirm against the definition.
transform::gpu::CopyMappingInfo::Status
transform::gpu::CopyMappingInfo::inferNumThreads(int64_t totalNumThreads,
                                                 ArrayRef<int64_t> sizes,
                                                 int64_t desiredVectorSize,
                                                 bool favorPredication) { … }

/// Implementation helper for thread-count inference: takes the thread budget
/// `totalNumThreads`, the copy `sizes` and a `desiredVectorSize`, and reports
/// the outcome as a `Status`.
/// NOTE(review): presumably this is the single-attempt core used by
/// `inferNumThreads` and updates this object's mapping state — confirm against
/// the definition.
transform::gpu::CopyMappingInfo::Status
transform::gpu::CopyMappingInfo::inferNumThreadsImpl(
    int64_t totalNumThreads, ArrayRef<int64_t> sizes,
    int64_t desiredVectorSize) { … }

/// Prints this CopyMappingInfo to `os` without modifying it (const member).
/// NOTE(review): the exact fields and output format are defined in the body —
/// confirm there before depending on the text.
void transform::gpu::CopyMappingInfo::print(llvm::raw_ostream &os) const { … }
llvm/mlir/lib/Dialect/Linalg/TransformOps/GPUHeuristics.cpp