llvm/mlir/include/mlir/Dialect/GPU/TransformOps/Utils.h

//===- Utils.h - Utils for GPU transform ops --------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef MLIR_DIALECT_GPU_TRANSFORMOPS_UTILS_H
#define MLIR_DIALECT_GPU_TRANSFORMOPS_UTILS_H

#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/SCF/IR/DeviceMappingInterface.h"
#include "mlir/Dialect/Transform/Interfaces/TransformInterfaces.h"
#include "mlir/IR/OpImplementation.h"
#include "mlir/IR/PatternMatch.h"

namespace mlir {
namespace gpu {
class GPUOp;
class LaunchOp;
enum class MappingId : uint64_t;
} // namespace gpu
namespace scf {
class ForallOp;
} // namespace scf
namespace transform {
namespace gpu {

/// Helper type for functions that generate ids for the mapping of a scf.forall.
/// Operates on both 1) an "original" basis that represents the individual
/// thread and block ids and 2) a "scaled" basis that represents grouped ids
/// (e.g. block clusters, warpgroups and warps).
/// The mapping of ids is done in the "scaled" basis (i.e. when mapping to warps
/// a division by 32 occurs).
/// The predication is in the "original" basis using the "active" quantities
/// (`activeMappingSizes`, `availableMappingSizes` and `activeIdOps`).
struct IdBuilderResult {};

/// Common gpu id builder type, allows the configuration of lowering for various
/// mapping schemes. Takes:
///   - A rewriter with insertion point set before the forall op to rewrite.
///   - The loc of the forall op to rewrite.
///   - A list of positive integers carrying the mapping sizes for the current
///     forall op to rewrite.
GpuIdBuilderFnType;

/// Helper struct for configuring the rewrite of mapped scf.forall ops to
/// various gpu id configurations.
struct GpuIdBuilder {};

/// Builder for gpu::BlockIdOps used to map scf.forall to blocks.
/// If `useLinearMapping` is false, the `idBuilder` method returns 3D values
/// used for indexing rewrites as well as 3D sizes for predicate generation.
/// If `useLinearMapping` is true, the `idBuilder` method returns nD values
/// used for indexing rewrites as well as 1D sizes for predicate generation.
struct GpuBlockIdBuilder : public GpuIdBuilder {};

/// Builder for warpgroup ids used to map scf.forall to reindexed warpgroups.
/// If `useLinearMapping` is false, the `idBuilder` method returns 3D values
/// used for indexing rewrites as well as 3D sizes for predicate generation.
/// If `useLinearMapping` is true, the `idBuilder` method returns nD values
/// used for indexing rewrites as well as 1D sizes for predicate generation.
struct GpuWarpgroupIdBuilder : public GpuIdBuilder {};

/// Builder for warp ids used to map scf.forall to reindexed warps.
/// If `useLinearMapping` is false, the `idBuilder` method returns 3D values
/// used for indexing rewrites as well as 3D sizes for predicate generation.
/// If `useLinearMapping` is true, the `idBuilder` method returns nD values
/// used for indexing rewrites as well as 1D sizes for predicate generation.
struct GpuWarpIdBuilder : public GpuIdBuilder {};

/// Builder for warp ids used to map scf.forall to reindexed threads.
/// If `useLinearMapping` is false, the `idBuilder` method returns 3D values
/// used for indexing rewrites as well as 3D sizes for predicate generation.
/// If `useLinearMapping` is true, the `idBuilder` method returns nD values
/// used for indexing rewrites as well as 1D sizes for predicate generation.
struct GpuThreadIdBuilder : public GpuIdBuilder {};

/// Determine if the size of the kernel configuration is supported by the
/// GPU architecture being used.
/// TODO this is currently hardwired to CUDA, parameterize and generalize.
DiagnosedSilenceableFailure checkGpuLimits(TransformOpInterface transformOp,
                                           std::optional<int64_t> gridDimX,
                                           std::optional<int64_t> gridDimY,
                                           std::optional<int64_t> gridDimZ,
                                           std::optional<int64_t> blockDimX,
                                           std::optional<int64_t> blockDimY,
                                           std::optional<int64_t> blockDimZ);

/// Create an empty-body gpu::LaunchOp using the provided kernel settings
/// and put a terminator within.
DiagnosedSilenceableFailure
createGpuLaunch(RewriterBase &rewriter, Location loc,
                TransformOpInterface transformOp, mlir::gpu::LaunchOp &launchOp,
                std::optional<int64_t> gridDimX = std::nullopt,
                std::optional<int64_t> gridDimY = std::nullopt,
                std::optional<int64_t> gridDimZ = std::nullopt,
                std::optional<int64_t> blockDimX = std::nullopt,
                std::optional<int64_t> blockDimY = std::nullopt,
                std::optional<int64_t> blockDimZ = std::nullopt);

/// Alter kernel configuration of the given kernel.
DiagnosedSilenceableFailure
alterGpuLaunch(RewriterBase &rewriter, mlir::gpu::LaunchOp gpuLaunch,
               TransformOpInterface transformOp,
               std::optional<int64_t> gridDimX = std::nullopt,
               std::optional<int64_t> gridDimY = std::nullopt,
               std::optional<int64_t> gridDimZ = std::nullopt,
               std::optional<int64_t> blockDimX = std::nullopt,
               std::optional<int64_t> blockDimY = std::nullopt,
               std::optional<int64_t> blockDimZ = std::nullopt);

/// Find the unique top level scf::ForallOp within a given target op.
DiagnosedSilenceableFailure
findTopLevelForallOp(Operation *target, scf::ForallOp &topLevelForallOp,
                     TransformOpInterface transformOp);

} // namespace gpu
} // namespace transform
} // namespace mlir

#endif // MLIR_DIALECT_GPU_TRANSFORMOPS_UTILS_H