llvm/mlir/include/mlir/Dialect/NVGPU/TransformOps/NVGPUTransformOps.td

//===- NVGPUTransformOps.td - NVGPU transform ops ----------*- tablegen -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef NVGPU_TRANSFORM_OPS
#define NVGPU_TRANSFORM_OPS

include "mlir/Dialect/Transform/IR/TransformAttrs.td"
include "mlir/Dialect/Transform/IR/TransformDialect.td"
include "mlir/Dialect/Transform/Interfaces/TransformInterfaces.td"
include "mlir/Dialect/Transform/IR/TransformTypes.td"
include "mlir/Interfaces/SideEffectInterfaces.td"

//===----------------------------------------------------------------------===//
// Apply...ConversionPatternsOp
//===----------------------------------------------------------------------===//

def ApplyNVGPUToNVVMConversionPatternsOp : Op<Transform_Dialect,
    "apply_conversion_patterns.nvgpu.nvgpu_to_nvvm",
    [DeclareOpInterfaceMethods<ConversionPatternDescriptorOpInterface,
                               ["verifyTypeConverter"]>]> {
  let description = [{
    Collects patterns that convert NVGPU dialect ops to NVVM dialect ops. These
    patterns require an "LLVMTypeConverter".
  }];
  let assemblyFormat = "attr-dict";
}

//===----------------------------------------------------------------------===//
// CreateAsyncGroupsOp
//===----------------------------------------------------------------------===//

def CreateAsyncGroupsOp :
  Op<Transform_Dialect, "nvgpu.create_async_groups",
    [DeclareOpInterfaceMethods<MemoryEffectsOpInterface>,
     TransformEachOpTrait,
     TransformOpInterface,
     ReportTrackingListenerFailuresOpTrait]> {
  let description = [{
    Look for global to shared memory copies within the targeted op in the form
    of vector transfer ops and convert them to async copies when possible.
    Consecutive copies are put into the same group. A "wait" operation is
    inserted right at the of end the group.

    `bypass_l1` specifies whether `bypassL1` attributes should be added to
    the async copies. `bypass_l1` is a compiler hint: only 16 byte transfers
    can bypass the L1 cache, so this attribute is not set for any other transfer
    sizes.

    #### Return modes

    This op consumes the `target` handle and produces the `result` handle, which
    is mapped to the same payload operations as the `target` handle. The op
    modifies the payload.
  }];

  let arguments = (ins TransformHandleTypeInterface:$target,
                   UnitAttr:$bypass_l1);
  let results = (outs TransformHandleTypeInterface:$result);

  let assemblyFormat = [{
    $target attr-dict `:` functional-type(operands, results)
  }];

  let extraClassDeclaration = [{
    ::mlir::DiagnosedSilenceableFailure applyToOne(
        ::mlir::transform::TransformRewriter &rewriter,
        ::mlir::Operation *target,
        ::mlir::transform::ApplyToEachResultList &results,
        ::mlir::transform::TransformState &state);
  }];
}

//===----------------------------------------------------------------------===//
// PipelineSharedMemoryCopiesOp
//===----------------------------------------------------------------------===//

def PipelineSharedMemoryCopiesOp :
  Op<Transform_Dialect, "nvgpu.pipeline_shared_memory_copies",
    [FunctionalStyleTransformOpTrait,
     MemoryEffectsOpInterface,
     TransformEachOpTrait,
     TransformOpInterface,
     ReportTrackingListenerFailuresOpTrait]> {
  let summary =
    "Applies software pipelining to a given loop with shared memory copies";

  let description = [{
    Applies software pipelining to a given scf.for loop. The pipelining
    strategy will look for a load into shared memory and pipeline it to overlap
    it with the rest of the loop.
    
    NOTE: It is user responsibility to ensure that there are no dependency
    between `depth` iterations of the loop by using multi-buffering. It is
    also user responsibility to ensure a sufficient amount of shared memory
    is allocated to cover eventual writes by `depth-1` speculative
    iterations.

    `depth` will indicate how many stages the software pipeline should have.
    `peel_epilogue` allows to force the epilogue to be peeled out instead of
    potentially using predicated operations for the epilogue phase.

    #### Return modes

    Consumes the operand handle and produces a result handle pointing to the
    loop, which may or may not have been pipelined. Produces a definite failure
    if the loop pipeliner mutated the IR before failing to pipeline, in
    particular if `peel_epilogue` is not set and the loop body doesn't support
    predication. If failure propagation mode is set to "propagate", produces a
    silenceable failure when pipelining preconditions, e.g., loop bound being
    static, are not met or when the loop wasn't pipelined because due to the
    lack of loads into shared memory. If the failure propagation mode is set
    to "suppress" (default), succeeds in these case and associates the result
    handle with the original loop.

    TODO: the shared memory part and behavior specific to NVGPU should be
    made orthogonal to pipelining so that `transform.loop.pipeline` becomes
    usable here.
  }];

  let arguments = (ins TransformHandleTypeInterface:$for_op,
                   I64Attr:$depth,
                   UnitAttr:$peel_epilogue,
                   DefaultValuedAttr<FailurePropagationMode,
                      "::mlir::transform::FailurePropagationMode::Suppress">
                     :$failure_propagation_mode);
  let results = (outs TransformHandleTypeInterface:$result);

  let assemblyFormat = [{ 
    `failures` `(` $failure_propagation_mode `)`
    $for_op
    attr-dict 
    `:` functional-type(operands, results)
  }];

  let extraClassDeclaration = [{
    ::mlir::DiagnosedSilenceableFailure applyToOne(
        ::mlir::transform::TransformRewriter &rewriter,
        ::mlir::scf::ForOp forOp,
        ::mlir::transform::ApplyToEachResultList &results,
        ::mlir::transform::TransformState &state);
  }];
}

//===----------------------------------------------------------------------===//
// RewriteMatmulAsMmaSyncOp
//===----------------------------------------------------------------------===//

def RewriteMatmulAsMmaSyncOp :
  Op<Transform_Dialect, "nvgpu.rewrite_matmul_as_mma_sync",
    [FunctionalStyleTransformOpTrait, 
     MemoryEffectsOpInterface,
     TransformEachOpTrait, 
     TransformOpInterface,
     ReportTrackingListenerFailuresOpTrait]> {
  let description = [{
    Rewrite a matmul operation on memref to an mma.sync operation on vectors.

    Memory copies with the required access patterns are automatically inserted.
    Operations that do not have a 1-1 mapping to mma.sync operations are left
    unchanged.
  }];

  let arguments = (ins TransformHandleTypeInterface:$target);
  let results = (outs);

  let assemblyFormat = "$target attr-dict `:` functional-type(operands, results) ";

  let extraClassDeclaration = [{
    ::mlir::DiagnosedSilenceableFailure applyToOne(
        ::mlir::transform::TransformRewriter &rewriter,
        ::mlir::linalg::LinalgOp linalgOp,
        ::mlir::transform::ApplyToEachResultList &results,
        ::mlir::transform::TransformState &state);
  }];
}

//===----------------------------------------------------------------------===//
// RewriteCopyAsTmaOp
//===----------------------------------------------------------------------===//

def RewriteCopyAsTmaOp :
  Op<Transform_Dialect, "nvgpu.rewrite_copy_as_tma",
    [FunctionalStyleTransformOpTrait,
     MemoryEffectsOpInterface,
     TransformEachOpTrait,
     TransformOpInterface,
     ReportTrackingListenerFailuresOpTrait]> {
  let description = [{
    Rewrite a copy operation on memref to tma operations that transit through
    shared memory.
  }];

  let arguments = (ins TransformHandleTypeInterface:$target);
  let results = (outs);

  let assemblyFormat = "$target attr-dict `:` functional-type(operands, results) ";

  let extraClassDeclaration = [{
    ::mlir::DiagnosedSilenceableFailure apply(
        ::mlir::transform::TransformRewriter &rewriter,
        ::mlir::transform::TransformResults &transformResults,
        ::mlir::transform::TransformState &state);
  }];
}

#endif // NVGPU_TRANSFORM_OPS