// RUN: mlir-opt %s \
// RUN: -gpu-lower-to-nvvm-pipeline="cubin-chip=sm_90 cubin-features=+ptx80 opt-level=3" \
// RUN: | mlir-cpu-runner \
// RUN: --shared-libs=%mlir_cuda_runtime \
// RUN: --shared-libs=%mlir_runner_utils \
// RUN: --entry-point-result=void \
// RUN: | FileCheck %s
// RUN: mlir-opt %s --convert-nvgpu-to-nvvm \
// RUN: -gpu-kernel-outlining \
// RUN: -convert-nvvm-to-llvm \
// RUN: -convert-scf-to-cf \
// RUN: -convert-vector-to-llvm \
// RUN: -convert-index-to-llvm=index-bitwidth=32 \
// RUN: -convert-arith-to-llvm \
// RUN: -finalize-memref-to-llvm \
// RUN: -convert-func-to-llvm \
// RUN: -expand-strided-metadata \
// RUN: --nvvm-attach-target="module=main_kernel features=+ptx80 chip=sm_90 O=3" \
// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,convert-index-to-llvm{index-bitwidth=32},canonicalize,cse))' \
// RUN: | mlir-opt --gpu-to-llvm --gpu-module-to-binary=format=%gpu_compilation_format -canonicalize -cse -reconcile-unrealized-casts \
// RUN: | mlir-cpu-runner \
// RUN: --shared-libs=%mlir_cuda_runtime \
// RUN: --shared-libs=%mlir_runner_utils \
// RUN: --entry-point-result=void \
// RUN: | FileCheck %s
// CHECK: [GPU] TMA BEFORE lhs[45][7] 0.000000
// CHECK: [GPU] TMA BEFORE rhs[7][0] 0.000000
// CHECK: [GPU] TMA LOADED lhs[45][7] 7.000000
// CHECK: [GPU] TMA LOADED rhs[7][0] 3.000000
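
// This test exercises a TMA (Tensor Memory Accelerator) load on sm_90: two host
// buffers (64x8 and 8x128 f32) are initialized, copied to the device, and
// described by TMA descriptors; a single 128-thread block then uses
// nvgpu.tma.async.load together with an mbarrier to bring both tiles into
// shared memory, printing selected elements before and after the transfer.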
module @mymod {
memref.global "private" @bufferLhsGlobal : memref<64x8xf32, 3>
memref.global "private" @bufferRhsGlobal : memref<8x128xf32, 3>
func.func @main() {
%c10000000 = arith.constant 10000000 : index
%c6144 = arith.constant 6144 : index
%c45 = arith.constant 45 : index
%c7 = arith.constant 7 : index
%c64 = arith.constant 64 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%c8 = arith.constant 8 : index
%c128 = arith.constant 128 : index
%cst = arith.constant 3.000000e+00 : f32
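    // Allocate the host buffers and initialize them: the 8x128 rhs buffer is
    // filled with the constant 3.0 and the 64x8 lhs buffer with its column
    // index (0.0 .. 7.0).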
    %alloc = memref.alloc() : memref<64x8xf32>
    %alloc_0 = memref.alloc() : memref<8x128xf32>
    scf.for %arg0 = %c0 to %c8 step %c1 {
      scf.for %arg1 = %c0 to %c128 step %c1 {
        memref.store %cst, %alloc_0[%arg0, %arg1] : memref<8x128xf32>
      }
    }
    scf.for %arg0 = %c0 to %c64 step %c1 {
      scf.for %arg1 = %c0 to %c8 step %c1 {
        %5 = arith.index_cast %arg1 : index to i64
        %6 = arith.uitofp %5 : i64 to f32
        memref.store %6, %alloc[%arg0, %arg1] : memref<64x8xf32>
      }
    }
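    // Allocate matching device buffers and copy the host data over asynchronously.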
    %0 = gpu.wait async
    %memref, %asyncToken = gpu.alloc async [%0] () : memref<64x8xf32>
    %memref_1, %asyncToken_2 = gpu.alloc async [%0] () : memref<8x128xf32>
    %1 = gpu.memcpy async [%0] %memref, %alloc : memref<64x8xf32>, memref<64x8xf32>
    %2 = gpu.memcpy async [%0] %memref_1, %alloc_0 : memref<8x128xf32>, memref<8x128xf32>
    %cast = memref.cast %memref : memref<64x8xf32> to memref<*xf32>
    %cast_3 = memref.cast %memref_1 : memref<8x128xf32> to memref<*xf32>
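    // Build TMA descriptors for the two device buffers (64x8 and 8x128 boxes,
    // no swizzle, zero out-of-bounds fill).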
    %3 = nvgpu.tma.create.descriptor %cast box[%c64, %c8] : memref<*xf32> -> <tensor = memref<64x8xf32, 3>, swizzle = none, l2promo = none, oob = zero, interleave = none>
    %4 = nvgpu.tma.create.descriptor %cast_3 box[%c8, %c128] : memref<*xf32> -> <tensor = memref<8x128xf32, 3>, swizzle = none, l2promo = none, oob = zero, interleave = none>
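    // Launch a single block of 128 threads; the TMA loads target the two
    // shared-memory globals declared above.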
    gpu.launch blocks(%arg0, %arg1, %arg2) in (%arg6 = %c1, %arg7 = %c1, %arg8 = %c1) threads(%arg3, %arg4, %arg5) in (%arg9 = %c128, %arg10 = %c1, %arg11 = %c1) {
      %5 = gpu.block_dim x
      %6 = gpu.thread_id x
      %7 = memref.get_global @bufferLhsGlobal : memref<64x8xf32, 3>
      %8 = memref.get_global @bufferRhsGlobal : memref<8x128xf32, 3>
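      // Create an mbarrier in shared memory and initialize it with an arrival
      // count equal to the block size (128 threads).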
      %9 = nvgpu.mbarrier.create -> <memorySpace = #gpu.address_space<workgroup>>
      nvgpu.mbarrier.init %9[%c0], %5 : <memorySpace = #gpu.address_space<workgroup>>
      gpu.barrier
      %10 = arith.cmpi eq, %6, %c0 : index
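      // Thread 0 expects 6144 bytes in flight (64*8*4 + 8*128*4) and issues
      // both TMA loads; every other thread arrives with an expected
      // transaction count of 0.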
      scf.if %10 {
        nvgpu.mbarrier.arrive.expect_tx %9[%c0], %c6144 : <memorySpace = #gpu.address_space<workgroup>>
        %11 = memref.load %7[%c45, %c7] : memref<64x8xf32, 3>
        %12 = memref.load %8[%c7, %c0] : memref<8x128xf32, 3>
        gpu.printf "[GPU] TMA BEFORE lhs[45][7] %f\0A" %11 : f32
        gpu.printf "[GPU] TMA BEFORE rhs[7][0] %f\0A" %12 : f32
        nvgpu.tma.async.load %3[%c0, %c0], %9[%c0] to %7 : <tensor = memref<64x8xf32, 3>, swizzle = none, l2promo = none, oob = zero, interleave = none>, <memorySpace = #gpu.address_space<workgroup>> -> memref<64x8xf32, 3>
        nvgpu.tma.async.load %4[%c0, %c0], %9[%c0] to %8 : <tensor = memref<8x128xf32, 3>, swizzle = none, l2promo = none, oob = zero, interleave = none>, <memorySpace = #gpu.address_space<workgroup>> -> memref<8x128xf32, 3>
      } else {
        nvgpu.mbarrier.arrive.expect_tx %9[%c0], %c0 : <memorySpace = #gpu.address_space<workgroup>>
      }
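      // All threads wait on the mbarrier (parity phase 0) until the TMA
      // transfers have completed.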
      %phase_c0 = arith.constant 0 : i1
      nvgpu.mbarrier.try_wait.parity %9[%c0], %phase_c0, %c10000000 : <memorySpace = #gpu.address_space<workgroup>>
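      // Thread 0 prints one element from each loaded tile; lhs[45][7] holds its
      // column index (7.0) and rhs[7][0] holds the fill value (3.0).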
      scf.if %10 {
        %11 = memref.load %7[%c45, %c7] : memref<64x8xf32, 3>
        %12 = memref.load %8[%c7, %c0] : memref<8x128xf32, 3>
        gpu.printf "[GPU] TMA LOADED lhs[45][7] %f\0A" %11 : f32
        gpu.printf "[GPU] TMA LOADED rhs[7][0] %f\0A" %12 : f32
      }
      gpu.terminator
    }
    return
  }
}