import numpy as np
from mlir import ir
from mlir.dialects import arith
from mlir.dialects import func
from mlir.dialects import gpu
from mlir.dialects import memref
from mlir.dialects import nvgpu
from mlir.dialects import nvvm
from mlir.dialects import llvm
from mlir.dialects import builtin
from mlir.dialects import scf
from mlir.dialects import vector
from mlir.extras import types as T
TMA_LAST_DIM_F16 = 64 # 128B flaot16
WARP_SIZE = 32
WARP_GROUP_SIZE = WARP_SIZE * 4
PRODUCER_REGISTER_SIZE = 40
CONSUMER_REGISTER_SIZE = 232
PRODUCER_PRIMARY_THREAD = 128
CONSUMER_PRIMARY_THREAD = 0
# C++ uses this value to understand whether it's dynamic or not.
MLIR_DYNAMIC = -9223372036854775808
DEBUG = False
class TmaDescriptorBuilder:
"""A class that builds a TMA descriptor."""
def __init__(self, swizzle, l2promo, oob, interleave, tma_box_shape, memref_ty):
self.swizzle = swizzle # mlir.nvgpu.TensorMapSwizzleKind
self.l2promo = l2promo # mlir.nvgpu.TensorMapL2PromoKind
self.oob = oob # mlir.nvgpu.TensorMapOOBKind
self.interleave = interleave # mlir.nvgpu.TensorMapInterleaveKind
self.tma_box_shape = tma_box_shape
self.memref_ty = memref_ty # MemRefType
@property
def tensormap_descriptor_ty(self):
"""Returns a tensormap descriptor type."""
tensorMemrefType = ir.MemRefType.get(
self.tma_box_shape,
self.memref_ty.element_type,
memory_space=ir.Attribute.parse("3"),
)
return nvgpu.TensorMapDescriptorType.get(
tensorMemrefType,
self.swizzle,
self.l2promo,
self.oob,
self.interleave,
)
def tma_descriptor_op(self, device_ptr):
"""Returns a tensormap descriptor op."""
tma_descriptor_ty = self.tensormap_descriptor_ty
device_unranked_memref = memref.CastOp(
ir.UnrankedMemRefType.get(
self.memref_ty.element_type, self.memref_ty.memory_space
),
device_ptr,
)
tma_descriptor_op = nvgpu.TmaCreateDescriptorOp(
tma_descriptor_ty, device_unranked_memref, map(c, self.tma_box_shape)
)
return tma_descriptor_op.result
def debug_print(fmt, *args, predicate=None, threadNumber=-1, forcePrint=False):
if not DEBUG and not forcePrint:
return
type_formats = []
for arg in args:
ty_format = None
if ir.IndexType.isinstance(arg.type):
ty_format = "%llu"
if ir.IntegerType.isinstance(arg.type):
width = ir.IntegerType(arg.type).width
if width == 64:
ty_format = "%llu"
elif width == 32:
ty_format = "%d"
elif width == 1:
ty_format = "%i"
if ir.F32Type.isinstance(arg.type):
ty_format = "%f"
if ty_format is None:
raise NotImplementedError(arg.type)
type_formats.append(ty_format)
if threadNumber != -1:
tidx = gpu.thread_id(gpu.Dimension.x)
predicate = arith.cmpi(arith.CmpIPredicate.eq, tidx, c(threadNumber))
scf.yield_([])
if_op = scf.IfOp(predicate)
with ir.InsertionPoint(if_op.then_block):
gpu.printf(fmt.format(*type_formats) + "\n", args)
scf.yield_([])
def get_type_size(ty):
if ir.FloatType.isinstance(ty):
return ir.FloatType(ty).width // 8
if ir.IntegerType.isinstance(ty):
return ir.IntegerType(ty).width // 8
raise NotImplementedError(ty)
def get_mlir_ty(dtype):
if dtype == np.float16:
return T.f16()
if dtype == np.float32:
return T.f32()
if dtype == np.float64:
return T.f64()
if dtype == np.int32:
return T.i32()
if dtype == np.int64:
return T.i64()
raise NotImplementedError(dtype)
def c(value, ty=None):
ty = T.index() if ty is None else ty
return arith.constant(ty, value)
def make_kernel_name(
input_type=np.float16,
output_type=np.float32,
M=4096,
N=4096,
K=4096,
BLOCK_M=128,
BLOCK_N=128,
BLOCK_K=128,
num_stages=3,
use_warp_specialization=False,
):
kernelName = "warpspecialized" if use_warp_specialization else "multistage"
return (
kernelName
+ "_"
+ str(M)
+ "x"
+ str(N)
+ "x"
+ str(K)
+ "_"
+ str(BLOCK_M)
+ "x"
+ str(BLOCK_N)
+ "x"
+ str(BLOCK_K)
+ "_"
+ str(num_stages)
)
def generate_matmul_ws(
input_type=np.float16,
output_type=np.float32,
M=4096,
N=4096,
K=4096,
BLOCK_M=128,
BLOCK_N=128,
BLOCK_K=128,
num_stages=3,
):
# Limitaitons for now
assert input_type == np.float16
assert output_type == np.float32
assert BLOCK_M == 128
assert BLOCK_N == 128
assert BLOCK_K == 64
assert M % BLOCK_M == 0
assert N % BLOCK_N == 0
assert K % BLOCK_K == 0
module = ir.Module.create()
token_ty = gpu.AsyncTokenType.get()
a_elem_ty = get_mlir_ty(input_type)
b_elem_ty = get_mlir_ty(input_type)
c_elem_ty = get_mlir_ty(output_type)
a_ty = ir.MemRefType.get([M, K], a_elem_ty)
b_ty = ir.MemRefType.get((K, N), b_elem_ty)
c_ty = ir.MemRefType.get((M, N), c_elem_ty)
a_tile_shape = a_tma_shape = (BLOCK_M, TMA_LAST_DIM_F16)
b_tma_shape = (BLOCK_K, TMA_LAST_DIM_F16)
b_tile_shape = (BLOCK_K, BLOCK_N)
txcount = (b_tile_shape[0] * b_tile_shape[1] * get_type_size(a_elem_ty)) + (
a_tile_shape[0] * a_tile_shape[1] * get_type_size(b_elem_ty)
)
smem_space_str = "#gpu.address_space<workgroup>"
smem_space = ir.Attribute.parse(smem_space_str)
mbar_ty = ir.Type.parse(
"!nvgpu.mbarrier.group<memorySpace = "
+ str(smem_space)
+ ", num_barriers = "
+ str(num_stages)
+ ">"
)
acc_ty = ir.Type.parse(
"!nvgpu.warpgroup.accumulator<fragmented=vector<"
+ str(BLOCK_M)
+ "x"
+ str(BLOCK_N)
+ "x"
+ str(c_elem_ty)
+ ">>"
)
a_wgmma_ty = ir.Type.parse(
"!nvgpu.warpgroup.descriptor<tensor=memref<"
+ str(BLOCK_M)
+ "x"
+ str(BLOCK_K)
+ "x"
+ str(a_elem_ty)
+ ", "
+ smem_space_str
+ ">>"
)
b_wgmma_ty = ir.Type.parse(
"!nvgpu.warpgroup.descriptor<tensor=memref<"
+ str(BLOCK_K)
+ "x"
+ str(BLOCK_N)
+ "x"
+ str(a_elem_ty)
+ ", "
+ smem_space_str
+ ">>"
)
kernelName = make_kernel_name(
input_type, output_type, M, N, K, BLOCK_M, BLOCK_N, BLOCK_K, num_stages, True
)
with ir.InsertionPoint(module.body):
fop = func.FuncOp(kernelName, ([a_ty, b_ty, c_ty], []))
with ir.InsertionPoint(fop.add_entry_block()):
a_host = fop.arguments[0]
b_host = fop.arguments[1]
c_host = fop.arguments[2]
lhs_tile_bytes = BLOCK_M * BLOCK_K * get_type_size(a_elem_ty)
rhs_tile_bytes = BLOCK_N * BLOCK_K * get_type_size(b_elem_ty)
smem_size_input = (lhs_tile_bytes + rhs_tile_bytes) * num_stages
smem_size_output = BLOCK_M * BLOCK_N * get_type_size(c_elem_ty)
smem_size = max(smem_size_input, smem_size_output)
# Step 1. Allocate device memory and memcpy
t1 = gpu.wait(token_ty, [])
a_device, t2 = gpu.alloc(a_ty, token_ty, [t1], [], [])
b_device, t3 = gpu.alloc(b_ty, token_ty, [t2], [], [])
c_device, t4 = gpu.alloc(c_ty, token_ty, [t3], [], [])
t5 = gpu.memcpy(token_ty, [t4], a_device, a_host)
t6 = gpu.memcpy(token_ty, [t5], b_device, b_host)
t7 = gpu.wait(token_ty, [t6])
# Step 2. Create TMA Descriptors
a_tma_desc = TmaDescriptorBuilder(
nvgpu.TensorMapSwizzleKind.SWIZZLE_128B,
nvgpu.TensorMapL2PromoKind.L2PROMO_NONE,
nvgpu.TensorMapOOBKind.OOB_ZERO,
nvgpu.TensorMapInterleaveKind.INTERLEAVE_NONE,
a_tma_shape,
a_ty,
)
b_tma_desc = TmaDescriptorBuilder(
nvgpu.TensorMapSwizzleKind.SWIZZLE_128B,
nvgpu.TensorMapL2PromoKind.L2PROMO_NONE,
nvgpu.TensorMapOOBKind.OOB_ZERO,
nvgpu.TensorMapInterleaveKind.INTERLEAVE_NONE,
b_tma_shape,
b_ty,
)
a_tma_desc_op = a_tma_desc.tma_descriptor_op(a_device)
b_tma_desc_op = b_tma_desc.tma_descriptor_op(b_device)
# Step 3. Launch Kernel with 2 Warpgroups : 1 Producer, 1 Consumer
cta_m = M // BLOCK_M
cta_n = N // BLOCK_N
assert M % BLOCK_M == 0 and N % BLOCK_N == 0
grid = (cta_m, cta_n, 1)
block = (WARP_GROUP_SIZE * 2, 1, 1)
launch_op = gpu.LaunchOp(
token_ty,
[t7],
*map(c, grid),
*map(c, block),
dynamicSharedMemorySize=c(smem_size, ty=T.i32()),
)
launch_op.body.blocks.append(*([T.index()] * 12))
with ir.InsertionPoint(launch_op.body.blocks[0]):
# GPU Step 0. This is need for vectorized ld/st
memref.assume_alignment(c_device, 16)
dynamic_smem = gpu.dynamic_shared_memory(
ir.MemRefType.get((MLIR_DYNAMIC,), T.i8(), memory_space=smem_space)
)
ticks = c(10000000)
# GPU Step 1. Bootstrapping: find the primary thread, warps, warp groups and etc.
tidx = gpu.thread_id(gpu.Dimension.x)
wgPrimaryThread = arith.cmpi(
arith.CmpIPredicate.eq, arith.remui(tidx, c(WARP_GROUP_SIZE)), c(0)
)
warp_id = arith.divui(tidx, c(32))
warpgroup_id = arith.divui(warp_id, c(4))
is_producer = arith.cmpi(
arith.CmpIPredicate.eq,
warpgroup_id,
c(1 if PRODUCER_PRIMARY_THREAD == 128 else 0),
)
is_consumer = arith.cmpi(
arith.CmpIPredicate.eq,
warpgroup_id,
c(0 if CONSUMER_PRIMARY_THREAD == 0 else 1),
)
producerPrimaryThread = arith.cmpi(
arith.CmpIPredicate.eq, tidx, c(PRODUCER_PRIMARY_THREAD)
)
consumerPrimaryThread = arith.cmpi(
arith.CmpIPredicate.eq, tidx, c(CONSUMER_PRIMARY_THREAD)
)
bidx = gpu.block_id(gpu.Dimension.x)
bidy = gpu.block_id(gpu.Dimension.y)
dimX = arith.muli(bidx, c(BLOCK_M))
dimY = arith.muli(bidy, c(BLOCK_N))
# GPU Step 2. Initialize mbarrier groups
mbarTMA = nvgpu.mbarrier_create(mbar_ty)
mbarDONE = nvgpu.mbarrier_create(mbar_ty)
for i in range(num_stages):
nvgpu.mbarrier_init(mbarTMA, c(1), c(i), predicate=wgPrimaryThread)
nvgpu.mbarrier_init(mbarDONE, c(1), c(i), predicate=wgPrimaryThread)
gpu.barrier()
# GPU Step 3. Prefetch TMA descriptors
nvgpu.tma_prefetch_descriptor(a_tma_desc_op, predicate=wgPrimaryThread)
nvgpu.tma_prefetch_descriptor(b_tma_desc_op, predicate=wgPrimaryThread)
ns = num_stages if num_stages == 1 else num_stages - 1
# GPU Step 5. Producer Warpgroup (TMA Warpgroup)
with ir.InsertionPoint(scf.IfOp(is_producer).then_block):
# Step 5.1. Reduce register size
nvvm.setmaxregister(
PRODUCER_REGISTER_SIZE, nvvm.SetMaxRegisterAction.decrease
)
# Step 5.2. TMA Main Loop
for_op = scf.ForOp(
c(0), c(K // BLOCK_K), c(1), [arith.constant(T.bool(), 1)]
)
with ir.InsertionPoint(for_op.body):
phaseParity = for_op.inner_iter_args[0]
iv = for_op.induction_variable
stage = arith.remui(iv, c(num_stages))
# Step 5.2.1. Wait mbarDONE
debug_print(
"[prod] iv={} | mbarDONE[{}] try_wait phase={}",
iv,
stage,
phaseParity,
predicate=producerPrimaryThread,
)
nvgpu.MBarrierTryWaitParityOp(
mbarDONE, phaseParity, ticks, mbarId=stage
)
debug_print(
"[prod] iv={} | mbarDONE[{}] try_wait phase={} [done]",
iv,
stage,
phaseParity,
predicate=producerPrimaryThread,
)
p = arith.cmpi(arith.CmpIPredicate.eq, stage, c(num_stages - 1))
phaseParity = arith.select(
p,
arith.xori(phaseParity, arith.constant(T.bool(), 1)),
phaseParity,
)
# Step 5.2.2. Load TMA
a_offset = arith.muli(stage, c(lhs_tile_bytes))
a_tma_slice = memref.view(
ir.MemRefType.get(
a_tma_shape, a_elem_ty, memory_space=smem_space
),
dynamic_smem,
a_offset,
[],
)
b_offset = arith.addi(
arith.muli(stage, c(rhs_tile_bytes)),
c(lhs_tile_bytes * num_stages),
)
b_tma_slice_1 = memref.view(
ir.MemRefType.get(
b_tma_shape, b_elem_ty, memory_space=smem_space
),
dynamic_smem,
b_offset,
[],
)
b_offset2 = arith.addi(
b_offset,
c(BLOCK_K * TMA_LAST_DIM_F16 * get_type_size(b_elem_ty)),
)
b_tma_slice_2 = memref.view(
ir.MemRefType.get(
b_tma_shape, b_elem_ty, memory_space=smem_space
),
dynamic_smem,
b_offset2,
[],
)
debug_print(
"[prod] a_offset={} b_offset={} b_offset2={}",
a_offset,
b_offset,
b_offset2,
predicate=producerPrimaryThread,
)
coord = arith.muli(c(64), iv)
nvgpu.TmaAsyncLoadOp(
a_tma_slice,
mbarTMA,
a_tma_desc_op,
coordinates=[coord, dimX],
mbarId=stage,
predicate=producerPrimaryThread,
)
nvgpu.TmaAsyncLoadOp(
b_tma_slice_1,
mbarTMA,
b_tma_desc_op,
coordinates=[dimY, coord],
mbarId=stage,
predicate=producerPrimaryThread,
)
dimY2 = arith.addi(dimY, c(64))
nvgpu.TmaAsyncLoadOp(
b_tma_slice_2,
mbarTMA,
b_tma_desc_op,
coordinates=[dimY2, coord],
mbarId=stage,
predicate=producerPrimaryThread,
)
# Step 5.2.3. Arrive mbarTMA
debug_print(
"[prod] iv={} | mbarTMA[{}] arrive",
iv,
stage,
predicate=producerPrimaryThread,
)
nvgpu.mbarrier_arrive_expect_tx(
mbarTMA, c(txcount), stage, predicate=producerPrimaryThread
)
debug_print(
"[prod] iv={} | mbarTMA[{}] arrive [done]",
iv,
stage,
predicate=producerPrimaryThread,
)
scf.yield_([phaseParity])
scf.yield_([])
# GPU Step 6. Consumer Warpgroup (MMA Warpgroup)
if_op = scf.IfOp(is_consumer)
with ir.InsertionPoint(if_op.then_block):
# Step 6.1. Increase register size
nvvm.setmaxregister(
CONSUMER_REGISTER_SIZE, nvvm.SetMaxRegisterAction.increase
)
# GPU Step 6.2. Initialize MMA registers
acc = nvgpu.warpgroup_mma_init_accumulator(acc_ty)
# Step 6.3. MMA Main Loop
for_op = scf.ForOp(
c(0), c(K // BLOCK_K), c(1), [acc, arith.constant(T.bool(), 0)]
)
with ir.InsertionPoint(for_op.body):
# Step 6.3.1. Wait mbar1
phaseParity = for_op.inner_iter_args[1]
iv = for_op.induction_variable
stage = arith.remui(iv, c(num_stages))
debug_print(
"[cons] iv={} | mbarTMA[{}] try_wait phase={}",
iv,
stage,
phaseParity,
predicate=consumerPrimaryThread,
)
nvgpu.MBarrierTryWaitParityOp(
mbarTMA, phaseParity, ticks, mbarId=stage
)
debug_print(
"[cons] iv={} | mbarTMA[{}] try_wait phase={} [done]",
iv,
stage,
phaseParity,
predicate=consumerPrimaryThread,
)
# Step 6.3.2. Create WGMMA Descriptors
a_offset = arith.muli(stage, c(lhs_tile_bytes))
a_tile_slice = memref.view(
ir.MemRefType.get(
a_tile_shape, a_elem_ty, memory_space=smem_space
),
dynamic_smem,
a_offset,
[],
)
b_offset = arith.addi(
arith.muli(stage, c(rhs_tile_bytes)),
c(lhs_tile_bytes * num_stages),
)
b_tile_slice = memref.view(
ir.MemRefType.get(
b_tile_shape, b_elem_ty, memory_space=smem_space
),
dynamic_smem,
b_offset,
[],
)
debug_print(
"[cons] a_offset={} b_offset={}",
a_offset,
b_offset,
predicate=consumerPrimaryThread,
)
da = nvgpu.WarpgroupGenerateDescriptorOp(
a_wgmma_ty, a_tile_slice, a_tma_desc_op
)
db = nvgpu.WarpgroupGenerateDescriptorOp(
b_wgmma_ty, b_tile_slice, b_tma_desc_op
)
# Step 6.3.3. MMA
carry_acc = for_op.inner_iter_args[0]
new_acc = nvgpu.WarpgroupMmaOp(
acc.type, da, db, carry_acc, transposeB=True
)
# Step 6.3.4. Arrive mbarDONE
if num_stages == 1:
p_arrive = consumerPrimaryThread
else:
p1 = arith.cmpi(arith.CmpIPredicate.sgt, iv, c(0))
p_arrive = arith.andi(consumerPrimaryThread, p1)
with ir.InsertionPoint(scf.IfOp(p_arrive).then_block):
p = arith.cmpi(arith.CmpIPredicate.eq, stage, c(0))
barId = arith.select(
p, c(num_stages - 1), arith.subi(stage, c(1))
)
debug_print(
"[cons] iv={} | mbarDONE[{}] arrive ",
iv,
barId,
predicate=consumerPrimaryThread,
)
nvgpu.mbarrier_arrive(
ir.Type.parse("!nvgpu.mbarrier.token"), mbarDONE, barId
)
debug_print(
"[cons] iv={} | mbarDONE[{}] arrive [done]",
iv,
barId,
predicate=consumerPrimaryThread,
)
scf.yield_([])
p = arith.cmpi(arith.CmpIPredicate.eq, stage, c(num_stages - 1))
phaseParity = arith.select(
p,
arith.xori(phaseParity, arith.constant(T.bool(), 1)),
phaseParity,
)
# Step 6.3.5. Yield
scf.yield_([new_acc, phaseParity])
# Step 6.3. Wait All WGMMA
nvvm.WgmmaWaitGroupSyncOp(0)
with ir.InsertionPoint(scf.IfOp(consumerPrimaryThread).then_block):
barId = c((K // BLOCK_K) % num_stages)
nvgpu.mbarrier_arrive(
ir.Type.parse("!nvgpu.mbarrier.token"), mbarDONE, barId
)
scf.yield_([])
# Step 6.4. Epilogue (registers --> shared memory)
acc_smem_ty = ir.MemRefType.get(
(BLOCK_M, BLOCK_N), c_elem_ty, memory_space=smem_space
)
acc_smem = memref.view(acc_smem_ty, dynamic_smem, c(0), [])
debug_print("[cons] | Storing", predicate=consumerPrimaryThread)
nvgpu.WarpgroupMmaStoreOp(for_op.results[0], acc_smem)
scf.yield_([])
gpu.barrier()
# GPU Step 9. Epilogue (shared memory --> global memory)
fd = ir.MemRefType.get(
[BLOCK_M * BLOCK_N], c_elem_ty, memory_space=smem_space
)
collapsed_smem = memref.view(fd, dynamic_smem, c(0), [])
rty = ir.MemRefType.get(
(BLOCK_M, BLOCK_N),
c_elem_ty,
ir.Attribute.parse("strided<[" + str(N) + ", 1], offset: ?>"),
)
c_device_per_block = memref.SubViewOp(
rty,
c_device,
[dimX, dimY],
[],
[],
[MLIR_DYNAMIC, MLIR_DYNAMIC],
[BLOCK_M, BLOCK_N],
[1, 1],
)
vlen = 1
for_op = scf.ForOp(
tidx, c(BLOCK_M * BLOCK_N), c(vlen * WARP_GROUP_SIZE * 2)
)
with ir.InsertionPoint(for_op.body):
x = arith.divui(for_op.induction_variable, c(BLOCK_M))
y = arith.remui(for_op.induction_variable, c(BLOCK_N))
vdata = vector.load(
ir.VectorType.get((vlen,), c_elem_ty),
collapsed_smem,
[for_op.induction_variable],
)
vector.store(vdata, c_device_per_block, [x, y])
scf.yield_([])
gpu.terminator()
# Step 4. Copy back to host
t8 = gpu.wait(token_ty, [launch_op])
t9 = gpu.memcpy(token_ty, [t8], c_host, c_device)
gpu.dealloc(token_ty, [t8], a_device)
gpu.dealloc(token_ty, [t8], b_device)
gpu.wait(token_ty, [t9])
gpu.dealloc(token_ty, [t8], c_device)
func.ReturnOp([])
fop.attributes["llvm.emit_c_interface"] = ir.UnitAttr.get()
module.operation.verify()
return module
def generate_matmul_multistage(
input_type=np.float16,
output_type=np.float32,
M=4096,
N=4096,
K=4096,
BLOCK_M=128,
BLOCK_N=128,
BLOCK_K=64,
num_stages=3,
):
# Limitaitons for now
assert input_type == np.float16
assert output_type == np.float32
assert BLOCK_M == 128
assert BLOCK_N == 128
assert BLOCK_K == 64
assert M % BLOCK_M == 0
assert N % BLOCK_N == 0
assert K % BLOCK_K == 0
module = ir.Module.create()
token_ty = gpu.AsyncTokenType.get()
a_elem_ty = get_mlir_ty(input_type)
b_elem_ty = get_mlir_ty(input_type)
c_elem_ty = get_mlir_ty(output_type)
a_ty = ir.MemRefType.get([M, K], a_elem_ty)
b_ty = ir.MemRefType.get((K, N), b_elem_ty)
c_ty = ir.MemRefType.get((M, N), c_elem_ty)
a_tile_shape = a_tma_shape = (BLOCK_M, TMA_LAST_DIM_F16)
b_tma_shape = (BLOCK_K, TMA_LAST_DIM_F16)
b_tile_shape = (BLOCK_K, BLOCK_N)
txcount = (b_tile_shape[0] * b_tile_shape[1] * get_type_size(a_elem_ty)) + (
a_tile_shape[0] * a_tile_shape[1] * get_type_size(b_elem_ty)
)
smem_space_str = "#gpu.address_space<workgroup>"
smem_space = ir.Attribute.parse(smem_space_str)
mbar_ty = ir.Type.parse(
"!nvgpu.mbarrier.group<memorySpace = "
+ str(smem_space)
+ ", num_barriers = "
+ str(num_stages)
+ ">"
)
acc_ty = ir.Type.parse(
"!nvgpu.warpgroup.accumulator<fragmented=vector<"
+ str(BLOCK_M)
+ "x"
+ str(BLOCK_N)
+ "x"
+ str(c_elem_ty)
+ ">>"
)
a_wgmma_ty = ir.Type.parse(
"!nvgpu.warpgroup.descriptor<tensor=memref<"
+ str(BLOCK_M)
+ "x"
+ str(BLOCK_K)
+ "x"
+ str(a_elem_ty)
+ ", "
+ smem_space_str
+ ">>"
)
b_wgmma_ty = ir.Type.parse(
"!nvgpu.warpgroup.descriptor<tensor=memref<"
+ str(BLOCK_K)
+ "x"
+ str(BLOCK_N)
+ "x"
+ str(a_elem_ty)
+ ", "
+ smem_space_str
+ ">>"
)
with ir.InsertionPoint(module.body):
kernelName = make_kernel_name(
input_type,
output_type,
M,
N,
K,
BLOCK_M,
BLOCK_N,
BLOCK_K,
num_stages,
False,
)
fop = func.FuncOp(kernelName, ([a_ty, b_ty, c_ty], []))
with ir.InsertionPoint(fop.add_entry_block()):
a_host = fop.arguments[0]
b_host = fop.arguments[1]
c_host = fop.arguments[2]
lhs_tile_bytes = BLOCK_M * BLOCK_K * get_type_size(a_elem_ty)
rhs_tile_bytes = BLOCK_N * BLOCK_K * get_type_size(b_elem_ty)
smem_size_input = (lhs_tile_bytes + rhs_tile_bytes) * num_stages
smem_size_output = BLOCK_M * BLOCK_N * get_type_size(c_elem_ty)
smem_size = max(smem_size_input, smem_size_output)
# Step 1. Allocate device memory and memcpy
t1 = gpu.wait(token_ty, [])
a_device, t2 = gpu.alloc(a_ty, token_ty, [t1], [], [])
b_device, t3 = gpu.alloc(b_ty, token_ty, [t2], [], [])
c_device, t4 = gpu.alloc(c_ty, token_ty, [t3], [], [])
t5 = gpu.memcpy(token_ty, [t4], a_device, a_host)
t6 = gpu.memcpy(token_ty, [t5], b_device, b_host)
t7 = gpu.wait(token_ty, [t6])
# Step 2. Create TMA Descriptors
a_tma_desc = TmaDescriptorBuilder(
nvgpu.TensorMapSwizzleKind.SWIZZLE_128B,
nvgpu.TensorMapL2PromoKind.L2PROMO_NONE,
nvgpu.TensorMapOOBKind.OOB_ZERO,
nvgpu.TensorMapInterleaveKind.INTERLEAVE_NONE,
a_tma_shape,
a_ty,
)
b_tma_desc = TmaDescriptorBuilder(
nvgpu.TensorMapSwizzleKind.SWIZZLE_128B,
nvgpu.TensorMapL2PromoKind.L2PROMO_NONE,
nvgpu.TensorMapOOBKind.OOB_ZERO,
nvgpu.TensorMapInterleaveKind.INTERLEAVE_NONE,
b_tma_shape,
b_ty,
)
a_tma_desc_op = a_tma_desc.tma_descriptor_op(a_device)
b_tma_desc_op = b_tma_desc.tma_descriptor_op(b_device)
# Step 3. Launch Kernel with 1 Warpgroup
cta_m = M // BLOCK_M
cta_n = N // BLOCK_N
assert M % BLOCK_M == 0 and N % BLOCK_N == 0
grid = (cta_m, cta_n, 1)
block = (WARP_GROUP_SIZE, 1, 1)
launch_op = gpu.LaunchOp(
token_ty,
[t7],
*map(c, grid),
*map(c, block),
dynamicSharedMemorySize=c(smem_size, ty=T.i32()),
)
launch_op.body.blocks.append(*([T.index()] * 12))
with ir.InsertionPoint(launch_op.body.blocks[0]):
# GPU Step 0. Bootstrapping
memref.assume_alignment(c_device, 16)
dynamic_smem = gpu.dynamic_shared_memory(
ir.MemRefType.get((MLIR_DYNAMIC,), T.i8(), memory_space=smem_space)
)
ticks = c(10000000)
tidx = gpu.thread_id(gpu.Dimension.x)
primaryThread = arith.cmpi(arith.CmpIPredicate.eq, tidx, c(0))
warpId = arith.divui(tidx, c(32))
bidx = gpu.block_id(gpu.Dimension.x)
bidy = gpu.block_id(gpu.Dimension.y)
dimX = arith.muli(bidx, c(BLOCK_M))
dimY = arith.muli(bidy, c(BLOCK_N))
# GPU Step 1. Initialize mbarrier groups
mbarTMA = nvgpu.mbarrier_create(mbar_ty)
for i in range(num_stages):
nvgpu.mbarrier_init(mbarTMA, c(1), c(i), predicate=primaryThread)
gpu.barrier()
# GPU Step 2. Prefetch TMA descriptors
nvgpu.tma_prefetch_descriptor(a_tma_desc_op, predicate=primaryThread)
nvgpu.tma_prefetch_descriptor(b_tma_desc_op, predicate=primaryThread)
# GPU Step 3. Prologue (global memory --> shared memory)
ns = num_stages if num_stages == 1 else num_stages - 1
for_op = scf.ForOp(c(0), c(ns), c(1))
with ir.InsertionPoint(for_op.body):
iv = for_op.induction_variable
# Step 3.1. Calculate offsets
a_offset = arith.muli(iv, c(lhs_tile_bytes))
a_tma_slice = memref.view(
ir.MemRefType.get(
a_tma_shape, a_elem_ty, memory_space=smem_space
),
dynamic_smem,
a_offset,
[],
)
b_offset = arith.addi(
arith.muli(iv, c(rhs_tile_bytes)),
c(lhs_tile_bytes * num_stages),
)
b_tma_slice_1 = memref.view(
ir.MemRefType.get(
b_tma_shape, b_elem_ty, memory_space=smem_space
),
dynamic_smem,
b_offset,
[],
)
b_offset2 = arith.addi(
b_offset,
c(BLOCK_K * TMA_LAST_DIM_F16 * get_type_size(b_elem_ty)),
)
b_tma_slice_2 = memref.view(
ir.MemRefType.get(
b_tma_shape, b_elem_ty, memory_space=smem_space
),
dynamic_smem,
b_offset2,
[],
)
# Step 3.2. TMA Load
coord = arith.muli(c(64), iv)
dimY2 = arith.addi(dimY, c(64))
debug_print(
"[Prologue] TMA Load a_offset={} b_offset={} b_offset2={} @ a=({},{}) b=({},{})",
a_offset,
b_offset,
b_offset2,
coord,
dimX,
dimY,
coord,
predicate=primaryThread,
)
nvgpu.TmaAsyncLoadOp(
a_tma_slice,
mbarTMA,
a_tma_desc_op,
coordinates=[coord, dimX],
mbarId=iv,
predicate=primaryThread,
)
nvgpu.TmaAsyncLoadOp(
b_tma_slice_1,
mbarTMA,
b_tma_desc_op,
coordinates=[dimY, coord],
mbarId=iv,
predicate=primaryThread,
)
nvgpu.TmaAsyncLoadOp(
b_tma_slice_2,
mbarTMA,
b_tma_desc_op,
coordinates=[dimY2, coord],
mbarId=iv,
predicate=primaryThread,
)
# Step 3.2. mbarTMA arrive
debug_print(
"[Prologue] mbarTMA[{}] arrive", iv, predicate=primaryThread
)
nvgpu.mbarrier_arrive_expect_tx(
mbarTMA, c(txcount), iv, predicate=primaryThread
)
debug_print(
"[Prologue] mbarTMA[{}] arrive [done]",
iv,
predicate=primaryThread,
)
scf.yield_([])
# GPU Step 4. Main Loop
acc = nvgpu.warpgroup_mma_init_accumulator(acc_ty)
for_op = scf.ForOp(
c(0), c(K // BLOCK_K), c(1), [acc, arith.constant(T.bool(), 0)]
)
with ir.InsertionPoint(for_op.body):
# Step 4.1. Wait mbarTMA
phaseParity = for_op.inner_iter_args[1]
iv = for_op.induction_variable
stage = arith.remui(iv, c(num_stages))
debug_print(
"[MainLoop] mbarTMA[{}] try_wait phase={}",
stage,
phaseParity,
predicate=primaryThread,
)
nvgpu.MBarrierTryWaitParityOp(
mbarTMA, phaseParity, ticks, mbarId=stage
)
debug_print(
"[MainLoop] mbarTMA[{}] try_wait phase={} [done]",
stage,
phaseParity,
predicate=primaryThread,
)
# Step 4.2. Create WGMMA Descriptors
a_offset = arith.muli(stage, c(lhs_tile_bytes))
a_tile_slice = memref.view(
ir.MemRefType.get(
a_tile_shape, a_elem_ty, memory_space=smem_space
),
dynamic_smem,
a_offset,
[],
)
b_offset = arith.addi(
arith.muli(stage, c(rhs_tile_bytes)),
c(lhs_tile_bytes * num_stages),
)
b_tile_slice = memref.view(
ir.MemRefType.get(
b_tile_shape, b_elem_ty, memory_space=smem_space
),
dynamic_smem,
b_offset,
[],
)
debug_print(
"[MainLoop] iv={} MMA a_offset={} b_offset={}",
iv,
a_offset,
b_offset,
predicate=primaryThread,
)
da = nvgpu.WarpgroupGenerateDescriptorOp(
a_wgmma_ty, a_tile_slice, a_tma_desc_op
)
db = nvgpu.WarpgroupGenerateDescriptorOp(
b_wgmma_ty, b_tile_slice, b_tma_desc_op
)
# Step 4.3. MMA
carry_acc = for_op.inner_iter_args[0]
new_acc = nvgpu.WarpgroupMmaOp(
acc.type, da, db, carry_acc, transposeB=True
)
if num_stages == 1:
nvvm.WgmmaWaitGroupSyncOp(0)
# Step 4.4. Load TMA for next stage
p1 = arith.cmpi(
arith.CmpIPredicate.ult,
arith.addi(iv, c(ns)),
c(K // BLOCK_K),
)
p = arith.andi(primaryThread, p1)
nextStage = arith.addi(iv, c(ns))
nextSlot = arith.remui(nextStage, c(num_stages))
a_offset = arith.muli(nextSlot, c(lhs_tile_bytes))
debug_print(
"[MainLoop] mbarTMA[{}] arrive",
nextSlot,
predicate=p,
)
nvgpu.mbarrier_arrive_expect_tx(
mbarTMA, c(txcount), nextSlot, predicate=p
)
debug_print(
"[MainLoop] mbarTMA[{}] arrive [done]",
nextSlot,
predicate=p,
)
a_tma_slice = memref.view(
ir.MemRefType.get(
a_tma_shape, a_elem_ty, memory_space=smem_space
),
dynamic_smem,
a_offset,
[],
)
b_offset = arith.addi(
arith.muli(nextSlot, c(rhs_tile_bytes)),
c(lhs_tile_bytes * num_stages),
)
b_tma_slice_1 = memref.view(
ir.MemRefType.get(
b_tma_shape, b_elem_ty, memory_space=smem_space
),
dynamic_smem,
b_offset,
[],
)
b_offset2 = arith.addi(
b_offset,
c(BLOCK_K * TMA_LAST_DIM_F16 * get_type_size(b_elem_ty)),
)
b_tma_slice_2 = memref.view(
ir.MemRefType.get(
b_tma_shape, b_elem_ty, memory_space=smem_space
),
dynamic_smem,
b_offset2,
[],
)
coord = arith.muli(c(64), nextStage)
debug_print(
"[MainLoop] iv={} TMA Load a_offset={} b_offset={} b_offset2={} @ a=({},{}) b=({},{})",
iv,
a_offset,
b_offset,
b_offset2,
coord,
dimX,
dimY,
coord,
predicate=p,
)
nvgpu.TmaAsyncLoadOp(
a_tma_slice,
mbarTMA,
a_tma_desc_op,
coordinates=[coord, dimX],
mbarId=nextSlot,
predicate=p,
)
nvgpu.TmaAsyncLoadOp(
b_tma_slice_1,
mbarTMA,
b_tma_desc_op,
coordinates=[dimY, coord],
mbarId=nextSlot,
predicate=p,
)
dimY2 = arith.addi(dimY, c(64))
nvgpu.TmaAsyncLoadOp(
b_tma_slice_2,
mbarTMA,
b_tma_desc_op,
coordinates=[dimY2, coord],
mbarId=nextSlot,
predicate=p,
)
# Step 4.5. Change the phaseParity
p = arith.cmpi(arith.CmpIPredicate.eq, stage, c(num_stages - 1))
phaseParity = arith.select(
p,
arith.xori(phaseParity, arith.constant(T.bool(), 1)),
phaseParity,
)
# Step 4.5. Yield
scf.yield_([new_acc, phaseParity])
# Step 5. Wait All WGMMA groups
nvvm.WgmmaWaitGroupSyncOp(0)
# Step 6. Epilogue (registers --> shared memory)
acc_smem_ty = ir.MemRefType.get(
(BLOCK_M, BLOCK_N), c_elem_ty, memory_space=smem_space
)
acc_smem = memref.view(acc_smem_ty, dynamic_smem, c(0), [])
debug_print("Storing", predicate=primaryThread)
nvgpu.WarpgroupMmaStoreOp(for_op.results[0], acc_smem)
gpu.barrier()
# GPU Step 7. Epilogue (shared memory --> global memory)
fd = ir.MemRefType.get(
[BLOCK_M * BLOCK_N], c_elem_ty, memory_space=smem_space
)
collapsed_smem = memref.view(fd, dynamic_smem, c(0), [])
rty = ir.MemRefType.get(
(BLOCK_M, BLOCK_N),
c_elem_ty,
ir.Attribute.parse("strided<[" + str(N) + ", 1], offset: ?>"),
)
c_device_per_block = memref.SubViewOp(
rty,
c_device,
[dimX, dimY],
[],
[],
[MLIR_DYNAMIC, MLIR_DYNAMIC],
[BLOCK_M, BLOCK_N],
[1, 1],
)
vlen = 1
for_op = scf.ForOp(
tidx, c(BLOCK_M * BLOCK_N), c(vlen * WARP_GROUP_SIZE)
)
with ir.InsertionPoint(for_op.body):
x = arith.divui(for_op.induction_variable, c(BLOCK_M))
y = arith.remui(for_op.induction_variable, c(BLOCK_N))
vdata = vector.load(
ir.VectorType.get((vlen,), c_elem_ty),
collapsed_smem,
[for_op.induction_variable],
)
vector.store(vdata, c_device_per_block, [x, y])
scf.yield_([])
gpu.terminator()
# Step 4. Copy back to host
t8 = gpu.wait(token_ty, [launch_op])
t9 = gpu.memcpy(token_ty, [t8], c_host, c_device)
gpu.dealloc(token_ty, [t8], a_device)
gpu.dealloc(token_ty, [t8], b_device)
gpu.wait(token_ty, [t9])
gpu.dealloc(token_ty, [t8], c_device)
func.ReturnOp([])
fop.attributes["llvm.emit_c_interface"] = ir.UnitAttr.get()
module.operation.verify()
return module