llvm/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12.cl

// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -emit-llvm -o - %s | FileCheck %s

// REQUIRES: amdgpu-registered-target

typedef unsigned int uint;

// CHECK-LABEL: @test_s_sleep_var(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[D_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
// CHECK-NEXT:    store i32 [[D:%.*]], ptr addrspace(5) [[D_ADDR]], align 4
// CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr addrspace(5) [[D_ADDR]], align 4
// CHECK-NEXT:    call void @llvm.amdgcn.s.sleep.var(i32 [[TMP0]])
// CHECK-NEXT:    call void @llvm.amdgcn.s.sleep.var(i32 15)
// CHECK-NEXT:    ret void
//
void test_s_sleep_var(int d)
{
  __builtin_amdgcn_s_sleep_var(d);
  __builtin_amdgcn_s_sleep_var(15);
}

// CHECK-LABEL: @test_permlane16_var(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
// CHECK-NEXT:    [[C_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
// CHECK-NEXT:    store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT:    store i32 [[A:%.*]], ptr addrspace(5) [[A_ADDR]], align 4
// CHECK-NEXT:    store i32 [[B:%.*]], ptr addrspace(5) [[B_ADDR]], align 4
// CHECK-NEXT:    store i32 [[C:%.*]], ptr addrspace(5) [[C_ADDR]], align 4
// CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr addrspace(5) [[A_ADDR]], align 4
// CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(5) [[B_ADDR]], align 4
// CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr addrspace(5) [[C_ADDR]], align 4
// CHECK-NEXT:    [[TMP3:%.*]] = call i32 @llvm.amdgcn.permlane16.var(i32 [[TMP0]], i32 [[TMP1]], i32 [[TMP2]], i1 false, i1 false)
// CHECK-NEXT:    [[TMP4:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT:    store i32 [[TMP3]], ptr addrspace(1) [[TMP4]], align 4
// CHECK-NEXT:    ret void
//
void test_permlane16_var(global uint* out, uint a, uint b, uint c) {
  *out = __builtin_amdgcn_permlane16_var(a, b, c, 0, 0);
}

// CHECK-LABEL: @test_permlanex16_var(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
// CHECK-NEXT:    [[C_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
// CHECK-NEXT:    store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT:    store i32 [[A:%.*]], ptr addrspace(5) [[A_ADDR]], align 4
// CHECK-NEXT:    store i32 [[B:%.*]], ptr addrspace(5) [[B_ADDR]], align 4
// CHECK-NEXT:    store i32 [[C:%.*]], ptr addrspace(5) [[C_ADDR]], align 4
// CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr addrspace(5) [[A_ADDR]], align 4
// CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(5) [[B_ADDR]], align 4
// CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr addrspace(5) [[C_ADDR]], align 4
// CHECK-NEXT:    [[TMP3:%.*]] = call i32 @llvm.amdgcn.permlanex16.var(i32 [[TMP0]], i32 [[TMP1]], i32 [[TMP2]], i1 false, i1 false)
// CHECK-NEXT:    [[TMP4:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT:    store i32 [[TMP3]], ptr addrspace(1) [[TMP4]], align 4
// CHECK-NEXT:    ret void
//
void test_permlanex16_var(global uint* out, uint a, uint b, uint c) {
  *out = __builtin_amdgcn_permlanex16_var(a, b, c, 0, 0);
}

// CHECK-LABEL: @test_s_barrier_signal(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    call void @llvm.amdgcn.s.barrier.signal(i32 -1)
// CHECK-NEXT:    call void @llvm.amdgcn.s.barrier.wait(i16 -1)
// CHECK-NEXT:    ret void
//
void test_s_barrier_signal()
{
  __builtin_amdgcn_s_barrier_signal(-1);
  __builtin_amdgcn_s_barrier_wait(-1);
}

// CHECK-LABEL: @test_s_barrier_signal_var(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
// CHECK-NEXT:    store i32 [[A:%.*]], ptr addrspace(5) [[A_ADDR]], align 4
// CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr addrspace(5) [[A_ADDR]], align 4
// CHECK-NEXT:    call void @llvm.amdgcn.s.barrier.signal.var(i32 [[TMP0]])
// CHECK-NEXT:    ret void
//
void test_s_barrier_signal_var(int a)
{
  __builtin_amdgcn_s_barrier_signal_var(a);
}

// CHECK-LABEL: @test_s_barrier_signal_isfirst(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
// CHECK-NEXT:    [[C_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
// CHECK-NEXT:    store ptr [[A:%.*]], ptr addrspace(5) [[A_ADDR]], align 8
// CHECK-NEXT:    store ptr [[B:%.*]], ptr addrspace(5) [[B_ADDR]], align 8
// CHECK-NEXT:    store ptr [[C:%.*]], ptr addrspace(5) [[C_ADDR]], align 8
// CHECK-NEXT:    [[TMP0:%.*]] = call i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32 1)
// CHECK-NEXT:    br i1 [[TMP0]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
// CHECK:       if.then:
// CHECK-NEXT:    [[TMP1:%.*]] = load ptr, ptr addrspace(5) [[B_ADDR]], align 8
// CHECK-NEXT:    store ptr [[TMP1]], ptr addrspace(5) [[A_ADDR]], align 8
// CHECK-NEXT:    br label [[IF_END:%.*]]
// CHECK:       if.else:
// CHECK-NEXT:    [[TMP2:%.*]] = load ptr, ptr addrspace(5) [[C_ADDR]], align 8
// CHECK-NEXT:    store ptr [[TMP2]], ptr addrspace(5) [[A_ADDR]], align 8
// CHECK-NEXT:    br label [[IF_END]]
// CHECK:       if.end:
// CHECK-NEXT:    call void @llvm.amdgcn.s.barrier.wait(i16 1)
// CHECK-NEXT:    ret void
//
void test_s_barrier_signal_isfirst(int* a, int* b, int *c)
{
  if(__builtin_amdgcn_s_barrier_signal_isfirst(1))
    a = b;
  else
    a = c;

  __builtin_amdgcn_s_barrier_wait(1);
}

// CHECK-LABEL: @test_s_barrier_isfirst_var(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
// CHECK-NEXT:    [[C_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
// CHECK-NEXT:    [[D_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
// CHECK-NEXT:    store ptr [[A:%.*]], ptr addrspace(5) [[A_ADDR]], align 8
// CHECK-NEXT:    store ptr [[B:%.*]], ptr addrspace(5) [[B_ADDR]], align 8
// CHECK-NEXT:    store ptr [[C:%.*]], ptr addrspace(5) [[C_ADDR]], align 8
// CHECK-NEXT:    store i32 [[D:%.*]], ptr addrspace(5) [[D_ADDR]], align 4
// CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr addrspace(5) [[D_ADDR]], align 4
// CHECK-NEXT:    [[TMP1:%.*]] = call i1 @llvm.amdgcn.s.barrier.signal.isfirst.var(i32 [[TMP0]])
// CHECK-NEXT:    br i1 [[TMP1]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
// CHECK:       if.then:
// CHECK-NEXT:    [[TMP2:%.*]] = load ptr, ptr addrspace(5) [[B_ADDR]], align 8
// CHECK-NEXT:    store ptr [[TMP2]], ptr addrspace(5) [[A_ADDR]], align 8
// CHECK-NEXT:    br label [[IF_END:%.*]]
// CHECK:       if.else:
// CHECK-NEXT:    [[TMP3:%.*]] = load ptr, ptr addrspace(5) [[C_ADDR]], align 8
// CHECK-NEXT:    store ptr [[TMP3]], ptr addrspace(5) [[A_ADDR]], align 8
// CHECK-NEXT:    br label [[IF_END]]
// CHECK:       if.end:
// CHECK-NEXT:    call void @llvm.amdgcn.s.barrier.wait(i16 1)
// CHECK-NEXT:    ret void
//
void test_s_barrier_isfirst_var(int* a, int* b, int *c, int d)
{
 if ( __builtin_amdgcn_s_barrier_signal_isfirst_var(d))
    a = b;
  else
    a = c;

  __builtin_amdgcn_s_barrier_wait(1);

}

// CHECK-LABEL: @test_s_barrier_init(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
// CHECK-NEXT:    store i32 [[A:%.*]], ptr addrspace(5) [[A_ADDR]], align 4
// CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr addrspace(5) [[A_ADDR]], align 4
// CHECK-NEXT:    call void @llvm.amdgcn.s.barrier.init(i32 1, i32 [[TMP0]])
// CHECK-NEXT:    ret void
//
void test_s_barrier_init(int a)
{
  __builtin_amdgcn_s_barrier_init(1, a);
}

// CHECK-LABEL: @test_s_barrier_join(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    call void @llvm.amdgcn.s.barrier.join(i32 1)
// CHECK-NEXT:    ret void
//
void test_s_barrier_join()
{
  __builtin_amdgcn_s_barrier_join(1);
}

// CHECK-LABEL: @test_s_wakeup_barrier(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    call void @llvm.amdgcn.s.barrier.join(i32 1)
// CHECK-NEXT:    ret void
//
void test_s_wakeup_barrier()
{
  __builtin_amdgcn_s_barrier_join(1);
}

// CHECK-LABEL: @test_s_barrier_leave(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
// CHECK-NEXT:    [[C_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
// CHECK-NEXT:    store ptr [[A:%.*]], ptr addrspace(5) [[A_ADDR]], align 8
// CHECK-NEXT:    store ptr [[B:%.*]], ptr addrspace(5) [[B_ADDR]], align 8
// CHECK-NEXT:    store ptr [[C:%.*]], ptr addrspace(5) [[C_ADDR]], align 8
// CHECK-NEXT:    [[TMP0:%.*]] = call i1 @llvm.amdgcn.s.barrier.leave()
// CHECK-NEXT:    br i1 [[TMP0]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
// CHECK:       if.then:
// CHECK-NEXT:    [[TMP1:%.*]] = load ptr, ptr addrspace(5) [[B_ADDR]], align 8
// CHECK-NEXT:    store ptr [[TMP1]], ptr addrspace(5) [[A_ADDR]], align 8
// CHECK-NEXT:    br label [[IF_END:%.*]]
// CHECK:       if.else:
// CHECK-NEXT:    [[TMP2:%.*]] = load ptr, ptr addrspace(5) [[C_ADDR]], align 8
// CHECK-NEXT:    store ptr [[TMP2]], ptr addrspace(5) [[A_ADDR]], align 8
// CHECK-NEXT:    br label [[IF_END]]
// CHECK:       if.end:
// CHECK-NEXT:    ret void
//
void test_s_barrier_leave(int* a, int* b, int *c)
{
  if (__builtin_amdgcn_s_barrier_leave())
    a = b;
  else
    a = c;
}

// CHECK-LABEL: @test_s_get_barrier_state(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
// CHECK-NEXT:    [[STATE:%.*]] = alloca i32, align 4, addrspace(5)
// CHECK-NEXT:    store i32 [[A:%.*]], ptr addrspace(5) [[A_ADDR]], align 4
// CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr addrspace(5) [[A_ADDR]], align 4
// CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.amdgcn.s.get.barrier.state(i32 [[TMP0]])
// CHECK-NEXT:    store i32 [[TMP1]], ptr addrspace(5) [[STATE]], align 4
// CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr addrspace(5) [[STATE]], align 4
// CHECK-NEXT:    ret i32 [[TMP2]]
//
unsigned test_s_get_barrier_state(int a)
{
  unsigned State = __builtin_amdgcn_s_get_barrier_state(a);
  return State;
}

// CHECK-LABEL: @test_s_ttracedata(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    call void @llvm.amdgcn.s.ttracedata(i32 1)
// CHECK-NEXT:    ret void
//
void test_s_ttracedata()
{
  __builtin_amdgcn_s_ttracedata(1);
}

// CHECK-LABEL: @test_s_ttracedata_imm(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    call void @llvm.amdgcn.s.ttracedata.imm(i16 1)
// CHECK-NEXT:    ret void
//
void test_s_ttracedata_imm()
{
  __builtin_amdgcn_s_ttracedata_imm(1);
}

// CHECK-LABEL: @test_s_prefetch_data(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[FP_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
// CHECK-NEXT:    [[GP_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
// CHECK-NEXT:    [[CP_ADDR:%.*]] = alloca ptr addrspace(4), align 8, addrspace(5)
// CHECK-NEXT:    [[LEN_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
// CHECK-NEXT:    store ptr [[FP:%.*]], ptr addrspace(5) [[FP_ADDR]], align 8
// CHECK-NEXT:    store ptr addrspace(1) [[GP:%.*]], ptr addrspace(5) [[GP_ADDR]], align 8
// CHECK-NEXT:    store ptr addrspace(4) [[CP:%.*]], ptr addrspace(5) [[CP_ADDR]], align 8
// CHECK-NEXT:    store i32 [[LEN:%.*]], ptr addrspace(5) [[LEN_ADDR]], align 4
// CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr addrspace(5) [[FP_ADDR]], align 8
// CHECK-NEXT:    call void @llvm.amdgcn.s.prefetch.data.p0(ptr [[TMP0]], i32 0)
// CHECK-NEXT:    [[TMP1:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[GP_ADDR]], align 8
// CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr addrspace(5) [[LEN_ADDR]], align 4
// CHECK-NEXT:    call void @llvm.amdgcn.s.prefetch.data.p1(ptr addrspace(1) [[TMP1]], i32 [[TMP2]])
// CHECK-NEXT:    [[TMP3:%.*]] = load ptr addrspace(4), ptr addrspace(5) [[CP_ADDR]], align 8
// CHECK-NEXT:    call void @llvm.amdgcn.s.prefetch.data.p4(ptr addrspace(4) [[TMP3]], i32 31)
// CHECK-NEXT:    ret void
//
void test_s_prefetch_data(int *fp, global float *gp, constant char *cp, unsigned int len)
{
  __builtin_amdgcn_s_prefetch_data(fp, 0);
  __builtin_amdgcn_s_prefetch_data(gp, len);
  __builtin_amdgcn_s_prefetch_data(cp, 31);
}

// CHECK-LABEL: @test_s_buffer_prefetch_data(
// CHECK-NEXT:  entry:
// CHECK-NEXT:    [[RSRC_ADDR:%.*]] = alloca ptr addrspace(8), align 16, addrspace(5)
// CHECK-NEXT:    [[LEN_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
// CHECK-NEXT:    store ptr addrspace(8) [[RSRC:%.*]], ptr addrspace(5) [[RSRC_ADDR]], align 16
// CHECK-NEXT:    store i32 [[LEN:%.*]], ptr addrspace(5) [[LEN_ADDR]], align 4
// CHECK-NEXT:    [[TMP0:%.*]] = load ptr addrspace(8), ptr addrspace(5) [[RSRC_ADDR]], align 16
// CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(5) [[LEN_ADDR]], align 4
// CHECK-NEXT:    call void @llvm.amdgcn.s.buffer.prefetch.data(ptr addrspace(8) [[TMP0]], i32 128, i32 [[TMP1]])
// CHECK-NEXT:    [[TMP2:%.*]] = load ptr addrspace(8), ptr addrspace(5) [[RSRC_ADDR]], align 16
// CHECK-NEXT:    call void @llvm.amdgcn.s.buffer.prefetch.data(ptr addrspace(8) [[TMP2]], i32 0, i32 31)
// CHECK-NEXT:    ret void
//
void test_s_buffer_prefetch_data(__amdgpu_buffer_rsrc_t rsrc, unsigned int len)
{
  __builtin_amdgcn_s_buffer_prefetch_data(rsrc, 128, len);
  __builtin_amdgcn_s_buffer_prefetch_data(rsrc, 0, 31);
}