; llvm/llvm/test/CodeGen/NVPTX/inline-asm-b128-test1.ll

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --extra_scrub --version 5
; RUN: llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx83 | FileCheck %s
; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx83 | %ptxas-verify -arch=sm_70 %}

target triple = "nvptx64-nvidia-cuda"

; Zero-initialised, 16-byte-aligned i128 in addrspace(1) with internal linkage.
; The test functions visible in this file use it as the destination of their
; 128-bit stores, either through inline asm or through a plain IR store.
@value = internal addrspace(1) global i128 0, align 16

; Passes an i128 constant (42) to inline asm through the "q" constraint.
; The "l" operand is the address of @value, addrspacecast from addrspace(1)
; to a plain ptr. The expected PTX below materialises the constant in two
; 64-bit registers (42 and 0), packs them into a .b128 register with
; mov.b128, and hands that register to the st.b128 in the asm body.
define void @test_b128_input_from_const() {
; CHECK-LABEL: test_b128_input_from_const(
; CHECK:       {
; CHECK-NEXT:    .reg .b64 %rd<5>;
; CHECK-NEXT:    .reg .b128 %rq<2>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    mov.u64 %rd2, 0;
; CHECK-NEXT:    mov.u64 %rd3, 42;
; CHECK-NEXT:    mov.b128 %rq1, {%rd3, %rd2};
; CHECK-NEXT:    mov.u64 %rd4, value;
; CHECK-NEXT:    cvta.global.u64 %rd1, %rd4;
; CHECK-NEXT:    // begin inline asm
; CHECK-NEXT:    { st.b128 [%rd1], %rq1; }
; CHECK-NEXT:    // end inline asm
; CHECK-NEXT:    ret;
  ; $0 = address of @value ("l"), $1 = the 128-bit immediate 42 ("q").
  tail call void asm sideeffect "{ st.b128 [$0], $1; }", "l,q"(ptr nonnull addrspacecast (ptr addrspace(1) @value to ptr), i128 42)
  ret void
}

; Passes an i128 that originates from a memory load to inline asm through the
; "q" constraint. The value is read as <2 x i64> from %data (viewed in
; addrspace(1)) and bitcast to i128. The expected PTX below performs two
; 64-bit global loads (offsets 0 and 8) and packs them with mov.b128 before
; the st.b128 in the asm body.
define void @test_b128_input_from_load(ptr nocapture readonly %data) {
; CHECK-LABEL: test_b128_input_from_load(
; CHECK:       {
; CHECK-NEXT:    .reg .b64 %rd<7>;
; CHECK-NEXT:    .reg .b128 %rq<2>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.u64 %rd2, [test_b128_input_from_load_param_0];
; CHECK-NEXT:    cvta.to.global.u64 %rd3, %rd2;
; CHECK-NEXT:    ld.global.u64 %rd4, [%rd3+8];
; CHECK-NEXT:    ld.global.u64 %rd5, [%rd3];
; CHECK-NEXT:    mov.b128 %rq1, {%rd5, %rd4};
; CHECK-NEXT:    mov.u64 %rd6, value;
; CHECK-NEXT:    cvta.global.u64 %rd1, %rd6;
; CHECK-NEXT:    // begin inline asm
; CHECK-NEXT:    { st.b128 [%rd1], %rq1; }
; CHECK-NEXT:    // end inline asm
; CHECK-NEXT:    ret;
  ; Reinterpret the incoming pointer as addrspace(1) and read two i64 lanes.
  %src.global = addrspacecast ptr %data to ptr addrspace(1)
  %lanes = load <2 x i64>, ptr addrspace(1) %src.global, align 16
  ; Treat the two lanes as a single 128-bit integer for the "q" operand.
  %wide = bitcast <2 x i64> %lanes to i128
  ; $0 = address of @value ("l"), $1 = the loaded 128-bit value ("q").
  tail call void asm sideeffect "{ st.b128 [$0], $1; }", "l,q"(ptr nonnull addrspacecast (ptr addrspace(1) @value to ptr), i128 %wide)
  ret void
}

; Passes an i128 produced by a select to inline asm through the "q"
; constraint. A byte is loaded from %flag (viewed in addrspace(1)); the i128
; is 24 when that byte equals zero and 42 otherwise. The expected PTX below
; selects one 64-bit half with selp.b64, uses a constant 0 for the other half,
; and packs both with mov.b128 before the st.b128 in the asm body.
define void @test_b128_input_from_select(ptr nocapture readonly %flag) {
; CHECK-LABEL: test_b128_input_from_select(
; CHECK:       {
; CHECK-NEXT:    .reg .pred %p<2>;
; CHECK-NEXT:    .reg .b16 %rs<2>;
; CHECK-NEXT:    .reg .b64 %rd<7>;
; CHECK-NEXT:    .reg .b128 %rq<2>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.u64 %rd2, [test_b128_input_from_select_param_0];
; CHECK-NEXT:    cvta.to.global.u64 %rd3, %rd2;
; CHECK-NEXT:    ld.global.u8 %rs1, [%rd3];
; CHECK-NEXT:    setp.eq.s16 %p1, %rs1, 0;
; CHECK-NEXT:    selp.b64 %rd4, 24, 42, %p1;
; CHECK-NEXT:    mov.u64 %rd5, 0;
; CHECK-NEXT:    mov.b128 %rq1, {%rd4, %rd5};
; CHECK-NEXT:    mov.u64 %rd6, value;
; CHECK-NEXT:    cvta.global.u64 %rd1, %rd6;
; CHECK-NEXT:    // begin inline asm
; CHECK-NEXT:    { st.b128 [%rd1], %rq1; }
; CHECK-NEXT:    // end inline asm
; CHECK-NEXT:    ret;
  ; Read the one-byte flag through an addrspace(1) view of the pointer.
  %flag.global = addrspacecast ptr %flag to ptr addrspace(1)
  %flag.byte = load i8, ptr addrspace(1) %flag.global, align 1
  ; Choose between the two 128-bit constants based on flag == 0.
  %flag.is.zero = icmp eq i8 %flag.byte, 0
  %chosen = select i1 %flag.is.zero, i128 24, i128 42
  ; $0 = address of @value ("l"), $1 = the selected 128-bit value ("q").
  tail call void asm sideeffect "{ st.b128 [$0], $1; }", "l,q"(ptr nonnull addrspacecast (ptr addrspace(1) @value to ptr), i128 %chosen)
  ret void
}

; Consumes an i128 produced by inline asm through the "=q" output constraint.
; The asm result is incremented by one, bitcast to <2 x i64>, and stored to
; @value. The expected PTX below unpacks the .b128 register into two 64-bit
; registers with mov.b128, does the add as an add.cc / addc.cc pair, and
; writes the result with two 64-bit global stores.
define void @test_store_b128_output() {
; CHECK-LABEL: test_store_b128_output(
; CHECK:       {
; CHECK-NEXT:    .reg .b64 %rd<5>;
; CHECK-NEXT:    .reg .b128 %rq<2>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    // begin inline asm
; CHECK-NEXT:    { mov.b128 %rq1, 41; }
; CHECK-NEXT:    // end inline asm
; CHECK-NEXT:    mov.b128 {%rd1, %rd2}, %rq1;
; CHECK-NEXT:    add.cc.s64 %rd3, %rd1, 1;
; CHECK-NEXT:    addc.cc.s64 %rd4, %rd2, 0;
; CHECK-NEXT:    st.global.u64 [value+8], %rd4;
; CHECK-NEXT:    st.global.u64 [value], %rd3;
; CHECK-NEXT:    ret;
  ; $0 is the 128-bit output register written by the asm body.
  %asm.result = tail call i128 asm "{ mov.b128 $0, 41; }", "=q"()
  ; Use the asm result in ordinary 128-bit arithmetic.
  %incremented = add nsw i128 %asm.result, 1
  ; Store the sum to @value as two i64 lanes.
  %incremented.lanes = bitcast i128 %incremented to <2 x i64>
  store <2 x i64> %incremented.lanes, ptr addrspace(1) @value, align 16
  ret void
}

; Uses "q" for both an input and an output of the same inline asm. A
; <2 x i64> is loaded from %data (viewed in addrspace(1)), bitcast to i128 and
; passed in; the i128 the asm returns is incremented by one and stored to
; @value. The expected PTX below packs the loaded halves into one .b128
; register, unpacks the asm result from another, does the add as an
; add.cc / addc.cc pair, and writes two 64-bit global stores.
define void @test_use_of_b128_output(ptr nocapture readonly %data) {
; CHECK-LABEL: test_use_of_b128_output(
; CHECK:       {
; CHECK-NEXT:    .reg .b64 %rd<9>;
; CHECK-NEXT:    .reg .b128 %rq<3>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.u64 %rd1, [test_use_of_b128_output_param_0];
; CHECK-NEXT:    cvta.to.global.u64 %rd2, %rd1;
; CHECK-NEXT:    ld.global.u64 %rd3, [%rd2+8];
; CHECK-NEXT:    ld.global.u64 %rd4, [%rd2];
; CHECK-NEXT:    mov.b128 %rq2, {%rd4, %rd3};
; CHECK-NEXT:    // begin inline asm
; CHECK-NEXT:    { mov.b128 %rq1, %rq2; }
; CHECK-NEXT:    // end inline asm
; CHECK-NEXT:    mov.b128 {%rd5, %rd6}, %rq1;
; CHECK-NEXT:    add.cc.s64 %rd7, %rd5, 1;
; CHECK-NEXT:    addc.cc.s64 %rd8, %rd6, 0;
; CHECK-NEXT:    st.global.u64 [value], %rd7;
; CHECK-NEXT:    st.global.u64 [value+8], %rd8;
; CHECK-NEXT:    ret;
  ; Reinterpret the incoming pointer as addrspace(1) and read two i64 lanes.
  %src.global = addrspacecast ptr %data to ptr addrspace(1)
  %lanes.in = load <2 x i64>, ptr addrspace(1) %src.global, align 16
  %wide.in = bitcast <2 x i64> %lanes.in to i128
  ; $0 = 128-bit output ("=q"), $1 = 128-bit input ("q").
  %asm.result = tail call i128 asm "{ mov.b128 $0, $1; }", "=q,q"(i128 %wide.in)
  ; Use the asm result in ordinary 128-bit arithmetic.
  %incremented = add nsw i128 %asm.result, 1
  ; Store the sum to @value as two i64 lanes.
  %lanes.out = bitcast i128 %incremented to <2 x i64>
  store <2 x i64> %lanes.out, ptr addrspace(1) @value, align 16
  ret void
}