llvm/llvm/test/CodeGen/AMDGPU/preload-kernargs-IR-lowering.ll

; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature
; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -amdgpu-attributor -amdgpu-lower-kernel-arguments -S < %s | FileCheck -check-prefix=NO-PRELOAD %s
; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -amdgpu-attributor -amdgpu-lower-kernel-arguments -amdgpu-kernarg-preload-count=1 -S < %s | FileCheck -check-prefix=PRELOAD-1 %s
; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -amdgpu-attributor -amdgpu-lower-kernel-arguments -amdgpu-kernarg-preload-count=3 -S < %s | FileCheck -check-prefix=PRELOAD-3 %s
; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -amdgpu-attributor -amdgpu-lower-kernel-arguments -amdgpu-kernarg-preload-count=8 -S < %s | FileCheck -check-prefix=PRELOAD-8 %s

define amdgpu_kernel void @test_preload_IR_lowering_kernel_2(ptr addrspace(1) %in, ptr addrspace(1) %out) #0 {
; NO-PRELOAD-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_2
; NO-PRELOAD-SAME: (ptr addrspace(1) [[IN:%.*]], ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0:[0-9]+]] {
; NO-PRELOAD-NEXT:    [[TEST_PRELOAD_IR_LOWERING_KERNEL_2_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; NO-PRELOAD-NEXT:    [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_2_KERNARG_SEGMENT]], i64 0
; NO-PRELOAD-NEXT:    [[IN_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 16, !invariant.load [[META0:![0-9]+]]
; NO-PRELOAD-NEXT:    [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_2_KERNARG_SEGMENT]], i64 8
; NO-PRELOAD-NEXT:    [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 8, !invariant.load [[META0]]
; NO-PRELOAD-NEXT:    [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN_LOAD]], align 4
; NO-PRELOAD-NEXT:    store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4
; NO-PRELOAD-NEXT:    ret void
;
; PRELOAD-1-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_2
; PRELOAD-1-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0:[0-9]+]] {
; PRELOAD-1-NEXT:    [[TEST_PRELOAD_IR_LOWERING_KERNEL_2_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; PRELOAD-1-NEXT:    [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_2_KERNARG_SEGMENT]], i64 8
; PRELOAD-1-NEXT:    [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 8, !invariant.load [[META0:![0-9]+]]
; PRELOAD-1-NEXT:    [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4
; PRELOAD-1-NEXT:    store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4
; PRELOAD-1-NEXT:    ret void
;
; PRELOAD-3-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_2
; PRELOAD-3-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[OUT:%.*]]) #[[ATTR0:[0-9]+]] {
; PRELOAD-3-NEXT:    [[TEST_PRELOAD_IR_LOWERING_KERNEL_2_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; PRELOAD-3-NEXT:    [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4
; PRELOAD-3-NEXT:    store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4
; PRELOAD-3-NEXT:    ret void
;
; PRELOAD-8-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_2
; PRELOAD-8-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[OUT:%.*]]) #[[ATTR0:[0-9]+]] {
; PRELOAD-8-NEXT:    [[TEST_PRELOAD_IR_LOWERING_KERNEL_2_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; PRELOAD-8-NEXT:    [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4
; PRELOAD-8-NEXT:    store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4
; PRELOAD-8-NEXT:    ret void
;
  %load = load i32, ptr addrspace(1) %in
  store i32 %load, ptr addrspace(1) %out
  ret void
}

define amdgpu_kernel void @test_preload_IR_lowering_kernel_4(ptr addrspace(1) %in, ptr addrspace(1) %in1, ptr addrspace(1) %out, ptr addrspace(1) %out1) #0 {
; NO-PRELOAD-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_4
; NO-PRELOAD-SAME: (ptr addrspace(1) [[IN:%.*]], ptr addrspace(1) [[IN1:%.*]], ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[OUT1:%.*]]) #[[ATTR0]] {
; NO-PRELOAD-NEXT:    [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; NO-PRELOAD-NEXT:    [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_KERNARG_SEGMENT]], i64 0
; NO-PRELOAD-NEXT:    [[IN_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
; NO-PRELOAD-NEXT:    [[IN1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_KERNARG_SEGMENT]], i64 8
; NO-PRELOAD-NEXT:    [[IN1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN1_KERNARG_OFFSET]], align 8, !invariant.load [[META0]]
; NO-PRELOAD-NEXT:    [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_KERNARG_SEGMENT]], i64 16
; NO-PRELOAD-NEXT:    [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
; NO-PRELOAD-NEXT:    [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_KERNARG_SEGMENT]], i64 24
; NO-PRELOAD-NEXT:    [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 8, !invariant.load [[META0]]
; NO-PRELOAD-NEXT:    [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN_LOAD]], align 4
; NO-PRELOAD-NEXT:    [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1_LOAD]], align 4
; NO-PRELOAD-NEXT:    store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4
; NO-PRELOAD-NEXT:    store i32 [[LOAD1]], ptr addrspace(1) [[OUT1_LOAD]], align 4
; NO-PRELOAD-NEXT:    ret void
;
; PRELOAD-1-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_4
; PRELOAD-1-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) [[IN1:%.*]], ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[OUT1:%.*]]) #[[ATTR0]] {
; PRELOAD-1-NEXT:    [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; PRELOAD-1-NEXT:    [[IN1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_KERNARG_SEGMENT]], i64 8
; PRELOAD-1-NEXT:    [[IN1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN1_KERNARG_OFFSET]], align 8, !invariant.load [[META0]]
; PRELOAD-1-NEXT:    [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_KERNARG_SEGMENT]], i64 16
; PRELOAD-1-NEXT:    [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
; PRELOAD-1-NEXT:    [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_KERNARG_SEGMENT]], i64 24
; PRELOAD-1-NEXT:    [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 8, !invariant.load [[META0]]
; PRELOAD-1-NEXT:    [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4
; PRELOAD-1-NEXT:    [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1_LOAD]], align 4
; PRELOAD-1-NEXT:    store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4
; PRELOAD-1-NEXT:    store i32 [[LOAD1]], ptr addrspace(1) [[OUT1_LOAD]], align 4
; PRELOAD-1-NEXT:    ret void
;
; PRELOAD-3-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_4
; PRELOAD-3-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[IN1:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(1) [[OUT1:%.*]]) #[[ATTR0]] {
; PRELOAD-3-NEXT:    [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; PRELOAD-3-NEXT:    [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_KERNARG_SEGMENT]], i64 24
; PRELOAD-3-NEXT:    [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 8, !invariant.load [[META0:![0-9]+]]
; PRELOAD-3-NEXT:    [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4
; PRELOAD-3-NEXT:    [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1]], align 4
; PRELOAD-3-NEXT:    store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4
; PRELOAD-3-NEXT:    store i32 [[LOAD1]], ptr addrspace(1) [[OUT1_LOAD]], align 4
; PRELOAD-3-NEXT:    ret void
;
; PRELOAD-8-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_4
; PRELOAD-8-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[IN1:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(1) inreg [[OUT1:%.*]]) #[[ATTR0]] {
; PRELOAD-8-NEXT:    [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; PRELOAD-8-NEXT:    [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4
; PRELOAD-8-NEXT:    [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1]], align 4
; PRELOAD-8-NEXT:    store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4
; PRELOAD-8-NEXT:    store i32 [[LOAD1]], ptr addrspace(1) [[OUT1]], align 4
; PRELOAD-8-NEXT:    ret void
;
  %load = load i32, ptr addrspace(1) %in
  %load1 = load i32, ptr addrspace(1) %in1
  store i32 %load, ptr addrspace(1) %out
  store i32 %load1, ptr addrspace(1) %out1
  ret void
}

define amdgpu_kernel void @test_preload_IR_lowering_kernel_8(ptr addrspace(1) %in, ptr addrspace(1) %in1, ptr addrspace(1) %in2, ptr addrspace(1) %in3, ptr addrspace(1) %out, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) #0 {
; NO-PRELOAD-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_8
; NO-PRELOAD-SAME: (ptr addrspace(1) [[IN:%.*]], ptr addrspace(1) [[IN1:%.*]], ptr addrspace(1) [[IN2:%.*]], ptr addrspace(1) [[IN3:%.*]], ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[OUT1:%.*]], ptr addrspace(1) [[OUT2:%.*]], ptr addrspace(1) [[OUT3:%.*]]) #[[ATTR0]] {
; NO-PRELOAD-NEXT:    [[TEST_PRELOAD_IR_LOWERING_KERNEL_8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; NO-PRELOAD-NEXT:    [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_8_KERNARG_SEGMENT]], i64 0
; NO-PRELOAD-NEXT:    [[IN_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
; NO-PRELOAD-NEXT:    [[IN1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_8_KERNARG_SEGMENT]], i64 8
; NO-PRELOAD-NEXT:    [[IN1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN1_KERNARG_OFFSET]], align 8, !invariant.load [[META0]]
; NO-PRELOAD-NEXT:    [[IN2_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_8_KERNARG_SEGMENT]], i64 16
; NO-PRELOAD-NEXT:    [[IN2_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN2_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
; NO-PRELOAD-NEXT:    [[IN3_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_8_KERNARG_SEGMENT]], i64 24
; NO-PRELOAD-NEXT:    [[IN3_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN3_KERNARG_OFFSET]], align 8, !invariant.load [[META0]]
; NO-PRELOAD-NEXT:    [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_8_KERNARG_SEGMENT]], i64 32
; NO-PRELOAD-NEXT:    [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
; NO-PRELOAD-NEXT:    [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_8_KERNARG_SEGMENT]], i64 40
; NO-PRELOAD-NEXT:    [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 8, !invariant.load [[META0]]
; NO-PRELOAD-NEXT:    [[OUT2_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_8_KERNARG_SEGMENT]], i64 48
; NO-PRELOAD-NEXT:    [[OUT2_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT2_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
; NO-PRELOAD-NEXT:    [[OUT3_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_8_KERNARG_SEGMENT]], i64 56
; NO-PRELOAD-NEXT:    [[OUT3_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT3_KERNARG_OFFSET]], align 8, !invariant.load [[META0]]
; NO-PRELOAD-NEXT:    [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN_LOAD]], align 4
; NO-PRELOAD-NEXT:    [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1_LOAD]], align 4
; NO-PRELOAD-NEXT:    [[LOAD2:%.*]] = load i32, ptr addrspace(1) [[IN2_LOAD]], align 4
; NO-PRELOAD-NEXT:    [[LOAD3:%.*]] = load i32, ptr addrspace(1) [[IN3_LOAD]], align 4
; NO-PRELOAD-NEXT:    store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4
; NO-PRELOAD-NEXT:    store i32 [[LOAD1]], ptr addrspace(1) [[OUT1_LOAD]], align 4
; NO-PRELOAD-NEXT:    store i32 [[LOAD2]], ptr addrspace(1) [[OUT2_LOAD]], align 4
; NO-PRELOAD-NEXT:    store i32 [[LOAD3]], ptr addrspace(1) [[OUT3_LOAD]], align 4
; NO-PRELOAD-NEXT:    ret void
;
; PRELOAD-1-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_8
; PRELOAD-1-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) [[IN1:%.*]], ptr addrspace(1) [[IN2:%.*]], ptr addrspace(1) [[IN3:%.*]], ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[OUT1:%.*]], ptr addrspace(1) [[OUT2:%.*]], ptr addrspace(1) [[OUT3:%.*]]) #[[ATTR0]] {
; PRELOAD-1-NEXT:    [[TEST_PRELOAD_IR_LOWERING_KERNEL_8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; PRELOAD-1-NEXT:    [[IN1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_8_KERNARG_SEGMENT]], i64 8
; PRELOAD-1-NEXT:    [[IN1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN1_KERNARG_OFFSET]], align 8, !invariant.load [[META0]]
; PRELOAD-1-NEXT:    [[IN2_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_8_KERNARG_SEGMENT]], i64 16
; PRELOAD-1-NEXT:    [[IN2_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN2_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
; PRELOAD-1-NEXT:    [[IN3_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_8_KERNARG_SEGMENT]], i64 24
; PRELOAD-1-NEXT:    [[IN3_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN3_KERNARG_OFFSET]], align 8, !invariant.load [[META0]]
; PRELOAD-1-NEXT:    [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_8_KERNARG_SEGMENT]], i64 32
; PRELOAD-1-NEXT:    [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
; PRELOAD-1-NEXT:    [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_8_KERNARG_SEGMENT]], i64 40
; PRELOAD-1-NEXT:    [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 8, !invariant.load [[META0]]
; PRELOAD-1-NEXT:    [[OUT2_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_8_KERNARG_SEGMENT]], i64 48
; PRELOAD-1-NEXT:    [[OUT2_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT2_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
; PRELOAD-1-NEXT:    [[OUT3_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_8_KERNARG_SEGMENT]], i64 56
; PRELOAD-1-NEXT:    [[OUT3_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT3_KERNARG_OFFSET]], align 8, !invariant.load [[META0]]
; PRELOAD-1-NEXT:    [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4
; PRELOAD-1-NEXT:    [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1_LOAD]], align 4
; PRELOAD-1-NEXT:    [[LOAD2:%.*]] = load i32, ptr addrspace(1) [[IN2_LOAD]], align 4
; PRELOAD-1-NEXT:    [[LOAD3:%.*]] = load i32, ptr addrspace(1) [[IN3_LOAD]], align 4
; PRELOAD-1-NEXT:    store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4
; PRELOAD-1-NEXT:    store i32 [[LOAD1]], ptr addrspace(1) [[OUT1_LOAD]], align 4
; PRELOAD-1-NEXT:    store i32 [[LOAD2]], ptr addrspace(1) [[OUT2_LOAD]], align 4
; PRELOAD-1-NEXT:    store i32 [[LOAD3]], ptr addrspace(1) [[OUT3_LOAD]], align 4
; PRELOAD-1-NEXT:    ret void
;
; PRELOAD-3-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_8
; PRELOAD-3-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[IN1:%.*]], ptr addrspace(1) inreg [[IN2:%.*]], ptr addrspace(1) [[IN3:%.*]], ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[OUT1:%.*]], ptr addrspace(1) [[OUT2:%.*]], ptr addrspace(1) [[OUT3:%.*]]) #[[ATTR0]] {
; PRELOAD-3-NEXT:    [[TEST_PRELOAD_IR_LOWERING_KERNEL_8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; PRELOAD-3-NEXT:    [[IN3_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_8_KERNARG_SEGMENT]], i64 24
; PRELOAD-3-NEXT:    [[IN3_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN3_KERNARG_OFFSET]], align 8, !invariant.load [[META0]]
; PRELOAD-3-NEXT:    [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_8_KERNARG_SEGMENT]], i64 32
; PRELOAD-3-NEXT:    [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
; PRELOAD-3-NEXT:    [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_8_KERNARG_SEGMENT]], i64 40
; PRELOAD-3-NEXT:    [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 8, !invariant.load [[META0]]
; PRELOAD-3-NEXT:    [[OUT2_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_8_KERNARG_SEGMENT]], i64 48
; PRELOAD-3-NEXT:    [[OUT2_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT2_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
; PRELOAD-3-NEXT:    [[OUT3_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_8_KERNARG_SEGMENT]], i64 56
; PRELOAD-3-NEXT:    [[OUT3_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT3_KERNARG_OFFSET]], align 8, !invariant.load [[META0]]
; PRELOAD-3-NEXT:    [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4
; PRELOAD-3-NEXT:    [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1]], align 4
; PRELOAD-3-NEXT:    [[LOAD2:%.*]] = load i32, ptr addrspace(1) [[IN2]], align 4
; PRELOAD-3-NEXT:    [[LOAD3:%.*]] = load i32, ptr addrspace(1) [[IN3_LOAD]], align 4
; PRELOAD-3-NEXT:    store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4
; PRELOAD-3-NEXT:    store i32 [[LOAD1]], ptr addrspace(1) [[OUT1_LOAD]], align 4
; PRELOAD-3-NEXT:    store i32 [[LOAD2]], ptr addrspace(1) [[OUT2_LOAD]], align 4
; PRELOAD-3-NEXT:    store i32 [[LOAD3]], ptr addrspace(1) [[OUT3_LOAD]], align 4
; PRELOAD-3-NEXT:    ret void
;
; PRELOAD-8-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_8
; PRELOAD-8-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[IN1:%.*]], ptr addrspace(1) inreg [[IN2:%.*]], ptr addrspace(1) inreg [[IN3:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(1) inreg [[OUT1:%.*]], ptr addrspace(1) inreg [[OUT2:%.*]], ptr addrspace(1) inreg [[OUT3:%.*]]) #[[ATTR0]] {
; PRELOAD-8-NEXT:    [[TEST_PRELOAD_IR_LOWERING_KERNEL_8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; PRELOAD-8-NEXT:    [[OUT3_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_8_KERNARG_SEGMENT]], i64 56
; PRELOAD-8-NEXT:    [[OUT3_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT3_KERNARG_OFFSET]], align 8, !invariant.load [[META0:![0-9]+]]
; PRELOAD-8-NEXT:    [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4
; PRELOAD-8-NEXT:    [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1]], align 4
; PRELOAD-8-NEXT:    [[LOAD2:%.*]] = load i32, ptr addrspace(1) [[IN2]], align 4
; PRELOAD-8-NEXT:    [[LOAD3:%.*]] = load i32, ptr addrspace(1) [[IN3]], align 4
; PRELOAD-8-NEXT:    store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4
; PRELOAD-8-NEXT:    store i32 [[LOAD1]], ptr addrspace(1) [[OUT1]], align 4
; PRELOAD-8-NEXT:    store i32 [[LOAD2]], ptr addrspace(1) [[OUT2]], align 4
; PRELOAD-8-NEXT:    store i32 [[LOAD3]], ptr addrspace(1) [[OUT3_LOAD]], align 4
; PRELOAD-8-NEXT:    ret void
;
  %load = load i32, ptr addrspace(1) %in
  %load1 = load i32, ptr addrspace(1) %in1
  %load2 = load i32, ptr addrspace(1) %in2
  %load3 = load i32, ptr addrspace(1) %in3
  store i32 %load, ptr addrspace(1) %out
  store i32 %load1, ptr addrspace(1) %out1
  store i32 %load2, ptr addrspace(1) %out2
  store i32 %load3, ptr addrspace(1) %out3
  ret void
}

; Preload args with inreg in the NO-PRELOAD case.

define amdgpu_kernel void @test_preload_IR_lowering_kernel_4_inreg_offset(ptr addrspace(1) %in, ptr addrspace(1) %in1, ptr addrspace(1) inreg %out, ptr addrspace(1) inreg %out1) #0 {
; NO-PRELOAD-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_4_inreg_offset
; NO-PRELOAD-SAME: (ptr addrspace(1) [[IN:%.*]], ptr addrspace(1) [[IN1:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(1) inreg [[OUT1:%.*]]) #[[ATTR0]] {
; NO-PRELOAD-NEXT:    [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_INREG_OFFSET_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; NO-PRELOAD-NEXT:    [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_INREG_OFFSET_KERNARG_SEGMENT]], i64 0
; NO-PRELOAD-NEXT:    [[IN_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
; NO-PRELOAD-NEXT:    [[IN1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_INREG_OFFSET_KERNARG_SEGMENT]], i64 8
; NO-PRELOAD-NEXT:    [[IN1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN1_KERNARG_OFFSET]], align 8, !invariant.load [[META0]]
; NO-PRELOAD-NEXT:    [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_INREG_OFFSET_KERNARG_SEGMENT]], i64 16
; NO-PRELOAD-NEXT:    [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
; NO-PRELOAD-NEXT:    [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_INREG_OFFSET_KERNARG_SEGMENT]], i64 24
; NO-PRELOAD-NEXT:    [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 8, !invariant.load [[META0]]
; NO-PRELOAD-NEXT:    [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN_LOAD]], align 4
; NO-PRELOAD-NEXT:    [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1_LOAD]], align 4
; NO-PRELOAD-NEXT:    store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4
; NO-PRELOAD-NEXT:    store i32 [[LOAD1]], ptr addrspace(1) [[OUT1_LOAD]], align 4
; NO-PRELOAD-NEXT:    ret void
;
; PRELOAD-1-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_4_inreg_offset
; PRELOAD-1-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) [[IN1:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(1) inreg [[OUT1:%.*]]) #[[ATTR0]] {
; PRELOAD-1-NEXT:    [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_INREG_OFFSET_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; PRELOAD-1-NEXT:    [[IN1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_INREG_OFFSET_KERNARG_SEGMENT]], i64 8
; PRELOAD-1-NEXT:    [[IN1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN1_KERNARG_OFFSET]], align 8, !invariant.load [[META0]]
; PRELOAD-1-NEXT:    [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_INREG_OFFSET_KERNARG_SEGMENT]], i64 16
; PRELOAD-1-NEXT:    [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
; PRELOAD-1-NEXT:    [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_INREG_OFFSET_KERNARG_SEGMENT]], i64 24
; PRELOAD-1-NEXT:    [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 8, !invariant.load [[META0]]
; PRELOAD-1-NEXT:    [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4
; PRELOAD-1-NEXT:    [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1_LOAD]], align 4
; PRELOAD-1-NEXT:    store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4
; PRELOAD-1-NEXT:    store i32 [[LOAD1]], ptr addrspace(1) [[OUT1_LOAD]], align 4
; PRELOAD-1-NEXT:    ret void
;
; PRELOAD-3-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_4_inreg_offset
; PRELOAD-3-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[IN1:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(1) inreg [[OUT1:%.*]]) #[[ATTR0]] {
; PRELOAD-3-NEXT:    [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_INREG_OFFSET_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; PRELOAD-3-NEXT:    [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4
; PRELOAD-3-NEXT:    [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1]], align 4
; PRELOAD-3-NEXT:    store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4
; PRELOAD-3-NEXT:    store i32 [[LOAD1]], ptr addrspace(1) [[OUT1]], align 4
; PRELOAD-3-NEXT:    ret void
;
; PRELOAD-8-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_4_inreg_offset
; PRELOAD-8-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[IN1:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(1) inreg [[OUT1:%.*]]) #[[ATTR0]] {
; PRELOAD-8-NEXT:    [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_INREG_OFFSET_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; PRELOAD-8-NEXT:    [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4
; PRELOAD-8-NEXT:    [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1]], align 4
; PRELOAD-8-NEXT:    store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4
; PRELOAD-8-NEXT:    store i32 [[LOAD1]], ptr addrspace(1) [[OUT1]], align 4
; PRELOAD-8-NEXT:    ret void
;
  %load = load i32, ptr addrspace(1) %in
  %load1 = load i32, ptr addrspace(1) %in1
  store i32 %load, ptr addrspace(1) %out
  store i32 %load1, ptr addrspace(1) %out1
  ret void
}

; Only preload the first sequence of arguments with the inreg attribute. In the NO-PRELOAD case this is just the first argument.

define amdgpu_kernel void @test_preload_IR_lowering_kernel_4_inreg_offset_two_sequence(ptr addrspace(1) inreg %in, ptr addrspace(1) %in1, ptr addrspace(1) inreg %out, ptr addrspace(1) inreg %out1) #0 {
; NO-PRELOAD-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_4_inreg_offset_two_sequence
; NO-PRELOAD-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) [[IN1:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(1) inreg [[OUT1:%.*]]) #[[ATTR0]] {
; NO-PRELOAD-NEXT:    [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_INREG_OFFSET_TWO_SEQUENCE_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; NO-PRELOAD-NEXT:    [[IN1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_INREG_OFFSET_TWO_SEQUENCE_KERNARG_SEGMENT]], i64 8
; NO-PRELOAD-NEXT:    [[IN1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN1_KERNARG_OFFSET]], align 8, !invariant.load [[META0]]
; NO-PRELOAD-NEXT:    [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_INREG_OFFSET_TWO_SEQUENCE_KERNARG_SEGMENT]], i64 16
; NO-PRELOAD-NEXT:    [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
; NO-PRELOAD-NEXT:    [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_INREG_OFFSET_TWO_SEQUENCE_KERNARG_SEGMENT]], i64 24
; NO-PRELOAD-NEXT:    [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 8, !invariant.load [[META0]]
; NO-PRELOAD-NEXT:    [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4
; NO-PRELOAD-NEXT:    [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1_LOAD]], align 4
; NO-PRELOAD-NEXT:    store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4
; NO-PRELOAD-NEXT:    store i32 [[LOAD1]], ptr addrspace(1) [[OUT1_LOAD]], align 4
; NO-PRELOAD-NEXT:    ret void
;
; PRELOAD-1-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_4_inreg_offset_two_sequence
; PRELOAD-1-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) [[IN1:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(1) inreg [[OUT1:%.*]]) #[[ATTR0]] {
; PRELOAD-1-NEXT:    [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_INREG_OFFSET_TWO_SEQUENCE_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; PRELOAD-1-NEXT:    [[IN1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_INREG_OFFSET_TWO_SEQUENCE_KERNARG_SEGMENT]], i64 8
; PRELOAD-1-NEXT:    [[IN1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN1_KERNARG_OFFSET]], align 8, !invariant.load [[META0]]
; PRELOAD-1-NEXT:    [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_INREG_OFFSET_TWO_SEQUENCE_KERNARG_SEGMENT]], i64 16
; PRELOAD-1-NEXT:    [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
; PRELOAD-1-NEXT:    [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_INREG_OFFSET_TWO_SEQUENCE_KERNARG_SEGMENT]], i64 24
; PRELOAD-1-NEXT:    [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 8, !invariant.load [[META0]]
; PRELOAD-1-NEXT:    [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4
; PRELOAD-1-NEXT:    [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1_LOAD]], align 4
; PRELOAD-1-NEXT:    store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4
; PRELOAD-1-NEXT:    store i32 [[LOAD1]], ptr addrspace(1) [[OUT1_LOAD]], align 4
; PRELOAD-1-NEXT:    ret void
;
; PRELOAD-3-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_4_inreg_offset_two_sequence
; PRELOAD-3-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[IN1:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(1) inreg [[OUT1:%.*]]) #[[ATTR0]] {
; PRELOAD-3-NEXT:    [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_INREG_OFFSET_TWO_SEQUENCE_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; PRELOAD-3-NEXT:    [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4
; PRELOAD-3-NEXT:    [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1]], align 4
; PRELOAD-3-NEXT:    store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4
; PRELOAD-3-NEXT:    store i32 [[LOAD1]], ptr addrspace(1) [[OUT1]], align 4
; PRELOAD-3-NEXT:    ret void
;
; PRELOAD-8-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_4_inreg_offset_two_sequence
; PRELOAD-8-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[IN1:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(1) inreg [[OUT1:%.*]]) #[[ATTR0]] {
; PRELOAD-8-NEXT:    [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_INREG_OFFSET_TWO_SEQUENCE_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; PRELOAD-8-NEXT:    [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4
; PRELOAD-8-NEXT:    [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1]], align 4
; PRELOAD-8-NEXT:    store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4
; PRELOAD-8-NEXT:    store i32 [[LOAD1]], ptr addrspace(1) [[OUT1]], align 4
; PRELOAD-8-NEXT:    ret void
;
  %load = load i32, ptr addrspace(1) %in
  %load1 = load i32, ptr addrspace(1) %in1
  store i32 %load, ptr addrspace(1) %out
  store i32 %load1, ptr addrspace(1) %out1
  ret void
}

define amdgpu_kernel void @test_preload_IR_lowering_kernel_4_misaligned(i16 %arg0, ptr addrspace(1) %in, ptr addrspace(1) %in1, ptr addrspace(1) %out, ptr addrspace(1) %out1) #0 {
; NO-PRELOAD-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_4_misaligned
; NO-PRELOAD-SAME: (i16 [[ARG0:%.*]], ptr addrspace(1) [[IN:%.*]], ptr addrspace(1) [[IN1:%.*]], ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[OUT1:%.*]]) #[[ATTR0]] {
; NO-PRELOAD-NEXT:    [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_MISALIGNED_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; NO-PRELOAD-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_MISALIGNED_KERNARG_SEGMENT]], i64 0
; NO-PRELOAD-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(4) [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load [[META0]]
; NO-PRELOAD-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
; NO-PRELOAD-NEXT:    [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_MISALIGNED_KERNARG_SEGMENT]], i64 8
; NO-PRELOAD-NEXT:    [[IN_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 8, !invariant.load [[META0]]
; NO-PRELOAD-NEXT:    [[IN1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_MISALIGNED_KERNARG_SEGMENT]], i64 16
; NO-PRELOAD-NEXT:    [[IN1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN1_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
; NO-PRELOAD-NEXT:    [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_MISALIGNED_KERNARG_SEGMENT]], i64 24
; NO-PRELOAD-NEXT:    [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 8, !invariant.load [[META0]]
; NO-PRELOAD-NEXT:    [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_MISALIGNED_KERNARG_SEGMENT]], i64 32
; NO-PRELOAD-NEXT:    [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
; NO-PRELOAD-NEXT:    [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN_LOAD]], align 4
; NO-PRELOAD-NEXT:    [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1_LOAD]], align 4
; NO-PRELOAD-NEXT:    [[EXT:%.*]] = zext i16 [[TMP2]] to i32
; NO-PRELOAD-NEXT:    [[ADD:%.*]] = add i32 [[LOAD]], [[EXT]]
; NO-PRELOAD-NEXT:    store i32 [[ADD]], ptr addrspace(1) [[OUT_LOAD]], align 4
; NO-PRELOAD-NEXT:    store i32 [[LOAD1]], ptr addrspace(1) [[OUT1_LOAD]], align 4
; NO-PRELOAD-NEXT:    ret void
;
; PRELOAD-1-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_4_misaligned
; PRELOAD-1-SAME: (i16 inreg [[ARG0:%.*]], ptr addrspace(1) [[IN:%.*]], ptr addrspace(1) [[IN1:%.*]], ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[OUT1:%.*]]) #[[ATTR0]] {
; PRELOAD-1-NEXT:    [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_MISALIGNED_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; PRELOAD-1-NEXT:    [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_MISALIGNED_KERNARG_SEGMENT]], i64 8
; PRELOAD-1-NEXT:    [[IN_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 8, !invariant.load [[META0]]
; PRELOAD-1-NEXT:    [[IN1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_MISALIGNED_KERNARG_SEGMENT]], i64 16
; PRELOAD-1-NEXT:    [[IN1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN1_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
; PRELOAD-1-NEXT:    [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_MISALIGNED_KERNARG_SEGMENT]], i64 24
; PRELOAD-1-NEXT:    [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 8, !invariant.load [[META0]]
; PRELOAD-1-NEXT:    [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_MISALIGNED_KERNARG_SEGMENT]], i64 32
; PRELOAD-1-NEXT:    [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
; PRELOAD-1-NEXT:    [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN_LOAD]], align 4
; PRELOAD-1-NEXT:    [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1_LOAD]], align 4
; PRELOAD-1-NEXT:    [[EXT:%.*]] = zext i16 [[ARG0]] to i32
; PRELOAD-1-NEXT:    [[ADD:%.*]] = add i32 [[LOAD]], [[EXT]]
; PRELOAD-1-NEXT:    store i32 [[ADD]], ptr addrspace(1) [[OUT_LOAD]], align 4
; PRELOAD-1-NEXT:    store i32 [[LOAD1]], ptr addrspace(1) [[OUT1_LOAD]], align 4
; PRELOAD-1-NEXT:    ret void
;
; PRELOAD-3-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_4_misaligned
; PRELOAD-3-SAME: (i16 inreg [[ARG0:%.*]], ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[IN1:%.*]], ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[OUT1:%.*]]) #[[ATTR0]] {
; PRELOAD-3-NEXT:    [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_MISALIGNED_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; PRELOAD-3-NEXT:    [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_MISALIGNED_KERNARG_SEGMENT]], i64 24
; PRELOAD-3-NEXT:    [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 8, !invariant.load [[META0]]
; PRELOAD-3-NEXT:    [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_MISALIGNED_KERNARG_SEGMENT]], i64 32
; PRELOAD-3-NEXT:    [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
; PRELOAD-3-NEXT:    [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4
; PRELOAD-3-NEXT:    [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1]], align 4
; PRELOAD-3-NEXT:    [[EXT:%.*]] = zext i16 [[ARG0]] to i32
; PRELOAD-3-NEXT:    [[ADD:%.*]] = add i32 [[LOAD]], [[EXT]]
; PRELOAD-3-NEXT:    store i32 [[ADD]], ptr addrspace(1) [[OUT_LOAD]], align 4
; PRELOAD-3-NEXT:    store i32 [[LOAD1]], ptr addrspace(1) [[OUT1_LOAD]], align 4
; PRELOAD-3-NEXT:    ret void
;
; PRELOAD-8-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_4_misaligned
; PRELOAD-8-SAME: (i16 inreg [[ARG0:%.*]], ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[IN1:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(1) inreg [[OUT1:%.*]]) #[[ATTR0]] {
; PRELOAD-8-NEXT:    [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_MISALIGNED_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; PRELOAD-8-NEXT:    [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4
; PRELOAD-8-NEXT:    [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1]], align 4
; PRELOAD-8-NEXT:    [[EXT:%.*]] = zext i16 [[ARG0]] to i32
; PRELOAD-8-NEXT:    [[ADD:%.*]] = add i32 [[LOAD]], [[EXT]]
; PRELOAD-8-NEXT:    store i32 [[ADD]], ptr addrspace(1) [[OUT]], align 4
; PRELOAD-8-NEXT:    store i32 [[LOAD1]], ptr addrspace(1) [[OUT1]], align 4
; PRELOAD-8-NEXT:    ret void
;
  %load = load i32, ptr addrspace(1) %in
  %load1 = load i32, ptr addrspace(1) %in1
  %ext = zext i16 %arg0 to i32
  %add = add i32 %load, %ext
  store i32 %add, ptr addrspace(1) %out
  store i32 %load1, ptr addrspace(1) %out1
  ret void
}

; In this case both i16 args with be preloaded into the first SGPR.

define amdgpu_kernel void @test_preload_IR_lowering_kernel_4_i16_i16(i16 %arg0, i16 %arg1, ptr addrspace(1) %out) #0 {
; NO-PRELOAD-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_4_i16_i16
; NO-PRELOAD-SAME: (i16 [[ARG0:%.*]], i16 [[ARG1:%.*]], ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
; NO-PRELOAD-NEXT:    [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_I16_I16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; NO-PRELOAD-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_I16_I16_KERNARG_SEGMENT]], i64 0
; NO-PRELOAD-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(4) [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load [[META0]]
; NO-PRELOAD-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
; NO-PRELOAD-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_I16_I16_KERNARG_SEGMENT]], i64 0
; NO-PRELOAD-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(4) [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load [[META0]]
; NO-PRELOAD-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP3]], 16
; NO-PRELOAD-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16
; NO-PRELOAD-NEXT:    [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_I16_I16_KERNARG_SEGMENT]], i64 8
; NO-PRELOAD-NEXT:    [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 8, !invariant.load [[META0]]
; NO-PRELOAD-NEXT:    [[EXT:%.*]] = zext i16 [[TMP2]] to i32
; NO-PRELOAD-NEXT:    [[EXT1:%.*]] = zext i16 [[TMP5]] to i32
; NO-PRELOAD-NEXT:    [[ADD:%.*]] = add i32 [[EXT]], [[EXT1]]
; NO-PRELOAD-NEXT:    store i32 [[ADD]], ptr addrspace(1) [[OUT_LOAD]], align 4
; NO-PRELOAD-NEXT:    ret void
;
; PRELOAD-1-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_4_i16_i16
; PRELOAD-1-SAME: (i16 inreg [[ARG0:%.*]], i16 [[ARG1:%.*]], ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
; PRELOAD-1-NEXT:    [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_I16_I16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; PRELOAD-1-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_I16_I16_KERNARG_SEGMENT]], i64 0
; PRELOAD-1-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(4) [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load [[META0]]
; PRELOAD-1-NEXT:    [[TMP2:%.*]] = lshr i32 [[TMP1]], 16
; PRELOAD-1-NEXT:    [[TMP3:%.*]] = trunc i32 [[TMP2]] to i16
; PRELOAD-1-NEXT:    [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_I16_I16_KERNARG_SEGMENT]], i64 8
; PRELOAD-1-NEXT:    [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 8, !invariant.load [[META0]]
; PRELOAD-1-NEXT:    [[EXT:%.*]] = zext i16 [[ARG0]] to i32
; PRELOAD-1-NEXT:    [[EXT1:%.*]] = zext i16 [[TMP3]] to i32
; PRELOAD-1-NEXT:    [[ADD:%.*]] = add i32 [[EXT]], [[EXT1]]
; PRELOAD-1-NEXT:    store i32 [[ADD]], ptr addrspace(1) [[OUT_LOAD]], align 4
; PRELOAD-1-NEXT:    ret void
;
; PRELOAD-3-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_4_i16_i16
; PRELOAD-3-SAME: (i16 inreg [[ARG0:%.*]], i16 inreg [[ARG1:%.*]], ptr addrspace(1) inreg [[OUT:%.*]]) #[[ATTR0]] {
; PRELOAD-3-NEXT:    [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_I16_I16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; PRELOAD-3-NEXT:    [[EXT:%.*]] = zext i16 [[ARG0]] to i32
; PRELOAD-3-NEXT:    [[EXT1:%.*]] = zext i16 [[ARG1]] to i32
; PRELOAD-3-NEXT:    [[ADD:%.*]] = add i32 [[EXT]], [[EXT1]]
; PRELOAD-3-NEXT:    store i32 [[ADD]], ptr addrspace(1) [[OUT]], align 4
; PRELOAD-3-NEXT:    ret void
;
; PRELOAD-8-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_4_i16_i16
; PRELOAD-8-SAME: (i16 inreg [[ARG0:%.*]], i16 inreg [[ARG1:%.*]], ptr addrspace(1) inreg [[OUT:%.*]]) #[[ATTR0]] {
; PRELOAD-8-NEXT:    [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_I16_I16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; PRELOAD-8-NEXT:    [[EXT:%.*]] = zext i16 [[ARG0]] to i32
; PRELOAD-8-NEXT:    [[EXT1:%.*]] = zext i16 [[ARG1]] to i32
; PRELOAD-8-NEXT:    [[ADD:%.*]] = add i32 [[EXT]], [[EXT1]]
; PRELOAD-8-NEXT:    store i32 [[ADD]], ptr addrspace(1) [[OUT]], align 4
; PRELOAD-8-NEXT:    ret void
;
  %ext = zext i16 %arg0 to i32
  %ext1 = zext i16 %arg1 to i32
  %add = add i32 %ext, %ext1
  store i32 %add, ptr addrspace(1) %out
  ret void
}

attributes #0 = { nounwind }