; llvm/llvm/test/CodeGen/NVPTX/param-vectorize-kernel.ll

; RUN: llc < %s -mtriple=nvptx64-unknown-unknown | FileCheck %s
; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64-unknown-unknown | %ptxas-verify %}
;
; Check that parameters of a __global__ (kernel) function do not get increased
; alignment, and no additional vectorization is performed on loads/stores with
; those parameters.
;
; Test IR is a minimized version of IR generated with the following command
; from the source code below:
; $ clang++ -O3 --cuda-gpu-arch=sm_35 -S -emit-llvm src.cu
;
; ----------------------------------------------------------------------------
; #include <stdint.h>
;
; struct St4x1 { uint32_t field[1]; };
; struct St4x2 { uint32_t field[2]; };
; struct St4x3 { uint32_t field[3]; };
; struct St4x4 { uint32_t field[4]; };
; struct St4x5 { uint32_t field[5]; };
; struct St4x6 { uint32_t field[6]; };
; struct St4x7 { uint32_t field[7]; };
; struct St4x8 { uint32_t field[8]; };
; struct St8x1 { uint64_t field[1]; };
; struct St8x2 { uint64_t field[2]; };
; struct St8x3 { uint64_t field[3]; };
; struct St8x4 { uint64_t field[4]; };
;
; #define DECLARE_FUNCTION(StName)                                    \
; static __global__  __attribute__((noinline))                        \
; void foo_##StName(struct StName in, struct StName* ret) {           \
;   const unsigned size = sizeof(ret->field) / sizeof(*ret->field);   \
;   for (unsigned i = 0; i != size; ++i)                              \
;     ret->field[i] = in.field[i];                                    \
; }                                                                   \
;
; DECLARE_FUNCTION(St4x1)
; DECLARE_FUNCTION(St4x2)
; DECLARE_FUNCTION(St4x3)
; DECLARE_FUNCTION(St4x4)
; DECLARE_FUNCTION(St4x5)
; DECLARE_FUNCTION(St4x6)
; DECLARE_FUNCTION(St4x7)
; DECLARE_FUNCTION(St4x8)
; DECLARE_FUNCTION(St8x1)
; DECLARE_FUNCTION(St8x2)
; DECLARE_FUNCTION(St8x3)
; DECLARE_FUNCTION(St8x4)
; ----------------------------------------------------------------------------

; IR counterparts of the St4xN / St8xN structs from the CUDA source quoted
; above: each wraps a single array of N i32 (St4xN) or N i64 (St8xN) elements.
%struct.St4x1 = type { [1 x i32] }
%struct.St4x2 = type { [2 x i32] }
%struct.St4x3 = type { [3 x i32] }
%struct.St4x4 = type { [4 x i32] }
%struct.St4x5 = type { [5 x i32] }
%struct.St4x6 = type { [6 x i32] }
%struct.St4x7 = type { [7 x i32] }
%struct.St4x8 = type { [8 x i32] }
%struct.St8x1 = type { [1 x i64] }
%struct.St8x2 = type { [2 x i64] }
%struct.St8x3 = type { [3 x i64] }
%struct.St8x4 = type { [4 x i64] }

; foo_St4x1: copy the single i32 element of the byval struct %in to %ret.
; Expected PTX: param 0 is declared as a 4-byte blob with .align 4 and is read
; with one scalar 32-bit ld.param.
; NOTE(review): the file header speaks of __global__ kernels, yet the label
; pattern below matches a `.visible .func` -- confirm this is intended.
define dso_local void @foo_St4x1(ptr nocapture noundef readonly byval(%struct.St4x1) align 4 %in, ptr nocapture noundef writeonly %ret) {
  ; CHECK-LABEL: .visible .func foo_St4x1(
  ; CHECK:               .param .align 4 .b8 foo_St4x1_param_0[4],
  ; CHECK:               .param .b64 foo_St4x1_param_1
  ; CHECK:       )
  ; CHECK:       ld.param.u64 [[R1:%rd[0-9]+]], [foo_St4x1_param_1];
  ; CHECK:       ld.param.u32 [[R2:%r[0-9]+]], [foo_St4x1_param_0];
  ; CHECK:       st.u32  [[[R1]]], [[R2]];
  ; CHECK:       ret;
  ; Element 0: the only field.
  %elt0 = load i32, ptr %in, align 4
  store i32 %elt0, ptr %ret, align 4
  ret void
}

; foo_St4x2: copy both i32 elements of the byval struct %in to %ret, in
; ascending index order.
; Expected PTX: param 0 is an 8-byte blob with .align 4; each element is read
; with its own scalar 32-bit ld.param.
define dso_local void @foo_St4x2(ptr nocapture noundef readonly byval(%struct.St4x2) align 4 %in, ptr nocapture noundef writeonly %ret) {
  ; CHECK-LABEL: .visible .func foo_St4x2(
  ; CHECK:               .param .align 4 .b8 foo_St4x2_param_0[8],
  ; CHECK:               .param .b64 foo_St4x2_param_1
  ; CHECK:       )
  ; CHECK:       ld.param.u64 [[R1:%rd[0-9]+]], [foo_St4x2_param_1];
  ; CHECK:       ld.param.u32 [[R2:%r[0-9]+]], [foo_St4x2_param_0];
  ; CHECK:       st.u32  [[[R1]]], [[R2]];
  ; CHECK:       ld.param.u32 [[R3:%r[0-9]+]], [foo_St4x2_param_0+4];
  ; CHECK:       st.u32  [[[R1]]+4], [[R3]];
  ; CHECK:       ret;
  ; Element 0 lives at the base address of both %in and %ret.
  %elt0 = load i32, ptr %in, align 4
  store i32 %elt0, ptr %ret, align 4
  ; Element 1.
  %src1 = getelementptr inbounds [2 x i32], ptr %in, i64 0, i64 1
  %elt1 = load i32, ptr %src1, align 4
  %dst1 = getelementptr inbounds [2 x i32], ptr %ret, i64 0, i64 1
  store i32 %elt1, ptr %dst1, align 4
  ret void
}

; foo_St4x3: copy the three i32 elements of the byval struct %in to %ret, in
; ascending index order.
; Expected PTX: param 0 is a 12-byte blob with .align 4; each element is read
; with its own scalar 32-bit ld.param.
define dso_local void @foo_St4x3(ptr nocapture noundef readonly byval(%struct.St4x3) align 4 %in, ptr nocapture noundef writeonly %ret) {
  ; CHECK-LABEL: .visible .func foo_St4x3(
  ; CHECK:               .param .align 4 .b8 foo_St4x3_param_0[12],
  ; CHECK:               .param .b64 foo_St4x3_param_1
  ; CHECK:       )
  ; CHECK:       ld.param.u64 [[R1:%rd[0-9]+]], [foo_St4x3_param_1];
  ; CHECK:       ld.param.u32 [[R2:%r[0-9]+]], [foo_St4x3_param_0];
  ; CHECK:       st.u32  [[[R1]]], [[R2]];
  ; CHECK:       ld.param.u32 [[R3:%r[0-9]+]], [foo_St4x3_param_0+4];
  ; CHECK:       st.u32  [[[R1]]+4], [[R3]];
  ; CHECK:       ld.param.u32 [[R4:%r[0-9]+]], [foo_St4x3_param_0+8];
  ; CHECK:       st.u32  [[[R1]]+8], [[R4]];
  ; CHECK:       ret;
  ; Element 0 lives at the base address of both %in and %ret.
  %elt0 = load i32, ptr %in, align 4
  store i32 %elt0, ptr %ret, align 4
  ; Element 1.
  %src1 = getelementptr inbounds [3 x i32], ptr %in, i64 0, i64 1
  %elt1 = load i32, ptr %src1, align 4
  %dst1 = getelementptr inbounds [3 x i32], ptr %ret, i64 0, i64 1
  store i32 %elt1, ptr %dst1, align 4
  ; Element 2.
  %src2 = getelementptr inbounds [3 x i32], ptr %in, i64 0, i64 2
  %elt2 = load i32, ptr %src2, align 4
  %dst2 = getelementptr inbounds [3 x i32], ptr %ret, i64 0, i64 2
  store i32 %elt2, ptr %dst2, align 4
  ret void
}

; foo_St4x4: copy the four i32 elements of the byval struct %in to %ret, in
; ascending index order.
; Expected PTX: param 0 is a 16-byte blob with .align 4; each element is read
; with its own scalar 32-bit ld.param.
define dso_local void @foo_St4x4(ptr nocapture noundef readonly byval(%struct.St4x4) align 4 %in, ptr nocapture noundef writeonly %ret) {
  ; CHECK-LABEL: .visible .func foo_St4x4(
  ; CHECK:               .param .align 4 .b8 foo_St4x4_param_0[16],
  ; CHECK:               .param .b64 foo_St4x4_param_1
  ; CHECK:       )
  ; CHECK:       ld.param.u64 [[R1:%rd[0-9]+]], [foo_St4x4_param_1];
  ; CHECK:       ld.param.u32 [[R2:%r[0-9]+]], [foo_St4x4_param_0];
  ; CHECK:       st.u32  [[[R1]]], [[R2]];
  ; CHECK:       ld.param.u32 [[R3:%r[0-9]+]], [foo_St4x4_param_0+4];
  ; CHECK:       st.u32  [[[R1]]+4], [[R3]];
  ; CHECK:       ld.param.u32 [[R4:%r[0-9]+]], [foo_St4x4_param_0+8];
  ; CHECK:       st.u32  [[[R1]]+8], [[R4]];
  ; CHECK:       ld.param.u32 [[R5:%r[0-9]+]], [foo_St4x4_param_0+12];
  ; CHECK:       st.u32  [[[R1]]+12], [[R5]];
  ; CHECK:       ret;
  ; Element 0 lives at the base address of both %in and %ret.
  %elt0 = load i32, ptr %in, align 4
  store i32 %elt0, ptr %ret, align 4
  ; Element 1.
  %src1 = getelementptr inbounds [4 x i32], ptr %in, i64 0, i64 1
  %elt1 = load i32, ptr %src1, align 4
  %dst1 = getelementptr inbounds [4 x i32], ptr %ret, i64 0, i64 1
  store i32 %elt1, ptr %dst1, align 4
  ; Element 2.
  %src2 = getelementptr inbounds [4 x i32], ptr %in, i64 0, i64 2
  %elt2 = load i32, ptr %src2, align 4
  %dst2 = getelementptr inbounds [4 x i32], ptr %ret, i64 0, i64 2
  store i32 %elt2, ptr %dst2, align 4
  ; Element 3.
  %src3 = getelementptr inbounds [4 x i32], ptr %in, i64 0, i64 3
  %elt3 = load i32, ptr %src3, align 4
  %dst3 = getelementptr inbounds [4 x i32], ptr %ret, i64 0, i64 3
  store i32 %elt3, ptr %dst3, align 4
  ret void
}

; foo_St4x5: copy the five i32 elements of the byval struct %in to %ret, in
; ascending index order.
; Expected PTX: param 0 is a 20-byte blob with .align 4; each element is read
; with its own scalar 32-bit ld.param.
define dso_local void @foo_St4x5(ptr nocapture noundef readonly byval(%struct.St4x5) align 4 %in, ptr nocapture noundef writeonly %ret) {
  ; CHECK-LABEL: .visible .func foo_St4x5(
  ; CHECK:               .param .align 4 .b8 foo_St4x5_param_0[20],
  ; CHECK:               .param .b64 foo_St4x5_param_1
  ; CHECK:       )
  ; CHECK:       ld.param.u64 [[R1:%rd[0-9]+]], [foo_St4x5_param_1];
  ; CHECK:       ld.param.u32 [[R2:%r[0-9]+]], [foo_St4x5_param_0];
  ; CHECK:       st.u32  [[[R1]]], [[R2]];
  ; CHECK:       ld.param.u32 [[R3:%r[0-9]+]], [foo_St4x5_param_0+4];
  ; CHECK:       st.u32  [[[R1]]+4], [[R3]];
  ; CHECK:       ld.param.u32 [[R4:%r[0-9]+]], [foo_St4x5_param_0+8];
  ; CHECK:       st.u32  [[[R1]]+8], [[R4]];
  ; CHECK:       ld.param.u32 [[R5:%r[0-9]+]], [foo_St4x5_param_0+12];
  ; CHECK:       st.u32  [[[R1]]+12], [[R5]];
  ; CHECK:       ld.param.u32 [[R6:%r[0-9]+]], [foo_St4x5_param_0+16];
  ; CHECK:       st.u32  [[[R1]]+16], [[R6]];
  ; CHECK:       ret;
  ; Element 0 lives at the base address of both %in and %ret.
  %elt0 = load i32, ptr %in, align 4
  store i32 %elt0, ptr %ret, align 4
  ; Element 1.
  %src1 = getelementptr inbounds [5 x i32], ptr %in, i64 0, i64 1
  %elt1 = load i32, ptr %src1, align 4
  %dst1 = getelementptr inbounds [5 x i32], ptr %ret, i64 0, i64 1
  store i32 %elt1, ptr %dst1, align 4
  ; Element 2.
  %src2 = getelementptr inbounds [5 x i32], ptr %in, i64 0, i64 2
  %elt2 = load i32, ptr %src2, align 4
  %dst2 = getelementptr inbounds [5 x i32], ptr %ret, i64 0, i64 2
  store i32 %elt2, ptr %dst2, align 4
  ; Element 3.
  %src3 = getelementptr inbounds [5 x i32], ptr %in, i64 0, i64 3
  %elt3 = load i32, ptr %src3, align 4
  %dst3 = getelementptr inbounds [5 x i32], ptr %ret, i64 0, i64 3
  store i32 %elt3, ptr %dst3, align 4
  ; Element 4.
  %src4 = getelementptr inbounds [5 x i32], ptr %in, i64 0, i64 4
  %elt4 = load i32, ptr %src4, align 4
  %dst4 = getelementptr inbounds [5 x i32], ptr %ret, i64 0, i64 4
  store i32 %elt4, ptr %dst4, align 4
  ret void
}

; foo_St4x6: copy the six i32 elements of the byval struct %in to %ret, in
; ascending index order.
; Expected PTX: param 0 is a 24-byte blob with .align 4; each element is read
; with its own scalar 32-bit ld.param.
define dso_local void @foo_St4x6(ptr nocapture noundef readonly byval(%struct.St4x6) align 4 %in, ptr nocapture noundef writeonly %ret) {
  ; CHECK-LABEL: .visible .func foo_St4x6(
  ; CHECK:               .param .align 4 .b8 foo_St4x6_param_0[24],
  ; CHECK:               .param .b64 foo_St4x6_param_1
  ; CHECK:       )
  ; CHECK:       ld.param.u64 [[R1:%rd[0-9]+]], [foo_St4x6_param_1];
  ; CHECK:       ld.param.u32 [[R2:%r[0-9]+]], [foo_St4x6_param_0];
  ; CHECK:       st.u32  [[[R1]]], [[R2]];
  ; CHECK:       ld.param.u32 [[R3:%r[0-9]+]], [foo_St4x6_param_0+4];
  ; CHECK:       st.u32  [[[R1]]+4], [[R3]];
  ; CHECK:       ld.param.u32 [[R4:%r[0-9]+]], [foo_St4x6_param_0+8];
  ; CHECK:       st.u32  [[[R1]]+8], [[R4]];
  ; CHECK:       ld.param.u32 [[R5:%r[0-9]+]], [foo_St4x6_param_0+12];
  ; CHECK:       st.u32  [[[R1]]+12], [[R5]];
  ; CHECK:       ld.param.u32 [[R6:%r[0-9]+]], [foo_St4x6_param_0+16];
  ; CHECK:       st.u32  [[[R1]]+16], [[R6]];
  ; CHECK:       ld.param.u32 [[R7:%r[0-9]+]], [foo_St4x6_param_0+20];
  ; CHECK:       st.u32  [[[R1]]+20], [[R7]];
  ; CHECK:       ret;
  ; Element 0 lives at the base address of both %in and %ret.
  %elt0 = load i32, ptr %in, align 4
  store i32 %elt0, ptr %ret, align 4
  ; Element 1.
  %src1 = getelementptr inbounds [6 x i32], ptr %in, i64 0, i64 1
  %elt1 = load i32, ptr %src1, align 4
  %dst1 = getelementptr inbounds [6 x i32], ptr %ret, i64 0, i64 1
  store i32 %elt1, ptr %dst1, align 4
  ; Element 2.
  %src2 = getelementptr inbounds [6 x i32], ptr %in, i64 0, i64 2
  %elt2 = load i32, ptr %src2, align 4
  %dst2 = getelementptr inbounds [6 x i32], ptr %ret, i64 0, i64 2
  store i32 %elt2, ptr %dst2, align 4
  ; Element 3.
  %src3 = getelementptr inbounds [6 x i32], ptr %in, i64 0, i64 3
  %elt3 = load i32, ptr %src3, align 4
  %dst3 = getelementptr inbounds [6 x i32], ptr %ret, i64 0, i64 3
  store i32 %elt3, ptr %dst3, align 4
  ; Element 4.
  %src4 = getelementptr inbounds [6 x i32], ptr %in, i64 0, i64 4
  %elt4 = load i32, ptr %src4, align 4
  %dst4 = getelementptr inbounds [6 x i32], ptr %ret, i64 0, i64 4
  store i32 %elt4, ptr %dst4, align 4
  ; Element 5.
  %src5 = getelementptr inbounds [6 x i32], ptr %in, i64 0, i64 5
  %elt5 = load i32, ptr %src5, align 4
  %dst5 = getelementptr inbounds [6 x i32], ptr %ret, i64 0, i64 5
  store i32 %elt5, ptr %dst5, align 4
  ret void
}

; foo_St4x7: copy the seven i32 elements of the byval struct %in to %ret, in
; ascending index order.
; Expected PTX: param 0 is a 28-byte blob with .align 4; each element is read
; with its own scalar 32-bit ld.param.
define dso_local void @foo_St4x7(ptr nocapture noundef readonly byval(%struct.St4x7) align 4 %in, ptr nocapture noundef writeonly %ret) {
  ; CHECK-LABEL: .visible .func foo_St4x7(
  ; CHECK:               .param .align 4 .b8 foo_St4x7_param_0[28],
  ; CHECK:               .param .b64 foo_St4x7_param_1
  ; CHECK:       )
  ; CHECK:       ld.param.u64 [[R1:%rd[0-9]+]], [foo_St4x7_param_1];
  ; CHECK:       ld.param.u32 [[R2:%r[0-9]+]], [foo_St4x7_param_0];
  ; CHECK:       st.u32  [[[R1]]], [[R2]];
  ; CHECK:       ld.param.u32 [[R3:%r[0-9]+]], [foo_St4x7_param_0+4];
  ; CHECK:       st.u32  [[[R1]]+4], [[R3]];
  ; CHECK:       ld.param.u32 [[R4:%r[0-9]+]], [foo_St4x7_param_0+8];
  ; CHECK:       st.u32  [[[R1]]+8], [[R4]];
  ; CHECK:       ld.param.u32 [[R5:%r[0-9]+]], [foo_St4x7_param_0+12];
  ; CHECK:       st.u32  [[[R1]]+12], [[R5]];
  ; CHECK:       ld.param.u32 [[R6:%r[0-9]+]], [foo_St4x7_param_0+16];
  ; CHECK:       st.u32  [[[R1]]+16], [[R6]];
  ; CHECK:       ld.param.u32 [[R7:%r[0-9]+]], [foo_St4x7_param_0+20];
  ; CHECK:       st.u32  [[[R1]]+20], [[R7]];
  ; CHECK:       ld.param.u32 [[R8:%r[0-9]+]], [foo_St4x7_param_0+24];
  ; CHECK:       st.u32  [[[R1]]+24], [[R8]];
  ; CHECK:       ret;
  ; Element 0 lives at the base address of both %in and %ret.
  %elt0 = load i32, ptr %in, align 4
  store i32 %elt0, ptr %ret, align 4
  ; Element 1.
  %src1 = getelementptr inbounds [7 x i32], ptr %in, i64 0, i64 1
  %elt1 = load i32, ptr %src1, align 4
  %dst1 = getelementptr inbounds [7 x i32], ptr %ret, i64 0, i64 1
  store i32 %elt1, ptr %dst1, align 4
  ; Element 2.
  %src2 = getelementptr inbounds [7 x i32], ptr %in, i64 0, i64 2
  %elt2 = load i32, ptr %src2, align 4
  %dst2 = getelementptr inbounds [7 x i32], ptr %ret, i64 0, i64 2
  store i32 %elt2, ptr %dst2, align 4
  ; Element 3.
  %src3 = getelementptr inbounds [7 x i32], ptr %in, i64 0, i64 3
  %elt3 = load i32, ptr %src3, align 4
  %dst3 = getelementptr inbounds [7 x i32], ptr %ret, i64 0, i64 3
  store i32 %elt3, ptr %dst3, align 4
  ; Element 4.
  %src4 = getelementptr inbounds [7 x i32], ptr %in, i64 0, i64 4
  %elt4 = load i32, ptr %src4, align 4
  %dst4 = getelementptr inbounds [7 x i32], ptr %ret, i64 0, i64 4
  store i32 %elt4, ptr %dst4, align 4
  ; Element 5.
  %src5 = getelementptr inbounds [7 x i32], ptr %in, i64 0, i64 5
  %elt5 = load i32, ptr %src5, align 4
  %dst5 = getelementptr inbounds [7 x i32], ptr %ret, i64 0, i64 5
  store i32 %elt5, ptr %dst5, align 4
  ; Element 6.
  %src6 = getelementptr inbounds [7 x i32], ptr %in, i64 0, i64 6
  %elt6 = load i32, ptr %src6, align 4
  %dst6 = getelementptr inbounds [7 x i32], ptr %ret, i64 0, i64 6
  store i32 %elt6, ptr %dst6, align 4
  ret void
}

; foo_St4x8: copy the eight i32 elements of the byval struct %in to %ret, in
; ascending index order.
; Expected PTX: param 0 is a 32-byte blob with .align 4; each element is read
; with its own scalar 32-bit ld.param.
define dso_local void @foo_St4x8(ptr nocapture noundef readonly byval(%struct.St4x8) align 4 %in, ptr nocapture noundef writeonly %ret) {
  ; CHECK-LABEL: .visible .func foo_St4x8(
  ; CHECK:               .param .align 4 .b8 foo_St4x8_param_0[32],
  ; CHECK:               .param .b64 foo_St4x8_param_1
  ; CHECK:       )
  ; CHECK:       ld.param.u64 [[R1:%rd[0-9]+]], [foo_St4x8_param_1];
  ; CHECK:       ld.param.u32 [[R2:%r[0-9]+]], [foo_St4x8_param_0];
  ; CHECK:       st.u32  [[[R1]]], [[R2]];
  ; CHECK:       ld.param.u32 [[R3:%r[0-9]+]], [foo_St4x8_param_0+4];
  ; CHECK:       st.u32  [[[R1]]+4], [[R3]];
  ; CHECK:       ld.param.u32 [[R4:%r[0-9]+]], [foo_St4x8_param_0+8];
  ; CHECK:       st.u32  [[[R1]]+8], [[R4]];
  ; CHECK:       ld.param.u32 [[R5:%r[0-9]+]], [foo_St4x8_param_0+12];
  ; CHECK:       st.u32  [[[R1]]+12], [[R5]];
  ; CHECK:       ld.param.u32 [[R6:%r[0-9]+]], [foo_St4x8_param_0+16];
  ; CHECK:       st.u32  [[[R1]]+16], [[R6]];
  ; CHECK:       ld.param.u32 [[R7:%r[0-9]+]], [foo_St4x8_param_0+20];
  ; CHECK:       st.u32  [[[R1]]+20], [[R7]];
  ; CHECK:       ld.param.u32 [[R8:%r[0-9]+]], [foo_St4x8_param_0+24];
  ; CHECK:       st.u32  [[[R1]]+24], [[R8]];
  ; CHECK:       ld.param.u32 [[R9:%r[0-9]+]], [foo_St4x8_param_0+28];
  ; CHECK:       st.u32  [[[R1]]+28], [[R9]];
  ; CHECK:       ret;
  ; Element 0 lives at the base address of both %in and %ret.
  %elt0 = load i32, ptr %in, align 4
  store i32 %elt0, ptr %ret, align 4
  ; Element 1.
  %src1 = getelementptr inbounds [8 x i32], ptr %in, i64 0, i64 1
  %elt1 = load i32, ptr %src1, align 4
  %dst1 = getelementptr inbounds [8 x i32], ptr %ret, i64 0, i64 1
  store i32 %elt1, ptr %dst1, align 4
  ; Element 2.
  %src2 = getelementptr inbounds [8 x i32], ptr %in, i64 0, i64 2
  %elt2 = load i32, ptr %src2, align 4
  %dst2 = getelementptr inbounds [8 x i32], ptr %ret, i64 0, i64 2
  store i32 %elt2, ptr %dst2, align 4
  ; Element 3.
  %src3 = getelementptr inbounds [8 x i32], ptr %in, i64 0, i64 3
  %elt3 = load i32, ptr %src3, align 4
  %dst3 = getelementptr inbounds [8 x i32], ptr %ret, i64 0, i64 3
  store i32 %elt3, ptr %dst3, align 4
  ; Element 4.
  %src4 = getelementptr inbounds [8 x i32], ptr %in, i64 0, i64 4
  %elt4 = load i32, ptr %src4, align 4
  %dst4 = getelementptr inbounds [8 x i32], ptr %ret, i64 0, i64 4
  store i32 %elt4, ptr %dst4, align 4
  ; Element 5.
  %src5 = getelementptr inbounds [8 x i32], ptr %in, i64 0, i64 5
  %elt5 = load i32, ptr %src5, align 4
  %dst5 = getelementptr inbounds [8 x i32], ptr %ret, i64 0, i64 5
  store i32 %elt5, ptr %dst5, align 4
  ; Element 6.
  %src6 = getelementptr inbounds [8 x i32], ptr %in, i64 0, i64 6
  %elt6 = load i32, ptr %src6, align 4
  %dst6 = getelementptr inbounds [8 x i32], ptr %ret, i64 0, i64 6
  store i32 %elt6, ptr %dst6, align 4
  ; Element 7.
  %src7 = getelementptr inbounds [8 x i32], ptr %in, i64 0, i64 7
  %elt7 = load i32, ptr %src7, align 4
  %dst7 = getelementptr inbounds [8 x i32], ptr %ret, i64 0, i64 7
  store i32 %elt7, ptr %dst7, align 4
  ret void
}

; foo_St8x1: copy the single i64 element of the byval struct %in to %ret.
; Expected PTX: param 0 is declared as an 8-byte blob with .align 8 and is
; read with one scalar 64-bit ld.param.
define dso_local void @foo_St8x1(ptr nocapture noundef readonly byval(%struct.St8x1) align 8 %in, ptr nocapture noundef writeonly %ret) {
  ; CHECK-LABEL: .visible .func foo_St8x1(
  ; CHECK:               .param .align 8 .b8 foo_St8x1_param_0[8],
  ; CHECK:               .param .b64 foo_St8x1_param_1
  ; CHECK:       )
  ; CHECK:       ld.param.u64 [[R1:%rd[0-9]+]], [foo_St8x1_param_1];
  ; CHECK:       ld.param.u64 [[RD1:%rd[0-9]+]], [foo_St8x1_param_0];
  ; CHECK:       st.u64 [[[R1]]], [[RD1]];
  ; CHECK:       ret;
  ; Element 0: the only field.
  %elt0 = load i64, ptr %in, align 8
  store i64 %elt0, ptr %ret, align 8
  ret void
}

; foo_St8x2: copy both i64 elements of the byval struct %in to %ret, in
; ascending index order.
; Expected PTX: param 0 is a 16-byte blob with .align 8; each element is read
; with its own scalar 64-bit ld.param.
define dso_local void @foo_St8x2(ptr nocapture noundef readonly byval(%struct.St8x2) align 8 %in, ptr nocapture noundef writeonly %ret) {
  ; CHECK-LABEL: .visible .func foo_St8x2(
  ; CHECK:               .param .align 8 .b8 foo_St8x2_param_0[16],
  ; CHECK:               .param .b64 foo_St8x2_param_1
  ; CHECK:       )
  ; CHECK:       ld.param.u64 [[R1:%rd[0-9]+]], [foo_St8x2_param_1];
  ; CHECK:       ld.param.u64 [[RD1:%rd[0-9]+]], [foo_St8x2_param_0];
  ; CHECK:       st.u64 [[[R1]]], [[RD1]];
  ; CHECK:       ld.param.u64 [[RD2:%rd[0-9]+]], [foo_St8x2_param_0+8];
  ; CHECK:       st.u64 [[[R1]]+8], [[RD2]];
  ; CHECK:       ret;
  ; Element 0 lives at the base address of both %in and %ret.
  %elt0 = load i64, ptr %in, align 8
  store i64 %elt0, ptr %ret, align 8
  ; Element 1.
  %src1 = getelementptr inbounds [2 x i64], ptr %in, i64 0, i64 1
  %elt1 = load i64, ptr %src1, align 8
  %dst1 = getelementptr inbounds [2 x i64], ptr %ret, i64 0, i64 1
  store i64 %elt1, ptr %dst1, align 8
  ret void
}

; foo_St8x3: copy the three i64 elements of the byval struct %in to %ret, in
; ascending index order.
; Expected PTX: param 0 is a 24-byte blob with .align 8; each element is read
; with its own scalar 64-bit ld.param.
define dso_local void @foo_St8x3(ptr nocapture noundef readonly byval(%struct.St8x3) align 8 %in, ptr nocapture noundef writeonly %ret) {
  ; CHECK-LABEL: .visible .func foo_St8x3(
  ; CHECK:               .param .align 8 .b8 foo_St8x3_param_0[24],
  ; CHECK:               .param .b64 foo_St8x3_param_1
  ; CHECK:       )
  ; CHECK:       ld.param.u64 [[R1:%rd[0-9]+]], [foo_St8x3_param_1];
  ; CHECK:       ld.param.u64 [[RD1:%rd[0-9]+]], [foo_St8x3_param_0];
  ; CHECK:       st.u64 [[[R1]]], [[RD1]];
  ; CHECK:       ld.param.u64 [[RD2:%rd[0-9]+]], [foo_St8x3_param_0+8];
  ; CHECK:       st.u64 [[[R1]]+8], [[RD2]];
  ; CHECK:       ld.param.u64 [[RD3:%rd[0-9]+]], [foo_St8x3_param_0+16];
  ; CHECK:       st.u64 [[[R1]]+16], [[RD3]];
  ; CHECK:       ret;
  ; Element 0 lives at the base address of both %in and %ret.
  %elt0 = load i64, ptr %in, align 8
  store i64 %elt0, ptr %ret, align 8
  ; Element 1.
  %src1 = getelementptr inbounds [3 x i64], ptr %in, i64 0, i64 1
  %elt1 = load i64, ptr %src1, align 8
  %dst1 = getelementptr inbounds [3 x i64], ptr %ret, i64 0, i64 1
  store i64 %elt1, ptr %dst1, align 8
  ; Element 2.
  %src2 = getelementptr inbounds [3 x i64], ptr %in, i64 0, i64 2
  %elt2 = load i64, ptr %src2, align 8
  %dst2 = getelementptr inbounds [3 x i64], ptr %ret, i64 0, i64 2
  store i64 %elt2, ptr %dst2, align 8
  ret void
}

; foo_St8x4: copy the four i64 elements of the byval struct %in to %ret, in
; ascending index order.
; Expected PTX: param 0 is a 32-byte blob with .align 8; each element is read
; with its own scalar 64-bit ld.param.
define dso_local void @foo_St8x4(ptr nocapture noundef readonly byval(%struct.St8x4) align 8 %in, ptr nocapture noundef writeonly %ret) {
  ; CHECK-LABEL: .visible .func foo_St8x4(
  ; CHECK:               .param .align 8 .b8 foo_St8x4_param_0[32],
  ; CHECK:               .param .b64 foo_St8x4_param_1
  ; CHECK:       )
  ; CHECK:       ld.param.u64 [[R1:%rd[0-9]+]], [foo_St8x4_param_1];
  ; CHECK:       ld.param.u64 [[RD1:%rd[0-9]+]], [foo_St8x4_param_0];
  ; CHECK:       st.u64 [[[R1]]], [[RD1]];
  ; CHECK:       ld.param.u64 [[RD2:%rd[0-9]+]], [foo_St8x4_param_0+8];
  ; CHECK:       st.u64 [[[R1]]+8], [[RD2]];
  ; CHECK:       ld.param.u64 [[RD3:%rd[0-9]+]], [foo_St8x4_param_0+16];
  ; CHECK:       st.u64 [[[R1]]+16], [[RD3]];
  ; CHECK:       ld.param.u64 [[RD4:%rd[0-9]+]], [foo_St8x4_param_0+24];
  ; CHECK:       st.u64 [[[R1]]+24], [[RD4]];
  ; CHECK:       ret;
  ; Element 0 lives at the base address of both %in and %ret.
  %elt0 = load i64, ptr %in, align 8
  store i64 %elt0, ptr %ret, align 8
  ; Element 1.
  %src1 = getelementptr inbounds [4 x i64], ptr %in, i64 0, i64 1
  %elt1 = load i64, ptr %src1, align 8
  %dst1 = getelementptr inbounds [4 x i64], ptr %ret, i64 0, i64 1
  store i64 %elt1, ptr %dst1, align 8
  ; Element 2.
  %src2 = getelementptr inbounds [4 x i64], ptr %in, i64 0, i64 2
  %elt2 = load i64, ptr %src2, align 8
  %dst2 = getelementptr inbounds [4 x i64], ptr %ret, i64 0, i64 2
  store i64 %elt2, ptr %dst2, align 8
  ; Element 3.
  %src3 = getelementptr inbounds [4 x i64], ptr %in, i64 0, i64 3
  %elt3 = load i64, ptr %src3, align 8
  %dst3 = getelementptr inbounds [4 x i64], ptr %ret, i64 0, i64 3
  store i64 %elt3, ptr %dst3, align 8
  ret void
}