llvm/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-multivec-stores.ll

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1,+bf16 < %s | FileCheck %s

;
; ST2Q
;
define void @st2q_ss_i8(<vscale x 16 x i8> %v0, <vscale x 16 x i8> %v1, <vscale x 16 x i1> %pred, ptr %addr, i64 %offset) {
; CHECK-LABEL: st2q_ss_i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    st2q { z0.q, z1.q }, p0, [x0, x1, lsl #4]
; CHECK-NEXT:    ret
  %1 = getelementptr i128, ptr %addr, i64 %offset
  call void @llvm.aarch64.sve.st2q.nxv16i8(<vscale x 16 x i8>%v0, <vscale x 16 x i8> %v1 ,
                                           <vscale x 16 x i1> %pred,
                                           ptr %1)
  ret void
}

define void @st2q_ss_i16(<vscale x 8 x i16> %v0, <vscale x 8 x i16> %v1, <vscale x 8 x i1> %pred, ptr %addr, i64 %offset) {
; CHECK-LABEL: st2q_ss_i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    st2q { z0.q, z1.q }, p0, [x0, x1, lsl #4]
; CHECK-NEXT:    ret
  %1 = getelementptr i128, ptr %addr, i64 %offset
  call void @llvm.aarch64.sve.st2q.nxv8i16(<vscale x 8 x i16> %v0,
                                          <vscale x 8 x i16> %v1,
                                          <vscale x 8 x i1> %pred,
                                          ptr %1)
  ret void
}

define void @st2q_ss_i32(<vscale x 4 x i32> %v0, <vscale x 4 x i32> %v1, <vscale x 4 x i1> %pred, ptr %addr, i64 %offset) {
; CHECK-LABEL: st2q_ss_i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    st2q { z0.q, z1.q }, p0, [x0, x1, lsl #4]
; CHECK-NEXT:    ret
  %1 = getelementptr i128, ptr %addr, i64 %offset
  call void @llvm.aarch64.sve.st2q.nxv4i32(<vscale x 4 x i32> %v0,
                                          <vscale x 4 x i32> %v1,
                                          <vscale x 4 x i1> %pred,
                                          ptr %1)
  ret void
}

define void @st2q_ss_i64(<vscale x 2 x i64> %v0, <vscale x 2 x i64> %v1, <vscale x 2 x i1> %pred, ptr %addr, i64 %offset) {
; CHECK-LABEL: st2q_ss_i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    st2q { z0.q, z1.q }, p0, [x0, x1, lsl #4]
; CHECK-NEXT:    ret
  %1 = getelementptr i128, ptr %addr, i64 %offset
  call void @llvm.aarch64.sve.st2q.nxv2i64(<vscale x 2 x i64> %v0,
                                          <vscale x 2 x i64> %v1,
                                          <vscale x 2 x i1> %pred,
                                          ptr %1)
  ret void
}

define void @st2q_ss_f16(<vscale x 8 x half> %v0, <vscale x 8 x half> %v1, <vscale x 8 x i1> %pred, ptr %addr, i64 %offset) {
; CHECK-LABEL: st2q_ss_f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    st2q { z0.q, z1.q }, p0, [x0, x1, lsl #4]
; CHECK-NEXT:    ret
  %1 = getelementptr i128, ptr %addr, i64 %offset
  call void @llvm.aarch64.sve.st2q.nxv8f16(<vscale x 8 x half> %v0,
                                          <vscale x 8 x half> %v1,
                                          <vscale x 8 x i1> %pred,
                                          ptr %1)
  ret void
}

define void @st2q_ss_f32(<vscale x 4 x float> %v0, <vscale x 4 x float> %v1, <vscale x 4 x i1> %pred, ptr %addr, i64 %offset) {
; CHECK-LABEL: st2q_ss_f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    st2q { z0.q, z1.q }, p0, [x0, x1, lsl #4]
; CHECK-NEXT:    ret
  %1 = getelementptr i128, ptr %addr, i64 %offset
  call void @llvm.aarch64.sve.st2q.nxv4f32(<vscale x 4 x float> %v0,
                                          <vscale x 4 x float> %v1,
                                          <vscale x 4 x i1> %pred,
                                          ptr %1)
  ret void
}

define void @st2q_ss_f64(<vscale x 2 x double> %v0, <vscale x 2 x double> %v1, <vscale x 2 x i1> %pred, ptr %addr, i64 %offset) {
; CHECK-LABEL: st2q_ss_f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    st2q { z0.q, z1.q }, p0, [x0, x1, lsl #4]
; CHECK-NEXT:    ret
  %1 = getelementptr i128, ptr %addr, i64 %offset
  call void @llvm.aarch64.sve.st2q.nxv2f64(<vscale x 2 x double> %v0,
                                          <vscale x 2 x double> %v1,
                                          <vscale x 2 x i1> %pred,
                                          ptr %1)
  ret void
}

define void @st2q_ss_bf16(<vscale x 8 x bfloat> %v0, <vscale x 8 x bfloat> %v1, <vscale x 8 x i1> %pred, ptr %addr, i64 %offset) {
; CHECK-LABEL: st2q_ss_bf16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    st2q { z0.q, z1.q }, p0, [x0, x1, lsl #4]
; CHECK-NEXT:    ret
  %1 = getelementptr i128, ptr %addr, i64 %offset
  call void @llvm.aarch64.sve.st2q.nxv8bf16(<vscale x 8 x bfloat> %v0,
                                          <vscale x 8 x bfloat> %v1,
                                          <vscale x 8 x i1> %pred,
                                          ptr %1)
  ret void
}


define void @st2q_si_i8_off16(<vscale x 16 x i8> %v0, <vscale x 16 x i8> %v1, <vscale x 16 x i1> %pred, ptr %addr) {
; CHECK-LABEL: st2q_si_i8_off16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    st2q { z0.q, z1.q }, p0, [x0, #-16, mul vl]
; CHECK-NEXT:    ret
  %base = getelementptr <vscale x 16 x i8>, ptr %addr, i64 -16
  call void @llvm.aarch64.sve.st2q.nxv16i8(<vscale x 16 x i8> %v0,
                                           <vscale x 16 x i8> %v1,
                                           <vscale x 16 x i1> %pred,
                                           ptr %base)
  ret void
}

define void @st2q_si_i8_off14(<vscale x 16 x i8> %v0, <vscale x 16 x i8> %v1, <vscale x 16 x i1> %pred, ptr %addr) {
; CHECK-LABEL: st2q_si_i8_off14:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    st2q { z0.q, z1.q }, p0, [x0, #14, mul vl]
; CHECK-NEXT:    ret
  %base = getelementptr <vscale x 16 x i8>, ptr %addr, i64 14
  call void @llvm.aarch64.sve.st2q.nxv16i8(<vscale x 16 x i8> %v0,
                                           <vscale x 16 x i8> %v1,
                                           <vscale x 16 x i1> %pred,
                                           ptr %base)
  ret void
}

define void @st2q_si_i16(<vscale x 8 x i16> %v0, <vscale x 8 x i16> %v1, <vscale x 8 x i1> %pred, ptr %base) {
; CHECK-LABEL: st2q_si_i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    st2q { z0.q, z1.q }, p0, [x0, #14, mul vl]
; CHECK-NEXT:    ret
  %gep = getelementptr <vscale x 8 x i16>, ptr %base, i64 14
  call void @llvm.aarch64.sve.st2q.nxv8i16(<vscale x 8 x i16> %v0,
                                          <vscale x 8 x i16> %v1,
                                          <vscale x 8 x i1> %pred,
                                          ptr %gep)
  ret void
}

define void @st2q_si_i32(<vscale x 4 x i32> %v0, <vscale x 4 x i32> %v1, <vscale x 4 x i1> %pred, ptr %base) {
; CHECK-LABEL: st2q_si_i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    st2q { z0.q, z1.q }, p0, [x0, #14, mul vl]
; CHECK-NEXT:    ret
  %gep = getelementptr <vscale x 4 x i32>, ptr %base, i64 14
  call void @llvm.aarch64.sve.st2q.nxv4i32(<vscale x 4 x i32> %v0,
                                          <vscale x 4 x i32> %v1,
                                          <vscale x 4 x i1> %pred,
                                          ptr %gep)
  ret void
}

define void @st2q_si_i64(<vscale x 2 x i64> %v0, <vscale x 2 x i64> %v1, <vscale x 2 x i1> %pred, ptr %base) {
; CHECK-LABEL: st2q_si_i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    st2q { z0.q, z1.q }, p0, [x0, #14, mul vl]
; CHECK-NEXT:    ret
  %gep = getelementptr <vscale x 2 x i64>, ptr %base, i64 14
  call void @llvm.aarch64.sve.st2q.nxv2i64(<vscale x 2 x i64> %v0,
                                          <vscale x 2 x i64> %v1,
                                          <vscale x 2 x i1> %pred,
                                          ptr %gep)
  ret void
}

define void @st2q_si_f16(<vscale x 8 x half> %v0, <vscale x 8 x half> %v1, <vscale x 8 x i1> %pred, ptr %base) {
; CHECK-LABEL: st2q_si_f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    st2q { z0.q, z1.q }, p0, [x0, #14, mul vl]
; CHECK-NEXT:    ret
  %gep = getelementptr <vscale x 8 x half>, ptr %base, i64 14
  call void @llvm.aarch64.sve.st2q.nxv8f16(<vscale x 8 x half> %v0,
                                          <vscale x 8 x half> %v1,
                                          <vscale x 8 x i1> %pred,
                                          ptr %gep)
  ret void
}

define void @st2q_si_f32(<vscale x 4 x float> %v0, <vscale x 4 x float> %v1, <vscale x 4 x i1> %pred, ptr %base) {
; CHECK-LABEL: st2q_si_f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    st2q { z0.q, z1.q }, p0, [x0, #14, mul vl]
; CHECK-NEXT:    ret
  %gep = getelementptr <vscale x 4 x float>, ptr %base, i64 14
  call void @llvm.aarch64.sve.st2q.nxv4f32(<vscale x 4 x float> %v0,
                                          <vscale x 4 x float> %v1,
                                          <vscale x 4 x i1> %pred,
                                          ptr %gep)
  ret void
}

define void @st2q_si_f64(<vscale x 2 x double> %v0, <vscale x 2 x double> %v1, <vscale x 2 x i1> %pred, ptr %base) {
; CHECK-LABEL: st2q_si_f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    st2q { z0.q, z1.q }, p0, [x0, #14, mul vl]
; CHECK-NEXT:    ret
  %gep= getelementptr <vscale x 2 x double>, ptr %base, i64 14
  call void @llvm.aarch64.sve.st2q.nxv2f64(<vscale x 2 x double> %v0,
                                          <vscale x 2 x double> %v1,
                                          <vscale x 2 x i1> %pred,
                                          ptr %gep)
  ret void
}

define void @st2q_si_bf16(<vscale x 8 x bfloat> %v0, <vscale x 8 x bfloat> %v1, <vscale x 8 x i1> %pred, ptr %base) {
; CHECK-LABEL: st2q_si_bf16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    st2q { z0.q, z1.q }, p0, [x0, #14, mul vl]
; CHECK-NEXT:    ret
  %gep = getelementptr <vscale x 8 x bfloat>, ptr %base, i64 14
  call void @llvm.aarch64.sve.st2q.nxv8bf16(<vscale x 8 x bfloat> %v0,
                                          <vscale x 8 x bfloat> %v1,
                                          <vscale x 8 x i1> %pred,
                                          ptr %gep)
  ret void
}


;
; ST3Q
;
define void @st3q_ss_i8(<vscale x 16 x i8> %v0, <vscale x 16 x i8> %v1, <vscale x 16 x i8> %v2, <vscale x 16 x i1> %pred, ptr %addr, i64 %offset) {
; CHECK-LABEL: st3q_ss_i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2 def $z0_z1_z2
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2 def $z0_z1_z2
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2 def $z0_z1_z2
; CHECK-NEXT:    st3q { z0.q - z2.q }, p0, [x0, x1, lsl #4]
; CHECK-NEXT:    ret
  %1 = getelementptr i128, ptr %addr, i64 %offset
  call void @llvm.aarch64.sve.st3q.nxv16i8(<vscale x 16 x i8>%v0,
                                           <vscale x 16 x i8> %v1,
                                           <vscale x 16 x i8> %v2,
                                           <vscale x 16 x i1> %pred,
                                           ptr %1)
  ret void
}

define void @st3q_ss_i16(<vscale x 8 x i16> %v0, <vscale x 8 x i16> %v1, <vscale x 8 x i16> %v2,  <vscale x 8 x i1> %pred, ptr %addr, i64 %offset) {
; CHECK-LABEL: st3q_ss_i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2 def $z0_z1_z2
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2 def $z0_z1_z2
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2 def $z0_z1_z2
; CHECK-NEXT:    st3q { z0.q - z2.q }, p0, [x0, x1, lsl #4]
; CHECK-NEXT:    ret
  %1 = getelementptr i128, ptr %addr, i64 %offset
  call void @llvm.aarch64.sve.st3q.nxv8i16(<vscale x 8 x i16> %v0,
                                           <vscale x 8 x i16> %v1,
                                           <vscale x 8 x i16> %v2,
                                           <vscale x 8 x i1> %pred,
                                           ptr %1)
  ret void
}

define void @st3q_ss_i32(<vscale x 4 x i32> %v0, <vscale x 4 x i32> %v1, <vscale x 4 x i32> %v2, <vscale x 4 x i1> %pred, ptr %addr, i64 %offset) {
; CHECK-LABEL: st3q_ss_i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2 def $z0_z1_z2
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2 def $z0_z1_z2
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2 def $z0_z1_z2
; CHECK-NEXT:    st3q { z0.q - z2.q }, p0, [x0, x1, lsl #4]
; CHECK-NEXT:    ret
  %1 = getelementptr i128, ptr %addr, i64 %offset
  call void @llvm.aarch64.sve.st3q.nxv4i32(<vscale x 4 x i32> %v0,
                                           <vscale x 4 x i32> %v1,
                                           <vscale x 4 x i32> %v2,
                                           <vscale x 4 x i1> %pred,
                                           ptr %1)
  ret void
}

define void @st3q_ss_i64(<vscale x 2 x i64> %v0, <vscale x 2 x i64> %v1, <vscale x 2 x i64> %v2, <vscale x 2 x i1> %pred, ptr %addr, i64 %offset) {
; CHECK-LABEL: st3q_ss_i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2 def $z0_z1_z2
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2 def $z0_z1_z2
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2 def $z0_z1_z2
; CHECK-NEXT:    st3q { z0.q - z2.q }, p0, [x0, x1, lsl #4]
; CHECK-NEXT:    ret
  %1 = getelementptr i128, ptr %addr, i64 %offset
  call void @llvm.aarch64.sve.st3q.nxv2i64(<vscale x 2 x i64> %v0,
                                           <vscale x 2 x i64> %v1,
                                           <vscale x 2 x i64> %v2,
                                           <vscale x 2 x i1> %pred,
                                           ptr %1)
  ret void
}

define void @st3q_ss_f16(<vscale x 8 x half> %v0, <vscale x 8 x half> %v1, <vscale x 8 x half> %v2, <vscale x 8 x i1> %pred, ptr %addr, i64 %offset) {
; CHECK-LABEL: st3q_ss_f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2 def $z0_z1_z2
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2 def $z0_z1_z2
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2 def $z0_z1_z2
; CHECK-NEXT:    st3q { z0.q - z2.q }, p0, [x0, x1, lsl #4]
; CHECK-NEXT:    ret
  %1 = getelementptr i128, ptr %addr, i64 %offset
  call void @llvm.aarch64.sve.st3q.nxv8f16(<vscale x 8 x half> %v0,
                                           <vscale x 8 x half> %v1,
                                           <vscale x 8 x half> %v2,
                                           <vscale x 8 x i1> %pred,
                                           ptr %1)
  ret void
}

define void @st3q_ss_f32(<vscale x 4 x float> %v0, <vscale x 4 x float> %v1, <vscale x 4 x float> %v2, <vscale x 4 x i1> %pred, ptr %addr, i64 %offset) {
; CHECK-LABEL: st3q_ss_f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2 def $z0_z1_z2
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2 def $z0_z1_z2
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2 def $z0_z1_z2
; CHECK-NEXT:    st3q { z0.q - z2.q }, p0, [x0, x1, lsl #4]
; CHECK-NEXT:    ret
  %1 = getelementptr i128, ptr %addr, i64 %offset
  call void @llvm.aarch64.sve.st3q.nxv4f32(<vscale x 4 x float> %v0,
                                           <vscale x 4 x float> %v1,
                                           <vscale x 4 x float> %v2,
                                           <vscale x 4 x i1> %pred,
                                           ptr %1)
  ret void
}

define void @st3q_ss_f64(<vscale x 2 x double> %v0, <vscale x 2 x double> %v1, <vscale x 2 x double> %v2, <vscale x 2 x i1> %pred, ptr %addr, i64 %offset) {
; CHECK-LABEL: st3q_ss_f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2 def $z0_z1_z2
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2 def $z0_z1_z2
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2 def $z0_z1_z2
; CHECK-NEXT:    st3q { z0.q - z2.q }, p0, [x0, x1, lsl #4]
; CHECK-NEXT:    ret
  %1 = getelementptr i128, ptr %addr, i64 %offset
  call void @llvm.aarch64.sve.st3q.nxv2f64(<vscale x 2 x double> %v0,
                                           <vscale x 2 x double> %v1,
                                           <vscale x 2 x double> %v2,
                                           <vscale x 2 x i1> %pred,
                                           ptr %1)
  ret void
}

define void @st3q_ss_bf16(<vscale x 8 x bfloat> %v0, <vscale x 8 x bfloat> %v1, <vscale x 8 x bfloat> %v2, <vscale x 8 x i1> %pred, ptr %addr, i64 %offset) {
; CHECK-LABEL: st3q_ss_bf16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2 def $z0_z1_z2
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2 def $z0_z1_z2
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2 def $z0_z1_z2
; CHECK-NEXT:    st3q { z0.q - z2.q }, p0, [x0, x1, lsl #4]
; CHECK-NEXT:    ret
  %1 = getelementptr i128, ptr %addr, i64 %offset
  call void @llvm.aarch64.sve.st3q.nxv8bf16(<vscale x 8 x bfloat> %v0,
                                            <vscale x 8 x bfloat> %v1,
                                            <vscale x 8 x bfloat> %v2,
                                            <vscale x 8 x i1> %pred,
                                            ptr %1)
  ret void
}

define void @st3q_si_i8_off24(<vscale x 16 x i8> %v0, <vscale x 16 x i8> %v1, <vscale x 16 x i8> %v2, <vscale x 16 x i1> %pred, ptr %addr) {
; CHECK-LABEL: st3q_si_i8_off24:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2 def $z0_z1_z2
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2 def $z0_z1_z2
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2 def $z0_z1_z2
; CHECK-NEXT:    st3q { z0.q - z2.q }, p0, [x0, #-24, mul vl]
; CHECK-NEXT:    ret
  %base = getelementptr <vscale x 16 x i8>, ptr %addr, i64 -24
  call void @llvm.aarch64.sve.st3q.nxv16i8(<vscale x 16 x i8> %v0,
                                           <vscale x 16 x i8> %v1,
                                           <vscale x 16 x i8> %v2,
                                           <vscale x 16 x i1> %pred,
                                           ptr %base)
  ret void
}

define void @st3q_si_i8_off21(<vscale x 16 x i8> %v0, <vscale x 16 x i8> %v1, <vscale x 16 x i8> %v2, <vscale x 16 x i1> %pred, ptr %addr) {
; CHECK-LABEL: st3q_si_i8_off21:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2 def $z0_z1_z2
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2 def $z0_z1_z2
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2 def $z0_z1_z2
; CHECK-NEXT:    st3q { z0.q - z2.q }, p0, [x0, #21, mul vl]
; CHECK-NEXT:    ret
  %base = getelementptr <vscale x 16 x i8>, ptr %addr, i64 21
  call void @llvm.aarch64.sve.st3q.nxv16i8(<vscale x 16 x i8> %v0,
                                           <vscale x 16 x i8> %v1,
                                           <vscale x 16 x i8> %v2,
                                           <vscale x 16 x i1> %pred,
                                           ptr %base)
  ret void
}

define void @st3q_si_i16(<vscale x 8 x i16> %v0, <vscale x 8 x i16> %v1, <vscale x 8 x i16> %v2,  <vscale x 8 x i1> %pred, ptr %addr) {
; CHECK-LABEL: st3q_si_i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2 def $z0_z1_z2
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2 def $z0_z1_z2
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2 def $z0_z1_z2
; CHECK-NEXT:    st3q { z0.q - z2.q }, p0, [x0, #21, mul vl]
; CHECK-NEXT:    ret
  %base = getelementptr <vscale x 8 x i16>, ptr %addr, i64 21
  call void @llvm.aarch64.sve.st3q.nxv8i16(<vscale x 8 x i16> %v0,
                                           <vscale x 8 x i16> %v1,
                                           <vscale x 8 x i16> %v2,
                                           <vscale x 8 x i1> %pred,
                                           ptr %base)
  ret void
}

define void @st3q_si_i32(<vscale x 4 x i32> %v0, <vscale x 4 x i32> %v1, <vscale x 4 x i32> %v2, <vscale x 4 x i1> %pred, ptr %addr) {
; CHECK-LABEL: st3q_si_i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2 def $z0_z1_z2
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2 def $z0_z1_z2
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2 def $z0_z1_z2
; CHECK-NEXT:    st3q { z0.q - z2.q }, p0, [x0, #21, mul vl]
; CHECK-NEXT:    ret
  %base = getelementptr <vscale x 4 x i32>, ptr %addr, i64 21
  call void @llvm.aarch64.sve.st3q.nxv4i32(<vscale x 4 x i32> %v0,
                                           <vscale x 4 x i32> %v1,
                                           <vscale x 4 x i32> %v2,
                                           <vscale x 4 x i1> %pred,
                                           ptr %base)
  ret void
}

define void @st3q_si_i64(<vscale x 2 x i64> %v0, <vscale x 2 x i64> %v1,<vscale x 2 x i64> %v2, <vscale x 2 x i1> %pred, ptr %addr) {
; CHECK-LABEL: st3q_si_i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2 def $z0_z1_z2
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2 def $z0_z1_z2
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2 def $z0_z1_z2
; CHECK-NEXT:    st3q { z0.q - z2.q }, p0, [x0, #21, mul vl]
; CHECK-NEXT:    ret
  %base = getelementptr <vscale x 2 x i64>, ptr %addr, i64 21
  call void @llvm.aarch64.sve.st3q.nxv2i64(<vscale x 2 x i64> %v0,
                                           <vscale x 2 x i64> %v1,
                                           <vscale x 2 x i64> %v2,
                                           <vscale x 2 x i1> %pred,
                                           ptr %base)
  ret void
}

define void @st3q_si_f16(<vscale x 8 x half> %v0, <vscale x 8 x half> %v1, <vscale x 8 x half> %v2, <vscale x 8 x i1> %pred, ptr %addr) {
; CHECK-LABEL: st3q_si_f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2 def $z0_z1_z2
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2 def $z0_z1_z2
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2 def $z0_z1_z2
; CHECK-NEXT:    st3q { z0.q - z2.q }, p0, [x0, #21, mul vl]
; CHECK-NEXT:    ret
  %base = getelementptr <vscale x 8 x half>, ptr %addr, i64 21
  call void @llvm.aarch64.sve.st3q.nxv8f16(<vscale x 8 x half> %v0,
                                           <vscale x 8 x half> %v1,
                                           <vscale x 8 x half> %v2,
                                           <vscale x 8 x i1> %pred,
                                           ptr %base)
  ret void
}

define void @st3q_si_f32(<vscale x 4 x float> %v0, <vscale x 4 x float> %v1, <vscale x 4 x float> %v2, <vscale x 4 x i1> %pred, ptr %addr) {
; CHECK-LABEL: st3q_si_f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2 def $z0_z1_z2
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2 def $z0_z1_z2
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2 def $z0_z1_z2
; CHECK-NEXT:    st3q { z0.q - z2.q }, p0, [x0, #21, mul vl]
; CHECK-NEXT:    ret
  %base = getelementptr <vscale x 4 x float>, ptr %addr, i64 21
  call void @llvm.aarch64.sve.st3q.nxv4f32(<vscale x 4 x float> %v0,
                                           <vscale x 4 x float> %v1,
                                           <vscale x 4 x float> %v2,
                                           <vscale x 4 x i1> %pred,
                                           ptr %base)
  ret void
}

define void @st3q_si_f64(<vscale x 2 x double> %v0, <vscale x 2 x double> %v1, <vscale x 2 x double> %v2, <vscale x 2 x i1> %pred, ptr %addr) {
; CHECK-LABEL: st3q_si_f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2 def $z0_z1_z2
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2 def $z0_z1_z2
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2 def $z0_z1_z2
; CHECK-NEXT:    st3q { z0.q - z2.q }, p0, [x0, #21, mul vl]
; CHECK-NEXT:    ret
  %base = getelementptr <vscale x 2 x double>, ptr %addr, i64 21
  call void @llvm.aarch64.sve.st3q.nxv2f64(<vscale x 2 x double> %v0,
                                           <vscale x 2 x double> %v1,
                                           <vscale x 2 x double> %v2,
                                           <vscale x 2 x i1> %pred,
                                           ptr %base)
  ret void
}

define void @st3q_si_bf16(<vscale x 8 x bfloat> %v0, <vscale x 8 x bfloat> %v1, <vscale x 8 x bfloat> %v2, <vscale x 8 x i1> %pred, ptr %addr) {
; CHECK-LABEL: st3q_si_bf16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2 def $z0_z1_z2
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2 def $z0_z1_z2
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2 def $z0_z1_z2
; CHECK-NEXT:    st3q { z0.q - z2.q }, p0, [x0, #21, mul vl]
; CHECK-NEXT:    ret
  %base = getelementptr <vscale x 8 x bfloat>, ptr %addr, i64 21
  call void @llvm.aarch64.sve.st3q.nxv8bf16(<vscale x 8 x bfloat> %v0,
                                            <vscale x 8 x bfloat> %v1,
                                            <vscale x 8 x bfloat> %v2,
                                            <vscale x 8 x i1> %pred,
                                            ptr %base)
  ret void
}

;
; ST4Q
;
define void @st4q_ss_i8(<vscale x 16 x i8> %v0, <vscale x 16 x i8> %v1, <vscale x 16 x i8> %v2,<vscale x 16 x i8> %v3, <vscale x 16 x i1> %pred, ptr %addr, i64 %offset) {
; CHECK-LABEL: st4q_ss_i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    st4q { z0.q - z3.q }, p0, [x0, x1, lsl #4]
; CHECK-NEXT:    ret
  %1 = getelementptr i128, ptr %addr, i64 %offset
  call void @llvm.aarch64.sve.st4q.nxv16i8(<vscale x 16 x i8>%v0,
                                           <vscale x 16 x i8> %v1,
                                           <vscale x 16 x i8> %v2,
                                           <vscale x 16 x i8> %v3,
                                           <vscale x 16 x i1> %pred,
                                           ptr %1)
  ret void
}

define void @st4q_ss_i16(<vscale x 8 x i16> %v0, <vscale x 8 x i16> %v1, <vscale x 8 x i16> %v2, <vscale x 8 x i16> %v3, <vscale x 8 x i1> %pred, ptr %addr, i64 %offset) {
; CHECK-LABEL: st4q_ss_i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    st4q { z0.q - z3.q }, p0, [x0, x1, lsl #4]
; CHECK-NEXT:    ret
  %1 = getelementptr i128, ptr %addr, i64 %offset
  call void @llvm.aarch64.sve.st4q.nxv8i16(<vscale x 8 x i16> %v0,
                                           <vscale x 8 x i16> %v1,
                                           <vscale x 8 x i16> %v2,
                                           <vscale x 8 x i16> %v3,
                                           <vscale x 8 x i1> %pred,
                                           ptr %1)
  ret void
}

define void @st4q_ss_i32(<vscale x 4 x i32> %v0, <vscale x 4 x i32> %v1, <vscale x 4 x i32> %v2, <vscale x 4 x i32> %v3, <vscale x 4 x i1> %pred, ptr %addr, i64 %offset) {
; CHECK-LABEL: st4q_ss_i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    st4q { z0.q - z3.q }, p0, [x0, x1, lsl #4]
; CHECK-NEXT:    ret
  %1 = getelementptr i128, ptr %addr, i64 %offset
  call void @llvm.aarch64.sve.st4q.nxv4i32(<vscale x 4 x i32> %v0,
                                           <vscale x 4 x i32> %v1,
                                           <vscale x 4 x i32> %v2,
                                           <vscale x 4 x i32> %v3,
                                           <vscale x 4 x i1> %pred,
                                           ptr %1)
  ret void
}

define void @st4q_ss_i64(<vscale x 2 x i64> %v0, <vscale x 2 x i64> %v1, <vscale x 2 x i64> %v2, <vscale x 2 x i64> %v3, <vscale x 2 x i1> %pred, ptr %addr, i64 %offset) {
; CHECK-LABEL: st4q_ss_i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    st4q { z0.q - z3.q }, p0, [x0, x1, lsl #4]
; CHECK-NEXT:    ret
  %1 = getelementptr i128, ptr %addr, i64 %offset
  call void @llvm.aarch64.sve.st4q.nxv2i64(<vscale x 2 x i64> %v0,
                                           <vscale x 2 x i64> %v1,
                                           <vscale x 2 x i64> %v2,
                                           <vscale x 2 x i64> %v3,
                                           <vscale x 2 x i1> %pred,
                                           ptr %1)
  ret void
}

define void @st4q_ss_f16(<vscale x 8 x half> %v0, <vscale x 8 x half> %v1, <vscale x 8 x half> %v2, <vscale x 8 x half> %v3, <vscale x 8 x i1> %pred, ptr %addr, i64 %offset) {
; CHECK-LABEL: st4q_ss_f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    st4q { z0.q - z3.q }, p0, [x0, x1, lsl #4]
; CHECK-NEXT:    ret
  %1 = getelementptr i128, ptr %addr, i64 %offset
  call void @llvm.aarch64.sve.st4q.nxv8f16(<vscale x 8 x half> %v0,
                                           <vscale x 8 x half> %v1,
                                           <vscale x 8 x half> %v2,
                                           <vscale x 8 x half> %v3,
                                           <vscale x 8 x i1> %pred,
                                           ptr %1)
  ret void
}

define void @st4q_ss_f32(<vscale x 4 x float> %v0, <vscale x 4 x float> %v1, <vscale x 4 x float> %v2, <vscale x 4 x float> %v3, <vscale x 4 x i1> %pred, ptr %addr, i64 %offset) {
; CHECK-LABEL: st4q_ss_f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    st4q { z0.q - z3.q }, p0, [x0, x1, lsl #4]
; CHECK-NEXT:    ret
  %1 = getelementptr i128, ptr %addr, i64 %offset
  call void @llvm.aarch64.sve.st4q.nxv4f32(<vscale x 4 x float> %v0,
                                           <vscale x 4 x float> %v1,
                                           <vscale x 4 x float> %v2,
                                           <vscale x 4 x float> %v3,
                                           <vscale x 4 x i1> %pred,
                                           ptr %1)
  ret void
}

define void @st4q_ss_f64(<vscale x 2 x double> %v0, <vscale x 2 x double> %v1, <vscale x 2 x double> %v2, <vscale x 2 x double> %v3, <vscale x 2 x i1> %pred, ptr %addr, i64 %offset) {
; CHECK-LABEL: st4q_ss_f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    st4q { z0.q - z3.q }, p0, [x0, x1, lsl #4]
; CHECK-NEXT:    ret
  %1 = getelementptr i128, ptr %addr, i64 %offset
  call void @llvm.aarch64.sve.st4q.nxv2f64(<vscale x 2 x double> %v0,
                                           <vscale x 2 x double> %v1,
                                           <vscale x 2 x double> %v2,
                                           <vscale x 2 x double> %v3,
                                           <vscale x 2 x i1> %pred,
                                           ptr %1)
  ret void
}

define void @st4q_ss_bf16(<vscale x 8 x bfloat> %v0, <vscale x 8 x bfloat> %v1, <vscale x 8 x bfloat> %v2, <vscale x 8 x bfloat> %v3, <vscale x 8 x i1> %pred, ptr %addr, i64 %offset) {
; CHECK-LABEL: st4q_ss_bf16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    st4q { z0.q - z3.q }, p0, [x0, x1, lsl #4]
; CHECK-NEXT:    ret
  %1 = getelementptr i128, ptr %addr, i64 %offset
  call void @llvm.aarch64.sve.st4q.nxv8bf16(<vscale x 8 x bfloat> %v0,
                                            <vscale x 8 x bfloat> %v1,
                                            <vscale x 8 x bfloat> %v2,
                                            <vscale x 8 x bfloat> %v3,
                                            <vscale x 8 x i1> %pred,
                                            ptr %1)
  ret void
}

define void @st4q_si_i8_off32(<vscale x 16 x i8> %v0, <vscale x 16 x i8> %v1, <vscale x 16 x i8> %v2,<vscale x 16 x i8> %v3, <vscale x 16 x i1> %pred, ptr %addr) {
; CHECK-LABEL: st4q_si_i8_off32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    st4q { z0.q - z3.q }, p0, [x0, #-32, mul vl]
; CHECK-NEXT:    ret
  %base = getelementptr <vscale x 16 x i8>, ptr %addr, i64 -32
  call void @llvm.aarch64.sve.st4q.nxv16i8(<vscale x 16 x i8> %v0,
                                           <vscale x 16 x i8> %v1,
                                           <vscale x 16 x i8> %v2,
                                           <vscale x 16 x i8> %v3,
                                           <vscale x 16 x i1> %pred,
                                           ptr %base)
  ret void
}

define void @st4q_si_i8_off28(<vscale x 16 x i8> %v0, <vscale x 16 x i8> %v1, <vscale x 16 x i8> %v2,<vscale x 16 x i8> %v3, <vscale x 16 x i1> %pred, ptr %addr) {
; CHECK-LABEL: st4q_si_i8_off28:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    st4q { z0.q - z3.q }, p0, [x0, #28, mul vl]
; CHECK-NEXT:    ret
  %base = getelementptr <vscale x 16 x i8>, ptr %addr, i64 28
  call void @llvm.aarch64.sve.st4q.nxv16i8(<vscale x 16 x i8> %v0,
                                           <vscale x 16 x i8> %v1,
                                           <vscale x 16 x i8> %v2,
                                           <vscale x 16 x i8> %v3,
                                           <vscale x 16 x i1> %pred,
                                           ptr %base)
  ret void
}

define void @st4q_si_i16(<vscale x 8 x i16> %v0, <vscale x 8 x i16> %v1, <vscale x 8 x i16> %v2, <vscale x 8 x i16> %v3,  <vscale x 8 x i1> %pred, ptr %addr) {
; CHECK-LABEL: st4q_si_i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    st4q { z0.q - z3.q }, p0, [x0, #28, mul vl]
; CHECK-NEXT:    ret
  %base = getelementptr <vscale x 8 x i16>, ptr %addr, i64 28
  call void @llvm.aarch64.sve.st4q.nxv8i16(<vscale x 8 x i16> %v0,
                                           <vscale x 8 x i16> %v1,
                                           <vscale x 8 x i16> %v2,
                                           <vscale x 8 x i16> %v3,
                                           <vscale x 8 x i1> %pred,
                                           ptr %base)
  ret void
}

define void @st4q_si_i32(<vscale x 4 x i32> %v0, <vscale x 4 x i32> %v1, <vscale x 4 x i32> %v2, <vscale x 4 x i32> %v3, <vscale x 4 x i1> %pred, ptr %addr) {
; CHECK-LABEL: st4q_si_i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    st4q { z0.q - z3.q }, p0, [x0, #28, mul vl]
; CHECK-NEXT:    ret
  %base1 = getelementptr <vscale x 4 x i32>, ptr %addr, i64 28
  call void @llvm.aarch64.sve.st4q.nxv4i32(<vscale x 4 x i32> %v0,
                                           <vscale x 4 x i32> %v1,
                                           <vscale x 4 x i32> %v2,
                                           <vscale x 4 x i32> %v3,
                                           <vscale x 4 x i1> %pred,
                                           ptr %base1)
  ret void
}

define void @st4q_si_i64(<vscale x 2 x i64> %v0, <vscale x 2 x i64> %v1, <vscale x 2 x i64> %v2, <vscale x 2 x i64> %v3, <vscale x 2 x i1> %pred, ptr %addr) {
; CHECK-LABEL: st4q_si_i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    st4q { z0.q - z3.q }, p0, [x0, #28, mul vl]
; CHECK-NEXT:    ret
  %base = getelementptr <vscale x 2 x i64>, ptr %addr, i64 28
  call void @llvm.aarch64.sve.st4q.nxv2i64(<vscale x 2 x i64> %v0,
                                           <vscale x 2 x i64> %v1,
                                           <vscale x 2 x i64> %v2,
                                           <vscale x 2 x i64> %v3,
                                           <vscale x 2 x i1> %pred,
                                           ptr %base)
  ret void
}

define void @st4q_si_f16(<vscale x 8 x half> %v0, <vscale x 8 x half> %v1, <vscale x 8 x half> %v2, <vscale x 8 x half> %v3, <vscale x 8 x i1> %pred, ptr %addr) {
; CHECK-LABEL: st4q_si_f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    st4q { z0.q - z3.q }, p0, [x0, #28, mul vl]
; CHECK-NEXT:    ret
  %base = getelementptr <vscale x 8 x half>, ptr %addr, i64 28
  call void @llvm.aarch64.sve.st4q.nxv8f16(<vscale x 8 x half> %v0,
                                           <vscale x 8 x half> %v1,
                                           <vscale x 8 x half> %v2,
                                           <vscale x 8 x half> %v3,
                                           <vscale x 8 x i1> %pred,
                                           ptr %base)
  ret void
}

define void @st4q_si_f32(<vscale x 4 x float> %v0, <vscale x 4 x float> %v1, <vscale x 4 x float> %v2,<vscale x 4 x float> %v3,  <vscale x 4 x i1> %pred, ptr %addr) {
; CHECK-LABEL: st4q_si_f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    st4q { z0.q - z3.q }, p0, [x0, #28, mul vl]
; CHECK-NEXT:    ret
  %base = getelementptr <vscale x 4 x float>, ptr %addr, i64 28
  call void @llvm.aarch64.sve.st4q.nxv4f32(<vscale x 4 x float> %v0,
                                           <vscale x 4 x float> %v1,
                                           <vscale x 4 x float> %v2,
                                           <vscale x 4 x float> %v3,
                                           <vscale x 4 x i1> %pred,
                                           ptr %base)
  ret void
}

define void @st4q_si_f64(<vscale x 2 x double> %v0, <vscale x 2 x double> %v1, <vscale x 2 x double> %v2, <vscale x 2 x double> %v3, <vscale x 2 x i1> %pred, ptr %addr) {
; CHECK-LABEL: st4q_si_f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    st4q { z0.q - z3.q }, p0, [x0, #28, mul vl]
; CHECK-NEXT:    ret
  %base = getelementptr <vscale x 2 x double>, ptr %addr, i64 28
  call void @llvm.aarch64.sve.st4q.nxv2f64(<vscale x 2 x double> %v0,
                                           <vscale x 2 x double> %v1,
                                           <vscale x 2 x double> %v2,
                                           <vscale x 2 x double> %v3,
                                           <vscale x 2 x i1> %pred,
                                           ptr %base)
  ret void
}

define void @st4q_si_bf16(<vscale x 8 x bfloat> %v0, <vscale x 8 x bfloat> %v1, <vscale x 8 x bfloat> %v2, <vscale x 8 x bfloat> %v3, <vscale x 8 x i1> %pred, ptr %addr) {
; CHECK-LABEL: st4q_si_bf16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    st4q { z0.q - z3.q }, p0, [x0, #28, mul vl]
; CHECK-NEXT:    ret
  %base = getelementptr <vscale x 8 x bfloat>, ptr %addr, i64 28
  call void @llvm.aarch64.sve.st4q.nxv8bf16(<vscale x 8 x bfloat> %v0,
                                            <vscale x 8 x bfloat> %v1,
                                            <vscale x 8 x bfloat> %v2,
                                            <vscale x 8 x bfloat> %v3,
                                            <vscale x 8 x i1> %pred,
                                            ptr %base)
  ret void
}


declare void @llvm.aarch64.sve.st2q.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i1>, ptr)
declare void @llvm.aarch64.sve.st2q.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i1>, ptr)
declare void @llvm.aarch64.sve.st2q.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i1>, ptr)
declare void @llvm.aarch64.sve.st2q.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i1>, ptr)

declare void @llvm.aarch64.sve.st2q.nxv8f16(<vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x i1>, ptr)
declare void @llvm.aarch64.sve.st2q.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x i1>, ptr)
declare void @llvm.aarch64.sve.st2q.nxv2f64(<vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x i1>, ptr)
declare void @llvm.aarch64.sve.st2q.nxv8bf16(<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x i1>, ptr)

declare void @llvm.aarch64.sve.st3q.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>,<vscale x 16 x i8>, <vscale x 16 x i1>, ptr)
declare void @llvm.aarch64.sve.st3q.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i1>, ptr)
declare void @llvm.aarch64.sve.st3q.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i1>, ptr)
declare void @llvm.aarch64.sve.st3q.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i1>, ptr)

declare void @llvm.aarch64.sve.st3q.nxv8f16(<vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x i1>, ptr)
declare void @llvm.aarch64.sve.st3q.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x i1>, ptr)
declare void @llvm.aarch64.sve.st3q.nxv2f64(<vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x i1>, ptr)
declare void @llvm.aarch64.sve.st3q.nxv8bf16(<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x i1>, ptr)

declare void @llvm.aarch64.sve.st4q.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>,<vscale x 16 x i1>, ptr)
declare void @llvm.aarch64.sve.st4q.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i1>, ptr)
declare void @llvm.aarch64.sve.st4q.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>,<vscale x 4 x i1>, ptr)
declare void @llvm.aarch64.sve.st4q.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i1>, ptr)

declare void @llvm.aarch64.sve.st4q.nxv8f16(<vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x i1>, ptr)
declare void @llvm.aarch64.sve.st4q.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x i1>, ptr)
declare void @llvm.aarch64.sve.st4q.nxv2f64(<vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x i1>, ptr)
declare void @llvm.aarch64.sve.st4q.nxv8bf16(<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x i1>, ptr)