; llvm/llvm/test/CodeGen/AArch64/sme2p1-intrinsics-movaz.ll

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc -verify-machineinstrs -force-streaming < %s | FileCheck %s

target triple = "aarch64-linux"

;MOVAZ (tile to vector, Multi)


;;
; X2 - Horiz
;;

; Horizontal x2 readz, i8 elements: reads tile 0 at %slice, then tile 0 again
; at %slice + 14; the expected assembly folds the +14 into the 14:15 slice
; range. The second read is the value returned. %tile is unused.
define {<vscale x 16 x i8>, <vscale x 16 x i8>} @test_readz_hor_z8_i8_x2(i32 %tile, i32 %slice) #0 {
; CHECK-LABEL: test_readz_hor_z8_i8_x2:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w1
; CHECK-NEXT:    movaz { z0.b, z1.b }, za0h.b[w12, 0:1]
; CHECK-NEXT:    movaz { z0.b, z1.b }, za0h.b[w12, 14:15]
; CHECK-NEXT:    ret
  %first = call {<vscale x 16 x i8>, <vscale x 16 x i8>} @llvm.aarch64.sme.readz.horiz.x2.nxv16i8(i32 0, i32 %slice)
  %slice.hi = add i32 %slice, 14
  %second = call {<vscale x 16 x i8>, <vscale x 16 x i8>} @llvm.aarch64.sme.readz.horiz.x2.nxv16i8(i32 0, i32 %slice.hi)
  ret {<vscale x 16 x i8>, <vscale x 16 x i8>} %second
}
; Horizontal x2 readz, i16 elements: reads tile 0 at %slice, then tile 1 at
; %slice + 6; the expected assembly folds the +6 into the 6:7 slice range.
; The second read is the value returned.
define {<vscale x 8 x i16>, <vscale x 8 x i16>} @test_readz_hor_z16_i16_x2(i32 %slice) #0 {
; CHECK-LABEL: test_readz_hor_z16_i16_x2:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    movaz { z0.h, z1.h }, za0h.h[w12, 0:1]
; CHECK-NEXT:    movaz { z0.h, z1.h }, za1h.h[w12, 6:7]
; CHECK-NEXT:    ret
  %first = call {<vscale x 8 x i16>, <vscale x 8 x i16>} @llvm.aarch64.sme.readz.horiz.x2.nxv8i16(i32 0, i32 %slice)
  %slice.hi = add i32 %slice, 6
  %second = call {<vscale x 8 x i16>, <vscale x 8 x i16>} @llvm.aarch64.sme.readz.horiz.x2.nxv8i16(i32 1, i32 %slice.hi)
  ret {<vscale x 8 x i16>, <vscale x 8 x i16>} %second
}

; Horizontal x2 readz, i32 elements: reads tile 0 at %slice, then tile 3 at
; %slice + 2; the expected assembly folds the +2 into the 2:3 slice range.
; The second read is the value returned.
define {<vscale x 4 x i32>, <vscale x 4 x i32>} @test_readz_hor_z32_i32_x2(i32 %slice) #0 {
; CHECK-LABEL: test_readz_hor_z32_i32_x2:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    movaz { z0.s, z1.s }, za0h.s[w12, 0:1]
; CHECK-NEXT:    movaz { z0.s, z1.s }, za3h.s[w12, 2:3]
; CHECK-NEXT:    ret
  %first = call {<vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.aarch64.sme.readz.horiz.x2.nxv4i32(i32 0, i32 %slice)
  %slice.hi = add i32 %slice, 2
  %second = call {<vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.aarch64.sme.readz.horiz.x2.nxv4i32(i32 3, i32 %slice.hi)
  ret {<vscale x 4 x i32>, <vscale x 4 x i32>} %second
}

; Horizontal x2 readz, i64 elements: reads tile 0 and then tile 7 at the same
; %slice. The first read is the value returned, and the expected assembly
; places the second read in z2/z3.
define {<vscale x 2 x i64>, <vscale x 2 x i64>} @test_readz_hor_z64_i64_x2(i32 %slice) #0 {
; CHECK-LABEL: test_readz_hor_z64_i64_x2:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    movaz { z0.d, z1.d }, za0h.d[w12, 0:1]
; CHECK-NEXT:    movaz { z2.d, z3.d }, za7h.d[w12, 0:1]
; CHECK-NEXT:    ret
  %first = call {<vscale x 2 x i64>, <vscale x 2 x i64>} @llvm.aarch64.sme.readz.horiz.x2.nxv2i64(i32 0, i32 %slice)
  %second = call {<vscale x 2 x i64>, <vscale x 2 x i64>} @llvm.aarch64.sme.readz.horiz.x2.nxv2i64(i32 7, i32 %slice)
  ret {<vscale x 2 x i64>, <vscale x 2 x i64>} %first
}

; Horizontal x2 readz, bfloat elements: reads tile 0 at %slice, then tile 1 at
; %slice + 6; the expected assembly folds the +6 into the 6:7 slice range.
; The second read is the value returned.
define {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>} @test_readz_hor_z16_bf16_x2(i32 %slice) #0 {
; CHECK-LABEL: test_readz_hor_z16_bf16_x2:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    movaz { z0.h, z1.h }, za0h.h[w12, 0:1]
; CHECK-NEXT:    movaz { z0.h, z1.h }, za1h.h[w12, 6:7]
; CHECK-NEXT:    ret
  %first = call {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>} @llvm.aarch64.sme.readz.horiz.x2.nxv8bf16(i32 0, i32 %slice)
  %slice.hi = add i32 %slice, 6
  %second = call {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>} @llvm.aarch64.sme.readz.horiz.x2.nxv8bf16(i32 1, i32 %slice.hi)
  ret {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>} %second
}

; Horizontal x2 readz, half elements: reads tile 0 at %slice, then tile 1 at
; %slice + 6; the expected assembly folds the +6 into the 6:7 slice range.
; The second read is the value returned.
define {<vscale x 8 x half>, <vscale x 8 x half>} @test_readz_hor_z16_f16_x2(i32 %slice) #0 {
; CHECK-LABEL: test_readz_hor_z16_f16_x2:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    movaz { z0.h, z1.h }, za0h.h[w12, 0:1]
; CHECK-NEXT:    movaz { z0.h, z1.h }, za1h.h[w12, 6:7]
; CHECK-NEXT:    ret
  %first = call {<vscale x 8 x half>, <vscale x 8 x half>} @llvm.aarch64.sme.readz.horiz.x2.nxv8f16(i32 0, i32 %slice)
  %slice.hi = add i32 %slice, 6
  %second = call {<vscale x 8 x half>, <vscale x 8 x half>} @llvm.aarch64.sme.readz.horiz.x2.nxv8f16(i32 1, i32 %slice.hi)
  ret {<vscale x 8 x half>, <vscale x 8 x half>} %second
}

; Horizontal x2 readz, float elements: reads tile 0 at %slice, then tile 3 at
; %slice + 2; the expected assembly folds the +2 into the 2:3 slice range.
; The second read is the value returned.
define {<vscale x 4 x float>, <vscale x 4 x float>} @test_readz_hor_z32_f32_x2(i32 %slice) #0 {
; CHECK-LABEL: test_readz_hor_z32_f32_x2:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    movaz { z0.s, z1.s }, za0h.s[w12, 0:1]
; CHECK-NEXT:    movaz { z0.s, z1.s }, za3h.s[w12, 2:3]
; CHECK-NEXT:    ret
  %first = call {<vscale x 4 x float>, <vscale x 4 x float>} @llvm.aarch64.sme.readz.horiz.x2.nxv4f32(i32 0, i32 %slice)
  %slice.hi = add i32 %slice, 2
  %second = call {<vscale x 4 x float>, <vscale x 4 x float>} @llvm.aarch64.sme.readz.horiz.x2.nxv4f32(i32 3, i32 %slice.hi)
  ret {<vscale x 4 x float>, <vscale x 4 x float>} %second
}

; Horizontal x2 readz, double elements: reads tile 0 and then tile 7 at the
; same %slice. The first read is the value returned, and the expected assembly
; places the second read in z2/z3.
define {<vscale x 2 x double>, <vscale x 2 x double>} @test_readz_hor_z64_f64_x2(i32 %slice) #0 {
; CHECK-LABEL: test_readz_hor_z64_f64_x2:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    movaz { z0.d, z1.d }, za0h.d[w12, 0:1]
; CHECK-NEXT:    movaz { z2.d, z3.d }, za7h.d[w12, 0:1]
; CHECK-NEXT:    ret
  %first = call {<vscale x 2 x double>, <vscale x 2 x double>} @llvm.aarch64.sme.readz.horiz.x2.nxv2f64(i32 0, i32 %slice)
  %second = call {<vscale x 2 x double>, <vscale x 2 x double>} @llvm.aarch64.sme.readz.horiz.x2.nxv2f64(i32 7, i32 %slice)
  ret {<vscale x 2 x double>, <vscale x 2 x double>} %first
}

;;
; X2 - Vert
;;

; Vertical x2 readz, i8 elements: reads tile 0 at %slice, then tile 0 again at
; %slice + 14; the expected assembly folds the +14 into the 14:15 slice range.
; The second read is the value returned. %tile is unused.
define {<vscale x 16 x i8>, <vscale x 16 x i8>} @test_readz_ver_z8_i8_x2(i32 %tile, i32 %slice) #0 {
; CHECK-LABEL: test_readz_ver_z8_i8_x2:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w1
; CHECK-NEXT:    movaz { z0.b, z1.b }, za0v.b[w12, 0:1]
; CHECK-NEXT:    movaz { z0.b, z1.b }, za0v.b[w12, 14:15]
; CHECK-NEXT:    ret
  %first = call {<vscale x 16 x i8>, <vscale x 16 x i8>} @llvm.aarch64.sme.readz.vert.x2.nxv16i8(i32 0, i32 %slice)
  %slice.hi = add i32 %slice, 14
  %second = call {<vscale x 16 x i8>, <vscale x 16 x i8>} @llvm.aarch64.sme.readz.vert.x2.nxv16i8(i32 0, i32 %slice.hi)
  ret {<vscale x 16 x i8>, <vscale x 16 x i8>} %second
}
; Vertical x2 readz, i16 elements: reads tile 0 at %slice, then tile 1 at
; %slice + 6; the expected assembly folds the +6 into the 6:7 slice range.
; The second read is the value returned.
define {<vscale x 8 x i16>, <vscale x 8 x i16>} @test_readz_ver_z16_i16_x2(i32 %slice) #0 {
; CHECK-LABEL: test_readz_ver_z16_i16_x2:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    movaz { z0.h, z1.h }, za0v.h[w12, 0:1]
; CHECK-NEXT:    movaz { z0.h, z1.h }, za1v.h[w12, 6:7]
; CHECK-NEXT:    ret
  %first = call {<vscale x 8 x i16>, <vscale x 8 x i16>} @llvm.aarch64.sme.readz.vert.x2.nxv8i16(i32 0, i32 %slice)
  %slice.hi = add i32 %slice, 6
  %second = call {<vscale x 8 x i16>, <vscale x 8 x i16>} @llvm.aarch64.sme.readz.vert.x2.nxv8i16(i32 1, i32 %slice.hi)
  ret {<vscale x 8 x i16>, <vscale x 8 x i16>} %second
}

; Vertical x2 readz, i32 elements: reads tile 0 at %slice, then tile 3 at
; %slice + 2; the expected assembly folds the +2 into the 2:3 slice range.
; The second read is the value returned.
define {<vscale x 4 x i32>, <vscale x 4 x i32>} @test_readz_ver_z32_i32_x2(i32 %slice) #0 {
; CHECK-LABEL: test_readz_ver_z32_i32_x2:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    movaz { z0.s, z1.s }, za0v.s[w12, 0:1]
; CHECK-NEXT:    movaz { z0.s, z1.s }, za3v.s[w12, 2:3]
; CHECK-NEXT:    ret
  %first = call {<vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.aarch64.sme.readz.vert.x2.nxv4i32(i32 0, i32 %slice)
  %slice.hi = add i32 %slice, 2
  %second = call {<vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.aarch64.sme.readz.vert.x2.nxv4i32(i32 3, i32 %slice.hi)
  ret {<vscale x 4 x i32>, <vscale x 4 x i32>} %second
}

; Vertical x2 readz, i64 elements: reads tile 0 and then tile 7 at the same
; %slice. The first read is the value returned, and the expected assembly
; places the second read in z2/z3.
define {<vscale x 2 x i64>, <vscale x 2 x i64>} @test_readz_ver_z64_i64_x2(i32 %slice) #0 {
; CHECK-LABEL: test_readz_ver_z64_i64_x2:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    movaz { z0.d, z1.d }, za0v.d[w12, 0:1]
; CHECK-NEXT:    movaz { z2.d, z3.d }, za7v.d[w12, 0:1]
; CHECK-NEXT:    ret
  %first = call {<vscale x 2 x i64>, <vscale x 2 x i64>} @llvm.aarch64.sme.readz.vert.x2.nxv2i64(i32 0, i32 %slice)
  %second = call {<vscale x 2 x i64>, <vscale x 2 x i64>} @llvm.aarch64.sme.readz.vert.x2.nxv2i64(i32 7, i32 %slice)
  ret {<vscale x 2 x i64>, <vscale x 2 x i64>} %first
}

; Vertical x2 readz, bfloat elements: reads tile 0 at %slice, then tile 1 at
; %slice + 6; the expected assembly folds the +6 into the 6:7 slice range.
; The second read is the value returned.
define {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>} @test_readz_ver_z16_bf16_x2(i32 %slice) #0 {
; CHECK-LABEL: test_readz_ver_z16_bf16_x2:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    movaz { z0.h, z1.h }, za0v.h[w12, 0:1]
; CHECK-NEXT:    movaz { z0.h, z1.h }, za1v.h[w12, 6:7]
; CHECK-NEXT:    ret
  %first = call {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>} @llvm.aarch64.sme.readz.vert.x2.nxv8bf16(i32 0, i32 %slice)
  %slice.hi = add i32 %slice, 6
  %second = call {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>} @llvm.aarch64.sme.readz.vert.x2.nxv8bf16(i32 1, i32 %slice.hi)
  ret {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>} %second
}

; Vertical x2 readz, half elements: reads tile 0 at %slice, then tile 1 at
; %slice + 6; the expected assembly folds the +6 into the 6:7 slice range.
; The second read is the value returned.
define {<vscale x 8 x half>, <vscale x 8 x half>} @test_readz_ver_z16_f16_x2(i32 %slice) #0 {
; CHECK-LABEL: test_readz_ver_z16_f16_x2:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    movaz { z0.h, z1.h }, za0v.h[w12, 0:1]
; CHECK-NEXT:    movaz { z0.h, z1.h }, za1v.h[w12, 6:7]
; CHECK-NEXT:    ret
  %first = call {<vscale x 8 x half>, <vscale x 8 x half>} @llvm.aarch64.sme.readz.vert.x2.nxv8f16(i32 0, i32 %slice)
  %slice.hi = add i32 %slice, 6
  %second = call {<vscale x 8 x half>, <vscale x 8 x half>} @llvm.aarch64.sme.readz.vert.x2.nxv8f16(i32 1, i32 %slice.hi)
  ret {<vscale x 8 x half>, <vscale x 8 x half>} %second
}

; Vertical x2 readz, float elements: reads tile 0 at %slice, then tile 3 at
; %slice + 2; the expected assembly folds the +2 into the 2:3 slice range.
; The second read is the value returned.
define {<vscale x 4 x float>, <vscale x 4 x float>} @test_readz_ver_z32_f32_x2(i32 %slice) #0 {
; CHECK-LABEL: test_readz_ver_z32_f32_x2:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    movaz { z0.s, z1.s }, za0v.s[w12, 0:1]
; CHECK-NEXT:    movaz { z0.s, z1.s }, za3v.s[w12, 2:3]
; CHECK-NEXT:    ret
  %first = call {<vscale x 4 x float>, <vscale x 4 x float>} @llvm.aarch64.sme.readz.vert.x2.nxv4f32(i32 0, i32 %slice)
  %slice.hi = add i32 %slice, 2
  %second = call {<vscale x 4 x float>, <vscale x 4 x float>} @llvm.aarch64.sme.readz.vert.x2.nxv4f32(i32 3, i32 %slice.hi)
  ret {<vscale x 4 x float>, <vscale x 4 x float>} %second
}

; Vertical x2 readz, double elements: reads tile 0 and then tile 7 at the same
; %slice. The first read is the value returned, and the expected assembly
; places the second read in z2/z3.
define {<vscale x 2 x double>, <vscale x 2 x double>} @test_readz_ver_z64_f64_x2(i32 %slice) #0 {
; CHECK-LABEL: test_readz_ver_z64_f64_x2:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    movaz { z0.d, z1.d }, za0v.d[w12, 0:1]
; CHECK-NEXT:    movaz { z2.d, z3.d }, za7v.d[w12, 0:1]
; CHECK-NEXT:    ret
  %first = call {<vscale x 2 x double>, <vscale x 2 x double>} @llvm.aarch64.sme.readz.vert.x2.nxv2f64(i32 0, i32 %slice)
  %second = call {<vscale x 2 x double>, <vscale x 2 x double>} @llvm.aarch64.sme.readz.vert.x2.nxv2f64(i32 7, i32 %slice)
  ret {<vscale x 2 x double>, <vscale x 2 x double>} %first
}

;;
; X4 - Horiz
;;

; Horizontal x4 readz, i8 elements: reads tile 0 at %slice, then tile 0 again
; at %slice + 12; the expected assembly folds the +12 into the 12:15 slice
; range. The second read is the value returned. %tile is unused.
define {<vscale x 16 x i8>, <vscale x 16 x i8>,<vscale x 16 x i8>, <vscale x 16 x i8>} @test_readz_hor_z8_i8_x4(i32 %tile, i32 %slice) #0 {
; CHECK-LABEL: test_readz_hor_z8_i8_x4:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w1
; CHECK-NEXT:    movaz { z0.b - z3.b }, za0h.b[w12, 0:3]
; CHECK-NEXT:    movaz { z0.b - z3.b }, za0h.b[w12, 12:15]
; CHECK-NEXT:    ret
  %first = call {<vscale x 16 x i8>, <vscale x 16 x i8>,<vscale x 16 x i8>, <vscale x 16 x i8>} @llvm.aarch64.sme.readz.horiz.x4.nxv16i8(i32 0, i32 %slice)
  %slice.hi = add i32 %slice, 12
  %second = call {<vscale x 16 x i8>, <vscale x 16 x i8>,<vscale x 16 x i8>, <vscale x 16 x i8>} @llvm.aarch64.sme.readz.horiz.x4.nxv16i8(i32 0, i32 %slice.hi)
  ret {<vscale x 16 x i8>, <vscale x 16 x i8>,<vscale x 16 x i8>, <vscale x 16 x i8>} %second
}
; Horizontal x4 readz, i16 elements: reads tile 0 at %slice, then tile 1 at
; %slice + 4; the expected assembly folds the +4 into the 4:7 slice range.
; The second read is the value returned.
define {<vscale x 8 x i16>, <vscale x 8 x i16>,<vscale x 8 x i16>, <vscale x 8 x i16>} @test_readz_hor_z16_i16_x4(i32 %slice) #0 {
; CHECK-LABEL: test_readz_hor_z16_i16_x4:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    movaz { z0.h - z3.h }, za0h.h[w12, 0:3]
; CHECK-NEXT:    movaz { z0.h - z3.h }, za1h.h[w12, 4:7]
; CHECK-NEXT:    ret
  %first = call {<vscale x 8 x i16>, <vscale x 8 x i16>,<vscale x 8 x i16>, <vscale x 8 x i16>} @llvm.aarch64.sme.readz.horiz.x4.nxv8i16(i32 0, i32 %slice)
  %slice.hi = add i32 %slice, 4
  %second = call {<vscale x 8 x i16>, <vscale x 8 x i16>,<vscale x 8 x i16>, <vscale x 8 x i16>} @llvm.aarch64.sme.readz.horiz.x4.nxv8i16(i32 1, i32 %slice.hi)
  ret {<vscale x 8 x i16>, <vscale x 8 x i16>,<vscale x 8 x i16>, <vscale x 8 x i16>} %second
}

; Horizontal x4 readz, i32 elements: reads tile 0 and then tile 3 at the same
; %slice. The second read is the value returned.
define {<vscale x 4 x i32>, <vscale x 4 x i32>,<vscale x 4 x i32>, <vscale x 4 x i32>} @test_readz_hor_z32_i32_x4(i32 %slice) #0 {
; CHECK-LABEL: test_readz_hor_z32_i32_x4:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    movaz { z0.s - z3.s }, za0h.s[w12, 0:3]
; CHECK-NEXT:    movaz { z0.s - z3.s }, za3h.s[w12, 0:3]
; CHECK-NEXT:    ret
  %first = call {<vscale x 4 x i32>, <vscale x 4 x i32>,<vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.aarch64.sme.readz.horiz.x4.nxv4i32(i32 0, i32 %slice)
  %second = call {<vscale x 4 x i32>, <vscale x 4 x i32>,<vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.aarch64.sme.readz.horiz.x4.nxv4i32(i32 3, i32 %slice)
  ret {<vscale x 4 x i32>, <vscale x 4 x i32>,<vscale x 4 x i32>, <vscale x 4 x i32>} %second
}

; Horizontal x4 readz, i64 elements: reads tile 0 and then tile 7 at the same
; %slice. The first read is the value returned, and the expected assembly
; places the second read in z4-z7.
define {<vscale x 2 x i64>, <vscale x 2 x i64>,<vscale x 2 x i64>, <vscale x 2 x i64>} @test_readz_hor_z64_i64_x4(i32 %slice) #0 {
; CHECK-LABEL: test_readz_hor_z64_i64_x4:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    movaz { z0.d - z3.d }, za0h.d[w12, 0:3]
; CHECK-NEXT:    movaz { z4.d - z7.d }, za7h.d[w12, 0:3]
; CHECK-NEXT:    ret
  %first = call {<vscale x 2 x i64>, <vscale x 2 x i64>,<vscale x 2 x i64>, <vscale x 2 x i64>} @llvm.aarch64.sme.readz.horiz.x4.nxv2i64(i32 0, i32 %slice)
  %second = call {<vscale x 2 x i64>, <vscale x 2 x i64>,<vscale x 2 x i64>, <vscale x 2 x i64>} @llvm.aarch64.sme.readz.horiz.x4.nxv2i64(i32 7, i32 %slice)
  ret {<vscale x 2 x i64>, <vscale x 2 x i64>,<vscale x 2 x i64>, <vscale x 2 x i64>} %first
}

; Horizontal x4 readz, bfloat elements: reads tile 0 at %slice, then tile 1 at
; %slice + 4; the expected assembly folds the +4 into the 4:7 slice range.
; The second read is the value returned.
define {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>} @test_readz_hor_z16_bf16_x4(i32 %slice) #0 {
; CHECK-LABEL: test_readz_hor_z16_bf16_x4:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    movaz { z0.h - z3.h }, za0h.h[w12, 0:3]
; CHECK-NEXT:    movaz { z0.h - z3.h }, za1h.h[w12, 4:7]
; CHECK-NEXT:    ret
  %first = call {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>} @llvm.aarch64.sme.readz.horiz.x4.nxv8bf16(i32 0, i32 %slice)
  %slice.hi = add i32 %slice, 4
  %second = call {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>} @llvm.aarch64.sme.readz.horiz.x4.nxv8bf16(i32 1, i32 %slice.hi)
  ret {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>} %second
}

; Horizontal x4 readz, half elements: reads tile 0 at %slice, then tile 1 at
; %slice + 4; the expected assembly folds the +4 into the 4:7 slice range.
; The second read is the value returned.
define {<vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>} @test_readz_hor_z16_f16_x4(i32 %slice) #0 {
; CHECK-LABEL: test_readz_hor_z16_f16_x4:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    movaz { z0.h - z3.h }, za0h.h[w12, 0:3]
; CHECK-NEXT:    movaz { z0.h - z3.h }, za1h.h[w12, 4:7]
; CHECK-NEXT:    ret
  %first = call {<vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>} @llvm.aarch64.sme.readz.horiz.x4.nxv8f16(i32 0, i32 %slice)
  %slice.hi = add i32 %slice, 4
  %second = call {<vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>} @llvm.aarch64.sme.readz.horiz.x4.nxv8f16(i32 1, i32 %slice.hi)
  ret {<vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>} %second
}

; Horizontal x4 readz, float elements: reads tile 0 and then tile 3 at the
; same %slice. The second read is the value returned.
define {<vscale x 4 x float>, <vscale x 4 x float>,<vscale x 4 x float>, <vscale x 4 x float>} @test_readz_hor_z32_f32_x4(i32 %slice) #0 {
; CHECK-LABEL: test_readz_hor_z32_f32_x4:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    movaz { z0.s - z3.s }, za0h.s[w12, 0:3]
; CHECK-NEXT:    movaz { z0.s - z3.s }, za3h.s[w12, 0:3]
; CHECK-NEXT:    ret
  %first = call {<vscale x 4 x float>, <vscale x 4 x float>,<vscale x 4 x float>, <vscale x 4 x float>} @llvm.aarch64.sme.readz.horiz.x4.nxv4f32(i32 0, i32 %slice)
  %second = call {<vscale x 4 x float>, <vscale x 4 x float>,<vscale x 4 x float>, <vscale x 4 x float>} @llvm.aarch64.sme.readz.horiz.x4.nxv4f32(i32 3, i32 %slice)
  ret {<vscale x 4 x float>, <vscale x 4 x float>,<vscale x 4 x float>, <vscale x 4 x float>} %second
}

; Horizontal x4 readz, double elements: reads tile 0 and then tile 7 at the
; same %slice. The first read is the value returned, and the expected assembly
; places the second read in z4-z7.
define {<vscale x 2 x double>, <vscale x 2 x double>,<vscale x 2 x double>, <vscale x 2 x double>} @test_readz_hor_z64_f64_x4(i32 %slice) #0 {
; CHECK-LABEL: test_readz_hor_z64_f64_x4:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    movaz { z0.d - z3.d }, za0h.d[w12, 0:3]
; CHECK-NEXT:    movaz { z4.d - z7.d }, za7h.d[w12, 0:3]
; CHECK-NEXT:    ret
  %first = call {<vscale x 2 x double>, <vscale x 2 x double>,<vscale x 2 x double>, <vscale x 2 x double>} @llvm.aarch64.sme.readz.horiz.x4.nxv2f64(i32 0, i32 %slice)
  %second = call {<vscale x 2 x double>, <vscale x 2 x double>,<vscale x 2 x double>, <vscale x 2 x double>} @llvm.aarch64.sme.readz.horiz.x4.nxv2f64(i32 7, i32 %slice)
  ret {<vscale x 2 x double>, <vscale x 2 x double>,<vscale x 2 x double>, <vscale x 2 x double>} %first
}

;;
; X4 - Vert
;;

; Vertical x4 readz, i8 elements: reads tile 0 at %slice, then tile 0 again at
; %slice + 12; the expected assembly folds the +12 into the 12:15 slice range.
; The second read is the value returned. %tile is unused.
define {<vscale x 16 x i8>, <vscale x 16 x i8>,<vscale x 16 x i8>, <vscale x 16 x i8>} @test_readz_ver_z8_i8_x4(i32 %tile, i32 %slice) #0 {
; CHECK-LABEL: test_readz_ver_z8_i8_x4:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w1
; CHECK-NEXT:    movaz { z0.b - z3.b }, za0v.b[w12, 0:3]
; CHECK-NEXT:    movaz { z0.b - z3.b }, za0v.b[w12, 12:15]
; CHECK-NEXT:    ret
  %first = call {<vscale x 16 x i8>, <vscale x 16 x i8>,<vscale x 16 x i8>, <vscale x 16 x i8>} @llvm.aarch64.sme.readz.vert.x4.nxv16i8(i32 0, i32 %slice)
  %slice.hi = add i32 %slice, 12
  %second = call {<vscale x 16 x i8>, <vscale x 16 x i8>,<vscale x 16 x i8>, <vscale x 16 x i8>} @llvm.aarch64.sme.readz.vert.x4.nxv16i8(i32 0, i32 %slice.hi)
  ret {<vscale x 16 x i8>, <vscale x 16 x i8>,<vscale x 16 x i8>, <vscale x 16 x i8>} %second
}
; Vertical x4 readz, i16 elements: reads tile 0 at %slice, then tile 1 at
; %slice + 4; the expected assembly folds the +4 into the 4:7 slice range.
; The second read is the value returned.
define {<vscale x 8 x i16>, <vscale x 8 x i16>,<vscale x 8 x i16>, <vscale x 8 x i16>} @test_readz_ver_z16_i16_x4(i32 %slice) #0 {
; CHECK-LABEL: test_readz_ver_z16_i16_x4:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    movaz { z0.h - z3.h }, za0v.h[w12, 0:3]
; CHECK-NEXT:    movaz { z0.h - z3.h }, za1v.h[w12, 4:7]
; CHECK-NEXT:    ret
  %first = call {<vscale x 8 x i16>, <vscale x 8 x i16>,<vscale x 8 x i16>, <vscale x 8 x i16>} @llvm.aarch64.sme.readz.vert.x4.nxv8i16(i32 0, i32 %slice)
  %slice.hi = add i32 %slice, 4
  %second = call {<vscale x 8 x i16>, <vscale x 8 x i16>,<vscale x 8 x i16>, <vscale x 8 x i16>} @llvm.aarch64.sme.readz.vert.x4.nxv8i16(i32 1, i32 %slice.hi)
  ret {<vscale x 8 x i16>, <vscale x 8 x i16>,<vscale x 8 x i16>, <vscale x 8 x i16>} %second
}

; Vertical x4 readz, i32 elements: reads tile 0 and then tile 3 at the same
; %slice. The second read is the value returned.
define {<vscale x 4 x i32>, <vscale x 4 x i32>,<vscale x 4 x i32>, <vscale x 4 x i32>} @test_readz_ver_z32_i32_x4(i32 %slice) #0 {
; CHECK-LABEL: test_readz_ver_z32_i32_x4:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    movaz { z0.s - z3.s }, za0v.s[w12, 0:3]
; CHECK-NEXT:    movaz { z0.s - z3.s }, za3v.s[w12, 0:3]
; CHECK-NEXT:    ret
  %first = call {<vscale x 4 x i32>, <vscale x 4 x i32>,<vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.aarch64.sme.readz.vert.x4.nxv4i32(i32 0, i32 %slice)
  %second = call {<vscale x 4 x i32>, <vscale x 4 x i32>,<vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.aarch64.sme.readz.vert.x4.nxv4i32(i32 3, i32 %slice)
  ret {<vscale x 4 x i32>, <vscale x 4 x i32>,<vscale x 4 x i32>, <vscale x 4 x i32>} %second
}

; Vertical x4 readz, i64 elements: reads tile 0 and then tile 7 at the same
; %slice. The first read is the value returned, and the expected assembly
; places the second read in z4-z7.
define {<vscale x 2 x i64>, <vscale x 2 x i64>,<vscale x 2 x i64>, <vscale x 2 x i64>} @test_readz_ver_z64_i64_x4(i32 %slice) #0 {
; CHECK-LABEL: test_readz_ver_z64_i64_x4:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    movaz { z0.d - z3.d }, za0v.d[w12, 0:3]
; CHECK-NEXT:    movaz { z4.d - z7.d }, za7v.d[w12, 0:3]
; CHECK-NEXT:    ret
  %first = call {<vscale x 2 x i64>, <vscale x 2 x i64>,<vscale x 2 x i64>, <vscale x 2 x i64>} @llvm.aarch64.sme.readz.vert.x4.nxv2i64(i32 0, i32 %slice)
  %second = call {<vscale x 2 x i64>, <vscale x 2 x i64>,<vscale x 2 x i64>, <vscale x 2 x i64>} @llvm.aarch64.sme.readz.vert.x4.nxv2i64(i32 7, i32 %slice)
  ret {<vscale x 2 x i64>, <vscale x 2 x i64>,<vscale x 2 x i64>, <vscale x 2 x i64>} %first
}

; Vertical x4 readz, bfloat elements: reads tile 0 at %slice, then tile 1 at
; %slice + 4; the expected assembly folds the +4 into the 4:7 slice range.
; The second read is the value returned.
define {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>} @test_readz_ver_z16_bf16_x4(i32 %slice) #0 {
; CHECK-LABEL: test_readz_ver_z16_bf16_x4:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    movaz { z0.h - z3.h }, za0v.h[w12, 0:3]
; CHECK-NEXT:    movaz { z0.h - z3.h }, za1v.h[w12, 4:7]
; CHECK-NEXT:    ret
  %first = call {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>} @llvm.aarch64.sme.readz.vert.x4.nxv8bf16(i32 0, i32 %slice)
  %slice.hi = add i32 %slice, 4
  %second = call {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>} @llvm.aarch64.sme.readz.vert.x4.nxv8bf16(i32 1, i32 %slice.hi)
  ret {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>} %second
}

; Vertical x4 readz, half elements: reads tile 0 at %slice, then tile 1 at
; %slice + 4; the expected assembly folds the +4 into the 4:7 slice range.
; The second read is the value returned.
define {<vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>} @test_readz_ver_z16_f16_x4(i32 %slice) #0 {
; CHECK-LABEL: test_readz_ver_z16_f16_x4:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    movaz { z0.h - z3.h }, za0v.h[w12, 0:3]
; CHECK-NEXT:    movaz { z0.h - z3.h }, za1v.h[w12, 4:7]
; CHECK-NEXT:    ret
  %first = call {<vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>} @llvm.aarch64.sme.readz.vert.x4.nxv8f16(i32 0, i32 %slice)
  %slice.hi = add i32 %slice, 4
  %second = call {<vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>} @llvm.aarch64.sme.readz.vert.x4.nxv8f16(i32 1, i32 %slice.hi)
  ret {<vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>} %second
}

; Vertical x4 readz, float elements: reads tile 0 and then tile 3 at the same
; %slice. The second read is the value returned.
define {<vscale x 4 x float>, <vscale x 4 x float>,<vscale x 4 x float>, <vscale x 4 x float>} @test_readz_ver_z32_f32_x4(i32 %slice) #0 {
; CHECK-LABEL: test_readz_ver_z32_f32_x4:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    movaz { z0.s - z3.s }, za0v.s[w12, 0:3]
; CHECK-NEXT:    movaz { z0.s - z3.s }, za3v.s[w12, 0:3]
; CHECK-NEXT:    ret
  %first = call {<vscale x 4 x float>, <vscale x 4 x float>,<vscale x 4 x float>, <vscale x 4 x float>} @llvm.aarch64.sme.readz.vert.x4.nxv4f32(i32 0, i32 %slice)
  %second = call {<vscale x 4 x float>, <vscale x 4 x float>,<vscale x 4 x float>, <vscale x 4 x float>} @llvm.aarch64.sme.readz.vert.x4.nxv4f32(i32 3, i32 %slice)
  ret {<vscale x 4 x float>, <vscale x 4 x float>,<vscale x 4 x float>, <vscale x 4 x float>} %second
}

; Vertical x4 readz, double elements: reads tile 0 and then tile 7 at the same
; %slice. The first read is the value returned, and the expected assembly
; places the second read in z4-z7.
define {<vscale x 2 x double>, <vscale x 2 x double>,<vscale x 2 x double>, <vscale x 2 x double>} @test_readz_ver_z64_f64_x4(i32 %slice) #0 {
; CHECK-LABEL: test_readz_ver_z64_f64_x4:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    movaz { z0.d - z3.d }, za0v.d[w12, 0:3]
; CHECK-NEXT:    movaz { z4.d - z7.d }, za7v.d[w12, 0:3]
; CHECK-NEXT:    ret
  %first = call {<vscale x 2 x double>, <vscale x 2 x double>,<vscale x 2 x double>, <vscale x 2 x double>} @llvm.aarch64.sme.readz.vert.x4.nxv2f64(i32 0, i32 %slice)
  %second = call {<vscale x 2 x double>, <vscale x 2 x double>,<vscale x 2 x double>, <vscale x 2 x double>} @llvm.aarch64.sme.readz.vert.x4.nxv2f64(i32 7, i32 %slice)
  ret {<vscale x 2 x double>, <vscale x 2 x double>,<vscale x 2 x double>, <vscale x 2 x double>} %first
}


; Declarations of the horizontal x2 readz intrinsics, one per element type.
; The i8 entry is named readz.horiz.x2.nxv16i8, matching the symbol the x2
; horizontal i8 test calls.
declare {<vscale x 16 x i8>, <vscale x 16 x i8>} @llvm.aarch64.sme.readz.horiz.x2.nxv16i8(i32, i32)
declare {<vscale x 8 x i16>, <vscale x 8 x i16>} @llvm.aarch64.sme.readz.horiz.x2.nxv8i16(i32, i32)
declare {<vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.aarch64.sme.readz.horiz.x2.nxv4i32(i32, i32)
declare {<vscale x 2 x i64>, <vscale x 2 x i64>} @llvm.aarch64.sme.readz.horiz.x2.nxv2i64(i32, i32)
declare {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>} @llvm.aarch64.sme.readz.horiz.x2.nxv8bf16(i32, i32)
declare {<vscale x 8 x half>, <vscale x 8 x half>} @llvm.aarch64.sme.readz.horiz.x2.nxv8f16(i32, i32)
declare {<vscale x 4 x float>, <vscale x 4 x float>} @llvm.aarch64.sme.readz.horiz.x2.nxv4f32(i32, i32)
declare {<vscale x 2 x double>, <vscale x 2 x double>} @llvm.aarch64.sme.readz.horiz.x2.nxv2f64(i32, i32)

; Declarations of the vertical x2 readz intrinsics, one per element type.
; The i8 entry is named readz.vert.x2.nxv16i8, matching the symbol the x2
; vertical i8 test calls.
declare {<vscale x 16 x i8>, <vscale x 16 x i8>} @llvm.aarch64.sme.readz.vert.x2.nxv16i8(i32, i32)
declare {<vscale x 8 x i16>, <vscale x 8 x i16>} @llvm.aarch64.sme.readz.vert.x2.nxv8i16(i32, i32)
declare {<vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.aarch64.sme.readz.vert.x2.nxv4i32(i32, i32)
declare {<vscale x 2 x i64>, <vscale x 2 x i64>} @llvm.aarch64.sme.readz.vert.x2.nxv2i64(i32, i32)
declare {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>} @llvm.aarch64.sme.readz.vert.x2.nxv8bf16(i32, i32)
declare {<vscale x 8 x half>, <vscale x 8 x half>} @llvm.aarch64.sme.readz.vert.x2.nxv8f16(i32, i32)
declare {<vscale x 4 x float>, <vscale x 4 x float>} @llvm.aarch64.sme.readz.vert.x2.nxv4f32(i32, i32)
declare {<vscale x 2 x double>, <vscale x 2 x double>} @llvm.aarch64.sme.readz.vert.x2.nxv2f64(i32, i32)

; Declarations of the horizontal x4 readz intrinsics, one per element type.
; The i8 entry is named readz.horiz.x4.nxv16i8, matching the symbol the x4
; horizontal i8 test calls.
declare {<vscale x 16 x i8>, <vscale x 16 x i8>,<vscale x 16 x i8>, <vscale x 16 x i8>} @llvm.aarch64.sme.readz.horiz.x4.nxv16i8(i32, i32)
declare {<vscale x 8 x i16>, <vscale x 8 x i16>,<vscale x 8 x i16>, <vscale x 8 x i16>} @llvm.aarch64.sme.readz.horiz.x4.nxv8i16(i32, i32)
declare {<vscale x 4 x i32>, <vscale x 4 x i32>,<vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.aarch64.sme.readz.horiz.x4.nxv4i32(i32, i32)
declare {<vscale x 2 x i64>, <vscale x 2 x i64>,<vscale x 2 x i64>, <vscale x 2 x i64>} @llvm.aarch64.sme.readz.horiz.x4.nxv2i64(i32, i32)
declare {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>} @llvm.aarch64.sme.readz.horiz.x4.nxv8bf16(i32, i32)
declare {<vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>} @llvm.aarch64.sme.readz.horiz.x4.nxv8f16(i32, i32)
declare {<vscale x 4 x float>, <vscale x 4 x float>,<vscale x 4 x float>, <vscale x 4 x float>} @llvm.aarch64.sme.readz.horiz.x4.nxv4f32(i32, i32)
declare {<vscale x 2 x double>, <vscale x 2 x double>,<vscale x 2 x double>, <vscale x 2 x double>} @llvm.aarch64.sme.readz.horiz.x4.nxv2f64(i32, i32)

; Declarations of the vertical x4 readz intrinsics, one per element type.
; The i8 entry is named readz.vert.x4.nxv16i8, matching the symbol the x4
; vertical i8 test calls.
declare {<vscale x 16 x i8>, <vscale x 16 x i8>,<vscale x 16 x i8>, <vscale x 16 x i8>} @llvm.aarch64.sme.readz.vert.x4.nxv16i8(i32, i32)
declare {<vscale x 8 x i16>, <vscale x 8 x i16>,<vscale x 8 x i16>, <vscale x 8 x i16>} @llvm.aarch64.sme.readz.vert.x4.nxv8i16(i32, i32)
declare {<vscale x 4 x i32>, <vscale x 4 x i32>,<vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.aarch64.sme.readz.vert.x4.nxv4i32(i32, i32)
declare {<vscale x 2 x i64>, <vscale x 2 x i64>,<vscale x 2 x i64>, <vscale x 2 x i64>} @llvm.aarch64.sme.readz.vert.x4.nxv2i64(i32, i32)
declare {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>} @llvm.aarch64.sme.readz.vert.x4.nxv8bf16(i32, i32)
declare {<vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>} @llvm.aarch64.sme.readz.vert.x4.nxv8f16(i32, i32)
declare {<vscale x 4 x float>, <vscale x 4 x float>,<vscale x 4 x float>, <vscale x 4 x float>} @llvm.aarch64.sme.readz.vert.x4.nxv4f32(i32, i32)
declare {<vscale x 2 x double>, <vscale x 2 x double>,<vscale x 2 x double>, <vscale x 2 x double>} @llvm.aarch64.sme.readz.vert.x4.nxv2f64(i32, i32)

;MOVAZ (tile to vector, single)

;;
; Horiz
;;
; Horizontal single-vector readz, i8 elements: reads tile 0 at %slice, then
; tile 0 again at %slice + 14; the expected assembly folds the +14 into the
; slice offset. The second read is the value returned. %tile is unused.
define <vscale x 16 x i8> @test_readz_hor_z8_i8(i32 %tile, i32 %slice) #0 {
; CHECK-LABEL: test_readz_hor_z8_i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w1
; CHECK-NEXT:    movaz z0.b, za0h.b[w12, 0]
; CHECK-NEXT:    movaz z0.b, za0h.b[w12, 14]
; CHECK-NEXT:    ret
  %first = call <vscale x 16 x i8> @llvm.aarch64.sme.readz.horiz.nxv16i8(i32 0, i32 %slice)
  %slice.hi = add i32 %slice, 14
  %second = call <vscale x 16 x i8> @llvm.aarch64.sme.readz.horiz.nxv16i8(i32 0, i32 %slice.hi)
  ret <vscale x 16 x i8> %second
}

; Horizontal single-vector readz, i16 elements: reads tile 0 at %slice, then
; tile 1 at %slice + 7; the expected assembly folds the +7 into the slice
; offset. The second read is the value returned. %tile is unused.
define <vscale x 8 x i16> @test_readz_hor_z16_i16(i32 %tile, i32 %slice) #0 {
; CHECK-LABEL: test_readz_hor_z16_i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w1
; CHECK-NEXT:    movaz z0.h, za0h.h[w12, 0]
; CHECK-NEXT:    movaz z0.h, za1h.h[w12, 7]
; CHECK-NEXT:    ret
  %first = call <vscale x 8 x i16> @llvm.aarch64.sme.readz.horiz.nxv8i16(i32 0, i32 %slice)
  %slice.hi = add i32 %slice, 7
  %second = call <vscale x 8 x i16> @llvm.aarch64.sme.readz.horiz.nxv8i16(i32 1, i32 %slice.hi)
  ret <vscale x 8 x i16> %second
}

; Horizontal single-vector readz, i32 elements: reads tile 0 at %slice, then
; tile 3 at %slice + 3; the expected assembly folds the +3 into the slice
; offset. The second read is the value returned. %tile is unused.
define <vscale x 4 x i32> @test_readz_hor_z32_i32(i32 %tile, i32 %slice) #0 {
; CHECK-LABEL: test_readz_hor_z32_i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w1
; CHECK-NEXT:    movaz z0.s, za0h.s[w12, 0]
; CHECK-NEXT:    movaz z0.s, za3h.s[w12, 3]
; CHECK-NEXT:    ret
  %first = call <vscale x 4 x i32> @llvm.aarch64.sme.readz.horiz.nxv4i32(i32 0, i32 %slice)
  %slice.hi = add i32 %slice, 3
  %second = call <vscale x 4 x i32> @llvm.aarch64.sme.readz.horiz.nxv4i32(i32 3, i32 %slice.hi)
  ret <vscale x 4 x i32> %second
}

; Horizontal single-vector readz, i64 elements: reads tile 0 at %slice, then
; tile 7 at %slice + 1; the expected assembly folds the +1 into the slice
; offset. The first read is the value returned, and the expected assembly
; places the second read in z1. %tile is unused.
define <vscale x 2 x i64> @test_readz_hor_z64_i64(i32 %tile, i32 %slice) #0 {
; CHECK-LABEL: test_readz_hor_z64_i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w1
; CHECK-NEXT:    movaz z0.d, za0h.d[w12, 0]
; CHECK-NEXT:    movaz z1.d, za7h.d[w12, 1]
; CHECK-NEXT:    ret
  %first = call <vscale x 2 x i64> @llvm.aarch64.sme.readz.horiz.nxv2i64(i32 0, i32 %slice)
  %slice.hi = add i32 %slice, 1
  %second = call <vscale x 2 x i64> @llvm.aarch64.sme.readz.horiz.nxv2i64(i32 7, i32 %slice.hi)
  ret <vscale x 2 x i64> %first
}

; Horizontal single-vector readz, bfloat elements: reads tile 0 at %slice,
; then tile 1 at %slice + 7; the expected assembly folds the +7 into the slice
; offset. The second read is the value returned. %tile is unused.
define <vscale x 8 x bfloat> @test_readz_hor_z16_bf16(i32 %tile, i32 %slice) #0 {
; CHECK-LABEL: test_readz_hor_z16_bf16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w1
; CHECK-NEXT:    movaz z0.h, za0h.h[w12, 0]
; CHECK-NEXT:    movaz z0.h, za1h.h[w12, 7]
; CHECK-NEXT:    ret
  %first = call <vscale x 8 x bfloat> @llvm.aarch64.sme.readz.horiz.nxv8bf16(i32 0, i32 %slice)
  %slice.hi = add i32 %slice, 7
  %second = call <vscale x 8 x bfloat> @llvm.aarch64.sme.readz.horiz.nxv8bf16(i32 1, i32 %slice.hi)
  ret <vscale x 8 x bfloat> %second
}

; Horizontal single-vector readz, half elements: reads tile 0 at %slice, then
; tile 1 at %slice + 7; the expected assembly folds the +7 into the slice
; offset. The second read is the value returned. %tile is unused.
define <vscale x 8 x half> @test_readz_hor_z16_f16(i32 %tile, i32 %slice) #0 {
; CHECK-LABEL: test_readz_hor_z16_f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w1
; CHECK-NEXT:    movaz z0.h, za0h.h[w12, 0]
; CHECK-NEXT:    movaz z0.h, za1h.h[w12, 7]
; CHECK-NEXT:    ret
  %first = call <vscale x 8 x half> @llvm.aarch64.sme.readz.horiz.nxv8f16(i32 0, i32 %slice)
  %slice.hi = add i32 %slice, 7
  %second = call <vscale x 8 x half> @llvm.aarch64.sme.readz.horiz.nxv8f16(i32 1, i32 %slice.hi)
  ret <vscale x 8 x half> %second
}

; Horizontal single-vector readz, float elements: reads tile 0 at %slice, then
; tile 3 at %slice + 3; the expected assembly folds the +3 into the slice
; offset. The second read is the value returned. %tile is unused.
define <vscale x 4 x float> @test_readz_hor_z32_f32(i32 %tile, i32 %slice) #0 {
; CHECK-LABEL: test_readz_hor_z32_f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w1
; CHECK-NEXT:    movaz z0.s, za0h.s[w12, 0]
; CHECK-NEXT:    movaz z0.s, za3h.s[w12, 3]
; CHECK-NEXT:    ret
  %first = call <vscale x 4 x float> @llvm.aarch64.sme.readz.horiz.nxv4f32(i32 0, i32 %slice)
  %slice.hi = add i32 %slice, 3
  %second = call <vscale x 4 x float> @llvm.aarch64.sme.readz.horiz.nxv4f32(i32 3, i32 %slice.hi)
  ret <vscale x 4 x float> %second
}

; Horizontal single-vector readz, double elements: reads tile 0 at %slice,
; then tile 7 at %slice + 1; the expected assembly folds the +1 into the slice
; offset. The first read is the value returned, and the expected assembly
; places the second read in z1. %tile is unused.
define <vscale x 2 x double> @test_readz_hor_z64_f64(i32 %tile, i32 %slice) #0 {
; CHECK-LABEL: test_readz_hor_z64_f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w1
; CHECK-NEXT:    movaz z0.d, za0h.d[w12, 0]
; CHECK-NEXT:    movaz z1.d, za7h.d[w12, 1]
; CHECK-NEXT:    ret
  %first = call <vscale x 2 x double> @llvm.aarch64.sme.readz.horiz.nxv2f64(i32 0, i32 %slice)
  %slice.hi = add i32 %slice, 1
  %second = call <vscale x 2 x double> @llvm.aarch64.sme.readz.horiz.nxv2f64(i32 7, i32 %slice.hi)
  ret <vscale x 2 x double> %first
}

; Horizontal readz.q returning an i8 vector: reads tile 0 and then tile 15 at
; the same %slice; the expected assembly uses the .q element size for both.
; The second read is the value returned. %tile is unused.
define <vscale x 16 x i8> @test_readz_hor_z128_i8(i32 %tile, i32 %slice) #0 {
; CHECK-LABEL: test_readz_hor_z128_i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w1
; CHECK-NEXT:    movaz z0.q, za0h.q[w12, 0]
; CHECK-NEXT:    movaz z0.q, za15h.q[w12, 0]
; CHECK-NEXT:    ret
  %first = call <vscale x 16 x i8> @llvm.aarch64.sme.readz.q.horiz.nxv16i8(i32 0, i32 %slice)
  %second = call <vscale x 16 x i8> @llvm.aarch64.sme.readz.q.horiz.nxv16i8(i32 15, i32 %slice)
  ret <vscale x 16 x i8> %second
}

; Horizontal readz.q, result typed <vscale x 8 x i16>.
; Reads tiles 0 and 15 at the same %slice (za0h.q / za15h.q in the expected
; assembly); the tile-15 read is the return value. %tile is never referenced.
define <vscale x 8 x i16> @test_readz_hor_z128_i16(i32 %tile, i32 %slice) #0 {
; CHECK-LABEL: test_readz_hor_z128_i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w1
; CHECK-NEXT:    movaz z0.q, za0h.q[w12, 0]
; CHECK-NEXT:    movaz z0.q, za15h.q[w12, 0]
; CHECK-NEXT:    ret
  %read.tile0 = call <vscale x 8 x i16> @llvm.aarch64.sme.readz.q.horiz.nxv8i16(i32 0, i32 %slice)
  %read.tile15 = call <vscale x 8 x i16> @llvm.aarch64.sme.readz.q.horiz.nxv8i16(i32 15, i32 %slice)
  ret <vscale x 8 x i16> %read.tile15
}

; Horizontal readz.q, result typed <vscale x 4 x i32>.
; Reads tiles 0 and 15 at the same %slice (za0h.q / za15h.q in the expected
; assembly); the tile-15 read is the return value. %tile is never referenced.
define <vscale x 4 x i32> @test_readz_hor_z128_i32(i32 %tile, i32 %slice) #0 {
; CHECK-LABEL: test_readz_hor_z128_i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w1
; CHECK-NEXT:    movaz z0.q, za0h.q[w12, 0]
; CHECK-NEXT:    movaz z0.q, za15h.q[w12, 0]
; CHECK-NEXT:    ret
  %read.tile0 = call <vscale x 4 x i32> @llvm.aarch64.sme.readz.q.horiz.nxv4i32(i32 0, i32 %slice)
  %read.tile15 = call <vscale x 4 x i32> @llvm.aarch64.sme.readz.q.horiz.nxv4i32(i32 15, i32 %slice)
  ret <vscale x 4 x i32> %read.tile15
}

; Horizontal readz.q, result typed <vscale x 2 x i64>.
; Reads tiles 0 and 15 at the same %slice (za0h.q / za15h.q in the expected
; assembly). This test returns the FIRST (tile-0) read, so the tile-15
; result is unused; the expected assembly places it in z1.
; %tile is never referenced.
define <vscale x 2 x i64> @test_readz_hor_z128_i64(i32 %tile, i32 %slice) #0 {
; CHECK-LABEL: test_readz_hor_z128_i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w1
; CHECK-NEXT:    movaz z0.q, za0h.q[w12, 0]
; CHECK-NEXT:    movaz z1.q, za15h.q[w12, 0]
; CHECK-NEXT:    ret
  %read.tile0 = call <vscale x 2 x i64> @llvm.aarch64.sme.readz.q.horiz.nxv2i64(i32 0, i32 %slice)
  %read.tile15 = call <vscale x 2 x i64> @llvm.aarch64.sme.readz.q.horiz.nxv2i64(i32 15, i32 %slice)
  ret <vscale x 2 x i64> %read.tile0
}

; Horizontal readz.q, result typed <vscale x 8 x bfloat>.
; Reads tiles 0 and 15 at the same %slice (za0h.q / za15h.q in the expected
; assembly); the tile-15 read is the return value. %tile is never referenced.
define <vscale x 8 x bfloat> @test_readz_hor_z128_bf16(i32 %tile, i32 %slice) #0 {
; CHECK-LABEL: test_readz_hor_z128_bf16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w1
; CHECK-NEXT:    movaz z0.q, za0h.q[w12, 0]
; CHECK-NEXT:    movaz z0.q, za15h.q[w12, 0]
; CHECK-NEXT:    ret
  %read.tile0 = call <vscale x 8 x bfloat> @llvm.aarch64.sme.readz.q.horiz.nxv8bf16(i32 0, i32 %slice)
  %read.tile15 = call <vscale x 8 x bfloat> @llvm.aarch64.sme.readz.q.horiz.nxv8bf16(i32 15, i32 %slice)
  ret <vscale x 8 x bfloat> %read.tile15
}

; Horizontal readz.q, result typed <vscale x 8 x half>.
; Reads tiles 0 and 15 at the same %slice (za0h.q / za15h.q in the expected
; assembly); the tile-15 read is the return value. %tile is never referenced.
define <vscale x 8 x half> @test_readz_hor_z128_f16(i32 %tile, i32 %slice) #0 {
; CHECK-LABEL: test_readz_hor_z128_f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w1
; CHECK-NEXT:    movaz z0.q, za0h.q[w12, 0]
; CHECK-NEXT:    movaz z0.q, za15h.q[w12, 0]
; CHECK-NEXT:    ret
  %read.tile0 = call <vscale x 8 x half> @llvm.aarch64.sme.readz.q.horiz.nxv8f16(i32 0, i32 %slice)
  %read.tile15 = call <vscale x 8 x half> @llvm.aarch64.sme.readz.q.horiz.nxv8f16(i32 15, i32 %slice)
  ret <vscale x 8 x half> %read.tile15
}

; Horizontal readz.q, result typed <vscale x 4 x float>.
; Reads tiles 0 and 15 at the same %slice (za0h.q / za15h.q in the expected
; assembly); the tile-15 read is the return value. %tile is never referenced.
define <vscale x 4 x float> @test_readz_hor_z128_f32(i32 %tile, i32 %slice) #0 {
; CHECK-LABEL: test_readz_hor_z128_f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w1
; CHECK-NEXT:    movaz z0.q, za0h.q[w12, 0]
; CHECK-NEXT:    movaz z0.q, za15h.q[w12, 0]
; CHECK-NEXT:    ret
  %read.tile0 = call <vscale x 4 x float> @llvm.aarch64.sme.readz.q.horiz.nxv4f32(i32 0, i32 %slice)
  %read.tile15 = call <vscale x 4 x float> @llvm.aarch64.sme.readz.q.horiz.nxv4f32(i32 15, i32 %slice)
  ret <vscale x 4 x float> %read.tile15
}

; Horizontal readz.q, result typed <vscale x 2 x double>.
; Reads tiles 0 and 15 at the same %slice (za0h.q / za15h.q in the expected
; assembly). This test returns the FIRST (tile-0) read, so the tile-15
; result is unused; the expected assembly places it in z1.
; %tile is never referenced.
define <vscale x 2 x double> @test_readz_hor_z128_f64(i32 %tile, i32 %slice) #0 {
; CHECK-LABEL: test_readz_hor_z128_f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w1
; CHECK-NEXT:    movaz z0.q, za0h.q[w12, 0]
; CHECK-NEXT:    movaz z1.q, za15h.q[w12, 0]
; CHECK-NEXT:    ret
  %read.tile0 = call <vscale x 2 x double> @llvm.aarch64.sme.readz.q.horiz.nxv2f64(i32 0, i32 %slice)
  %read.tile15 = call <vscale x 2 x double> @llvm.aarch64.sme.readz.q.horiz.nxv2f64(i32 15, i32 %slice)
  ret <vscale x 2 x double> %read.tile0
}

;;
; Vert
;;
; Vertical single-vector readz, result typed <vscale x 16 x i8>.
; Reads tile 0 twice, at %slice and at %slice + 14 (za0v.b in the expected
; assembly); the second read is the return value.
; %tile is never referenced: the tile number is a constant in each call.
define <vscale x 16 x i8> @test_readz_ver_z8_i8(i32 %tile, i32 %slice) #0 {
; CHECK-LABEL: test_readz_ver_z8_i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w1
; CHECK-NEXT:    movaz z0.b, za0v.b[w12, 0]
; CHECK-NEXT:    movaz z0.b, za0v.b[w12, 14]
; CHECK-NEXT:    ret
  %read.base = call <vscale x 16 x i8> @llvm.aarch64.sme.readz.vert.nxv16i8(i32 0, i32 %slice)
  %slice.off14 = add i32 %slice, 14
  %read.off14 = call <vscale x 16 x i8> @llvm.aarch64.sme.readz.vert.nxv16i8(i32 0, i32 %slice.off14)
  ret <vscale x 16 x i8> %read.off14
}

; Vertical single-vector readz, result typed <vscale x 8 x i16>.
; Reads tile 0 at %slice and tile 1 at %slice + 7 (za0v.h / za1v.h in the
; expected assembly); the second read is the return value.
; %tile is never referenced: the tile number is a constant in each call.
define <vscale x 8 x i16> @test_readz_ver_z16_i16(i32 %tile, i32 %slice) #0 {
; CHECK-LABEL: test_readz_ver_z16_i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w1
; CHECK-NEXT:    movaz z0.h, za0v.h[w12, 0]
; CHECK-NEXT:    movaz z0.h, za1v.h[w12, 7]
; CHECK-NEXT:    ret
  %read.base = call <vscale x 8 x i16> @llvm.aarch64.sme.readz.vert.nxv8i16(i32 0, i32 %slice)
  %slice.off7 = add i32 %slice, 7
  %read.off7 = call <vscale x 8 x i16> @llvm.aarch64.sme.readz.vert.nxv8i16(i32 1, i32 %slice.off7)
  ret <vscale x 8 x i16> %read.off7
}

; Vertical single-vector readz, result typed <vscale x 4 x i32>.
; Reads tile 0 at %slice and tile 3 at %slice + 3 (za0v.s / za3v.s in the
; expected assembly); the second read is the return value.
; %tile is never referenced: the tile number is a constant in each call.
define <vscale x 4 x i32> @test_readz_ver_z32_i32(i32 %tile, i32 %slice) #0 {
; CHECK-LABEL: test_readz_ver_z32_i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w1
; CHECK-NEXT:    movaz z0.s, za0v.s[w12, 0]
; CHECK-NEXT:    movaz z0.s, za3v.s[w12, 3]
; CHECK-NEXT:    ret
  %read.base = call <vscale x 4 x i32> @llvm.aarch64.sme.readz.vert.nxv4i32(i32 0, i32 %slice)
  %slice.off3 = add i32 %slice, 3
  %read.off3 = call <vscale x 4 x i32> @llvm.aarch64.sme.readz.vert.nxv4i32(i32 3, i32 %slice.off3)
  ret <vscale x 4 x i32> %read.off3
}

; Vertical single-vector readz, result typed <vscale x 2 x i64>.
; Reads tile 0 at %slice and tile 7 at %slice + 1 (za0v.d / za7v.d in the
; expected assembly). This test returns the FIRST read, so the second
; read's result is unused; the expected assembly places it in z1.
; %tile is never referenced: the tile number is a constant in each call.
define <vscale x 2 x i64> @test_readz_ver_z64_i64(i32 %tile, i32 %slice) #0 {
; CHECK-LABEL: test_readz_ver_z64_i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w1
; CHECK-NEXT:    movaz z0.d, za0v.d[w12, 0]
; CHECK-NEXT:    movaz z1.d, za7v.d[w12, 1]
; CHECK-NEXT:    ret
  %read.base = call <vscale x 2 x i64> @llvm.aarch64.sme.readz.vert.nxv2i64(i32 0, i32 %slice)
  %slice.off1 = add i32 %slice, 1
  %read.off1 = call <vscale x 2 x i64> @llvm.aarch64.sme.readz.vert.nxv2i64(i32 7, i32 %slice.off1)
  ret <vscale x 2 x i64> %read.base
}

; Vertical single-vector readz, result typed <vscale x 8 x bfloat>.
; Reads tile 0 at %slice and tile 1 at %slice + 7 (za0v.h / za1v.h in the
; expected assembly); the second read is the return value.
; %tile is never referenced: the tile number is a constant in each call.
define <vscale x 8 x bfloat> @test_readz_ver_z16_bf16(i32 %tile, i32 %slice) #0 {
; CHECK-LABEL: test_readz_ver_z16_bf16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w1
; CHECK-NEXT:    movaz z0.h, za0v.h[w12, 0]
; CHECK-NEXT:    movaz z0.h, za1v.h[w12, 7]
; CHECK-NEXT:    ret
  %read.base = call <vscale x 8 x bfloat> @llvm.aarch64.sme.readz.vert.nxv8bf16(i32 0, i32 %slice)
  %slice.off7 = add i32 %slice, 7
  %read.off7 = call <vscale x 8 x bfloat> @llvm.aarch64.sme.readz.vert.nxv8bf16(i32 1, i32 %slice.off7)
  ret <vscale x 8 x bfloat> %read.off7
}

; Vertical single-vector readz, result typed <vscale x 8 x half>.
; Reads tile 0 at %slice and tile 1 at %slice + 7 (za0v.h / za1v.h in the
; expected assembly); the second read is the return value.
; %tile is never referenced: the tile number is a constant in each call.
define <vscale x 8 x half> @test_readz_ver_z16_f16(i32 %tile, i32 %slice) #0 {
; CHECK-LABEL: test_readz_ver_z16_f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w1
; CHECK-NEXT:    movaz z0.h, za0v.h[w12, 0]
; CHECK-NEXT:    movaz z0.h, za1v.h[w12, 7]
; CHECK-NEXT:    ret
  %read.base = call <vscale x 8 x half> @llvm.aarch64.sme.readz.vert.nxv8f16(i32 0, i32 %slice)
  %slice.off7 = add i32 %slice, 7
  %read.off7 = call <vscale x 8 x half> @llvm.aarch64.sme.readz.vert.nxv8f16(i32 1, i32 %slice.off7)
  ret <vscale x 8 x half> %read.off7
}

; Vertical single-vector readz, result typed <vscale x 4 x float>.
; Reads tile 0 at %slice and tile 3 at %slice + 3 (za0v.s / za3v.s in the
; expected assembly); the second read is the return value.
; %tile is never referenced: the tile number is a constant in each call.
define <vscale x 4 x float> @test_readz_ver_z32_f32(i32 %tile, i32 %slice) #0 {
; CHECK-LABEL: test_readz_ver_z32_f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w1
; CHECK-NEXT:    movaz z0.s, za0v.s[w12, 0]
; CHECK-NEXT:    movaz z0.s, za3v.s[w12, 3]
; CHECK-NEXT:    ret
  %read.base = call <vscale x 4 x float> @llvm.aarch64.sme.readz.vert.nxv4f32(i32 0, i32 %slice)
  %slice.off3 = add i32 %slice, 3
  %read.off3 = call <vscale x 4 x float> @llvm.aarch64.sme.readz.vert.nxv4f32(i32 3, i32 %slice.off3)
  ret <vscale x 4 x float> %read.off3
}

; Vertical single-vector readz, result typed <vscale x 2 x double>.
; Reads tile 0 at %slice and tile 7 at %slice + 1 (za0v.d / za7v.d in the
; expected assembly). This test returns the FIRST read, so the second
; read's result is unused; the expected assembly places it in z1.
; %tile is never referenced: the tile number is a constant in each call.
define <vscale x 2 x double> @test_readz_ver_z64_f64(i32 %tile, i32 %slice) #0 {
; CHECK-LABEL: test_readz_ver_z64_f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w1
; CHECK-NEXT:    movaz z0.d, za0v.d[w12, 0]
; CHECK-NEXT:    movaz z1.d, za7v.d[w12, 1]
; CHECK-NEXT:    ret
  %read.base = call <vscale x 2 x double> @llvm.aarch64.sme.readz.vert.nxv2f64(i32 0, i32 %slice)
  %slice.off1 = add i32 %slice, 1
  %read.off1 = call <vscale x 2 x double> @llvm.aarch64.sme.readz.vert.nxv2f64(i32 7, i32 %slice.off1)
  ret <vscale x 2 x double> %read.base
}

; Vertical readz.q, result typed <vscale x 16 x i8>.
; Reads tiles 0 and 15 at the same %slice (za0v.q / za15v.q in the expected
; assembly); the tile-15 read is the return value. %tile is never referenced.
define <vscale x 16 x i8> @test_readz_ver_z128_i8(i32 %tile, i32 %slice) #0 {
; CHECK-LABEL: test_readz_ver_z128_i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w1
; CHECK-NEXT:    movaz z0.q, za0v.q[w12, 0]
; CHECK-NEXT:    movaz z0.q, za15v.q[w12, 0]
; CHECK-NEXT:    ret
  %read.tile0 = call <vscale x 16 x i8> @llvm.aarch64.sme.readz.q.vert.nxv16i8(i32 0, i32 %slice)
  %read.tile15 = call <vscale x 16 x i8> @llvm.aarch64.sme.readz.q.vert.nxv16i8(i32 15, i32 %slice)
  ret <vscale x 16 x i8> %read.tile15
}

; Vertical readz.q, result typed <vscale x 8 x i16>.
; Reads tiles 0 and 15 at the same %slice (za0v.q / za15v.q in the expected
; assembly); the tile-15 read is the return value. %tile is never referenced.
define <vscale x 8 x i16> @test_readz_ver_z128_i16(i32 %tile, i32 %slice) #0 {
; CHECK-LABEL: test_readz_ver_z128_i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w1
; CHECK-NEXT:    movaz z0.q, za0v.q[w12, 0]
; CHECK-NEXT:    movaz z0.q, za15v.q[w12, 0]
; CHECK-NEXT:    ret
  %read.tile0 = call <vscale x 8 x i16> @llvm.aarch64.sme.readz.q.vert.nxv8i16(i32 0, i32 %slice)
  %read.tile15 = call <vscale x 8 x i16> @llvm.aarch64.sme.readz.q.vert.nxv8i16(i32 15, i32 %slice)
  ret <vscale x 8 x i16> %read.tile15
}

; Vertical readz.q, result typed <vscale x 4 x i32>.
; Reads tiles 0 and 15 at the same %slice (za0v.q / za15v.q in the expected
; assembly); the tile-15 read is the return value. %tile is never referenced.
define <vscale x 4 x i32> @test_readz_ver_z128_i32(i32 %tile, i32 %slice) #0 {
; CHECK-LABEL: test_readz_ver_z128_i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w1
; CHECK-NEXT:    movaz z0.q, za0v.q[w12, 0]
; CHECK-NEXT:    movaz z0.q, za15v.q[w12, 0]
; CHECK-NEXT:    ret
  %read.tile0 = call <vscale x 4 x i32> @llvm.aarch64.sme.readz.q.vert.nxv4i32(i32 0, i32 %slice)
  %read.tile15 = call <vscale x 4 x i32> @llvm.aarch64.sme.readz.q.vert.nxv4i32(i32 15, i32 %slice)
  ret <vscale x 4 x i32> %read.tile15
}

; Vertical readz.q, result typed <vscale x 2 x i64>.
; Reads tiles 0 and 15 at the same %slice (za0v.q / za15v.q in the expected
; assembly). This test returns the FIRST (tile-0) read, so the tile-15
; result is unused; the expected assembly places it in z1.
; %tile is never referenced.
define <vscale x 2 x i64> @test_readz_ver_z128_i64(i32 %tile, i32 %slice) #0 {
; CHECK-LABEL: test_readz_ver_z128_i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w1
; CHECK-NEXT:    movaz z0.q, za0v.q[w12, 0]
; CHECK-NEXT:    movaz z1.q, za15v.q[w12, 0]
; CHECK-NEXT:    ret
  %read.tile0 = call <vscale x 2 x i64> @llvm.aarch64.sme.readz.q.vert.nxv2i64(i32 0, i32 %slice)
  %read.tile15 = call <vscale x 2 x i64> @llvm.aarch64.sme.readz.q.vert.nxv2i64(i32 15, i32 %slice)
  ret <vscale x 2 x i64> %read.tile0
}

; Vertical readz.q, result typed <vscale x 8 x bfloat>.
; Reads tiles 0 and 15 at the same %slice (za0v.q / za15v.q in the expected
; assembly); the tile-15 read is the return value. %tile is never referenced.
define <vscale x 8 x bfloat> @test_readz_ver_z128_bf16(i32 %tile, i32 %slice) #0 {
; CHECK-LABEL: test_readz_ver_z128_bf16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w1
; CHECK-NEXT:    movaz z0.q, za0v.q[w12, 0]
; CHECK-NEXT:    movaz z0.q, za15v.q[w12, 0]
; CHECK-NEXT:    ret
  %read.tile0 = call <vscale x 8 x bfloat> @llvm.aarch64.sme.readz.q.vert.nxv8bf16(i32 0, i32 %slice)
  %read.tile15 = call <vscale x 8 x bfloat> @llvm.aarch64.sme.readz.q.vert.nxv8bf16(i32 15, i32 %slice)
  ret <vscale x 8 x bfloat> %read.tile15
}

; Vertical readz.q, result typed <vscale x 8 x half>.
; Reads tiles 0 and 15 at the same %slice (za0v.q / za15v.q in the expected
; assembly); the tile-15 read is the return value. %tile is never referenced.
define <vscale x 8 x half> @test_readz_ver_z128_f16(i32 %tile, i32 %slice) #0 {
; CHECK-LABEL: test_readz_ver_z128_f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w1
; CHECK-NEXT:    movaz z0.q, za0v.q[w12, 0]
; CHECK-NEXT:    movaz z0.q, za15v.q[w12, 0]
; CHECK-NEXT:    ret
  %read.tile0 = call <vscale x 8 x half> @llvm.aarch64.sme.readz.q.vert.nxv8f16(i32 0, i32 %slice)
  %read.tile15 = call <vscale x 8 x half> @llvm.aarch64.sme.readz.q.vert.nxv8f16(i32 15, i32 %slice)
  ret <vscale x 8 x half> %read.tile15
}

; Vertical readz.q, result typed <vscale x 4 x float>.
; Reads tiles 0 and 15 at the same %slice (za0v.q / za15v.q in the expected
; assembly); the tile-15 read is the return value. %tile is never referenced.
define <vscale x 4 x float> @test_readz_ver_z128_f32(i32 %tile, i32 %slice) #0 {
; CHECK-LABEL: test_readz_ver_z128_f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w1
; CHECK-NEXT:    movaz z0.q, za0v.q[w12, 0]
; CHECK-NEXT:    movaz z0.q, za15v.q[w12, 0]
; CHECK-NEXT:    ret
  %read.tile0 = call <vscale x 4 x float> @llvm.aarch64.sme.readz.q.vert.nxv4f32(i32 0, i32 %slice)
  %read.tile15 = call <vscale x 4 x float> @llvm.aarch64.sme.readz.q.vert.nxv4f32(i32 15, i32 %slice)
  ret <vscale x 4 x float> %read.tile15
}

; Vertical readz.q, result typed <vscale x 2 x double>.
; Reads tiles 0 and 15 at the same %slice (za0v.q / za15v.q in the expected
; assembly). This test returns the FIRST (tile-0) read, so the tile-15
; result is unused; the expected assembly places it in z1.
; %tile is never referenced.
define <vscale x 2 x double> @test_readz_ver_z128_f64(i32 %tile, i32 %slice) #0 {
; CHECK-LABEL: test_readz_ver_z128_f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w1
; CHECK-NEXT:    movaz z0.q, za0v.q[w12, 0]
; CHECK-NEXT:    movaz z1.q, za15v.q[w12, 0]
; CHECK-NEXT:    ret
  %read.tile0 = call <vscale x 2 x double> @llvm.aarch64.sme.readz.q.vert.nxv2f64(i32 0, i32 %slice)
  %read.tile15 = call <vscale x 2 x double> @llvm.aarch64.sme.readz.q.vert.nxv2f64(i32 15, i32 %slice)
  ret <vscale x 2 x double> %read.tile0
}

; Declarations of the horizontal single-vector read-and-zero intrinsics, one
; per result element type. Both operands are i32; the tests in this file pass
; a constant as the first operand and the slice index as the second.
; The readz.q.* forms are the ones the z128 tests call.
declare <vscale x 16 x i8> @llvm.aarch64.sme.readz.horiz.nxv16i8(i32, i32)
declare <vscale x 8 x i16> @llvm.aarch64.sme.readz.horiz.nxv8i16(i32, i32)
declare <vscale x 4 x i32> @llvm.aarch64.sme.readz.horiz.nxv4i32(i32, i32)
declare <vscale x 2 x i64> @llvm.aarch64.sme.readz.horiz.nxv2i64(i32, i32)
declare <vscale x 8 x bfloat> @llvm.aarch64.sme.readz.horiz.nxv8bf16(i32, i32)
declare <vscale x 8 x half> @llvm.aarch64.sme.readz.horiz.nxv8f16(i32, i32)
declare <vscale x 4 x float> @llvm.aarch64.sme.readz.horiz.nxv4f32(i32, i32)
declare <vscale x 2 x double> @llvm.aarch64.sme.readz.horiz.nxv2f64(i32, i32)
declare <vscale x 16 x i8> @llvm.aarch64.sme.readz.q.horiz.nxv16i8(i32, i32)
declare <vscale x 8 x i16> @llvm.aarch64.sme.readz.q.horiz.nxv8i16(i32, i32)
declare <vscale x 4 x i32> @llvm.aarch64.sme.readz.q.horiz.nxv4i32(i32, i32)
declare <vscale x 2 x i64> @llvm.aarch64.sme.readz.q.horiz.nxv2i64(i32, i32)
declare <vscale x 8 x bfloat> @llvm.aarch64.sme.readz.q.horiz.nxv8bf16(i32, i32)
declare <vscale x 8 x half> @llvm.aarch64.sme.readz.q.horiz.nxv8f16(i32, i32)
declare <vscale x 4 x float> @llvm.aarch64.sme.readz.q.horiz.nxv4f32(i32, i32)
declare <vscale x 2 x double> @llvm.aarch64.sme.readz.q.horiz.nxv2f64(i32, i32)


; Declarations of the vertical single-vector read-and-zero intrinsics, one per
; result element type. Both operands are i32; the tests in this file pass a
; constant as the first operand and the slice index as the second.
; The readz.q.* forms are the ones the z128 tests call.
declare <vscale x 16 x i8> @llvm.aarch64.sme.readz.vert.nxv16i8(i32, i32)
declare <vscale x 8 x i16> @llvm.aarch64.sme.readz.vert.nxv8i16(i32, i32)
declare <vscale x 4 x i32> @llvm.aarch64.sme.readz.vert.nxv4i32(i32, i32)
declare <vscale x 2 x i64> @llvm.aarch64.sme.readz.vert.nxv2i64(i32, i32)
declare <vscale x 8 x bfloat> @llvm.aarch64.sme.readz.vert.nxv8bf16(i32, i32)
declare <vscale x 8 x half> @llvm.aarch64.sme.readz.vert.nxv8f16(i32, i32)
declare <vscale x 4 x float> @llvm.aarch64.sme.readz.vert.nxv4f32(i32, i32)
declare <vscale x 2 x double> @llvm.aarch64.sme.readz.vert.nxv2f64(i32, i32)
declare <vscale x 16 x i8> @llvm.aarch64.sme.readz.q.vert.nxv16i8(i32, i32)
declare <vscale x 8 x i16> @llvm.aarch64.sme.readz.q.vert.nxv8i16(i32, i32)
declare <vscale x 4 x i32> @llvm.aarch64.sme.readz.q.vert.nxv4i32(i32, i32)
declare <vscale x 2 x i64> @llvm.aarch64.sme.readz.q.vert.nxv2i64(i32, i32)
declare <vscale x 8 x bfloat> @llvm.aarch64.sme.readz.q.vert.nxv8bf16(i32, i32)
declare <vscale x 8 x half> @llvm.aarch64.sme.readz.q.vert.nxv8f16(i32, i32)
declare <vscale x 4 x float> @llvm.aarch64.sme.readz.q.vert.nxv4f32(i32, i32)
declare <vscale x 2 x double> @llvm.aarch64.sme.readz.q.vert.nxv2f64(i32, i32)

;MOVAZ (array to vector, Multi)


;;
; X2
;;

; Array-to-vector readz.x2 returning a pair of <vscale x 16 x i8>.
; Reads at %slice and again at %slice + 7; the second pair is returned.
; The expected assembly uses the .d / vgx2 form with immediates 0 and 7.
define {<vscale x 16 x i8>, <vscale x 16 x i8>} @test_readz_z8_i8_x2(i32 %slice) #0 {
; CHECK-LABEL: test_readz_z8_i8_x2:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    movaz { z0.d, z1.d }, za.d[w8, 0, vgx2]
; CHECK-NEXT:    movaz { z0.d, z1.d }, za.d[w8, 7, vgx2]
; CHECK-NEXT:    ret
  %grp.base = call {<vscale x 16 x i8>, <vscale x 16 x i8>} @llvm.aarch64.sme.readz.x2.nxv16i8(i32 %slice)
  %slice.off7 = add i32 %slice, 7
  %grp.off7 = call {<vscale x 16 x i8>, <vscale x 16 x i8>} @llvm.aarch64.sme.readz.x2.nxv16i8(i32 %slice.off7)
  ret {<vscale x 16 x i8>, <vscale x 16 x i8>} %grp.off7
}

; Array-to-vector readz.x2 returning a pair of <vscale x 8 x i16>.
; Reads at %slice and again at %slice + 7; the second pair is returned.
; The expected assembly uses the .d / vgx2 form with immediates 0 and 7.
define {<vscale x 8 x i16>, <vscale x 8 x i16>} @test_readz_z16_i16_x2(i32 %slice) #0 {
; CHECK-LABEL: test_readz_z16_i16_x2:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    movaz { z0.d, z1.d }, za.d[w8, 0, vgx2]
; CHECK-NEXT:    movaz { z0.d, z1.d }, za.d[w8, 7, vgx2]
; CHECK-NEXT:    ret
  %grp.base = call {<vscale x 8 x i16>, <vscale x 8 x i16>} @llvm.aarch64.sme.readz.x2.nxv8i16(i32 %slice)
  %slice.off7 = add i32 %slice, 7
  %grp.off7 = call {<vscale x 8 x i16>, <vscale x 8 x i16>} @llvm.aarch64.sme.readz.x2.nxv8i16(i32 %slice.off7)
  ret {<vscale x 8 x i16>, <vscale x 8 x i16>} %grp.off7
}

; Array-to-vector readz.x2 returning a pair of <vscale x 4 x i32>.
; Reads at %slice and again at %slice + 7; the second pair is returned.
; The expected assembly uses the .d / vgx2 form with immediates 0 and 7.
define {<vscale x 4 x i32>, <vscale x 4 x i32>} @test_readz_z32_i32_x2(i32 %slice) #0 {
; CHECK-LABEL: test_readz_z32_i32_x2:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    movaz { z0.d, z1.d }, za.d[w8, 0, vgx2]
; CHECK-NEXT:    movaz { z0.d, z1.d }, za.d[w8, 7, vgx2]
; CHECK-NEXT:    ret
  %grp.base = call {<vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.aarch64.sme.readz.x2.nxv4i32(i32 %slice)
  %slice.off7 = add i32 %slice, 7
  %grp.off7 = call {<vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.aarch64.sme.readz.x2.nxv4i32(i32 %slice.off7)
  ret {<vscale x 4 x i32>, <vscale x 4 x i32>} %grp.off7
}

; Array-to-vector readz.x2 returning a pair of <vscale x 2 x i64>.
; Reads at %slice and again at %slice + 7; the second pair is returned.
; The expected assembly uses the .d / vgx2 form with immediates 0 and 7.
define {<vscale x 2 x i64>, <vscale x 2 x i64>} @test_readz_z64_i64_x2(i32 %slice) #0 {
; CHECK-LABEL: test_readz_z64_i64_x2:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    movaz { z0.d, z1.d }, za.d[w8, 0, vgx2]
; CHECK-NEXT:    movaz { z0.d, z1.d }, za.d[w8, 7, vgx2]
; CHECK-NEXT:    ret
  %grp.base = call {<vscale x 2 x i64>, <vscale x 2 x i64>} @llvm.aarch64.sme.readz.x2.nxv2i64(i32 %slice)
  %slice.off7 = add i32 %slice, 7
  %grp.off7 = call {<vscale x 2 x i64>, <vscale x 2 x i64>} @llvm.aarch64.sme.readz.x2.nxv2i64(i32 %slice.off7)
  ret {<vscale x 2 x i64>, <vscale x 2 x i64>} %grp.off7
}

; Array-to-vector readz.x2 returning a pair of <vscale x 8 x bfloat>.
; Reads at %slice and again at %slice + 7; the second pair is returned.
; The expected assembly uses the .d / vgx2 form with immediates 0 and 7.
define {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>} @test_readz_z16_bf16_x2(i32 %slice) #0 {
; CHECK-LABEL: test_readz_z16_bf16_x2:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    movaz { z0.d, z1.d }, za.d[w8, 0, vgx2]
; CHECK-NEXT:    movaz { z0.d, z1.d }, za.d[w8, 7, vgx2]
; CHECK-NEXT:    ret
  %grp.base = call {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>} @llvm.aarch64.sme.readz.x2.nxv8bf16(i32 %slice)
  %slice.off7 = add i32 %slice, 7
  %grp.off7 = call {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>} @llvm.aarch64.sme.readz.x2.nxv8bf16(i32 %slice.off7)
  ret {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>} %grp.off7
}

; Array-to-vector readz.x2 returning a pair of <vscale x 8 x half>.
; Reads at %slice and again at %slice + 7; the second pair is returned.
; The expected assembly uses the .d / vgx2 form with immediates 0 and 7.
define {<vscale x 8 x half>, <vscale x 8 x half>} @test_readz_z16_f16_x2(i32 %slice) #0 {
; CHECK-LABEL: test_readz_z16_f16_x2:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    movaz { z0.d, z1.d }, za.d[w8, 0, vgx2]
; CHECK-NEXT:    movaz { z0.d, z1.d }, za.d[w8, 7, vgx2]
; CHECK-NEXT:    ret
  %grp.base = call {<vscale x 8 x half>, <vscale x 8 x half>} @llvm.aarch64.sme.readz.x2.nxv8f16(i32 %slice)
  %slice.off7 = add i32 %slice, 7
  %grp.off7 = call {<vscale x 8 x half>, <vscale x 8 x half>} @llvm.aarch64.sme.readz.x2.nxv8f16(i32 %slice.off7)
  ret {<vscale x 8 x half>, <vscale x 8 x half>} %grp.off7
}

; Array-to-vector readz.x2 returning a pair of <vscale x 4 x float>.
; Reads at %slice and again at %slice + 7; the second pair is returned.
; The expected assembly uses the .d / vgx2 form with immediates 0 and 7.
define {<vscale x 4 x float>, <vscale x 4 x float>} @test_readz_z32_f32_x2(i32 %slice) #0 {
; CHECK-LABEL: test_readz_z32_f32_x2:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    movaz { z0.d, z1.d }, za.d[w8, 0, vgx2]
; CHECK-NEXT:    movaz { z0.d, z1.d }, za.d[w8, 7, vgx2]
; CHECK-NEXT:    ret
  %grp.base = call {<vscale x 4 x float>, <vscale x 4 x float>} @llvm.aarch64.sme.readz.x2.nxv4f32(i32 %slice)
  %slice.off7 = add i32 %slice, 7
  %grp.off7 = call {<vscale x 4 x float>, <vscale x 4 x float>} @llvm.aarch64.sme.readz.x2.nxv4f32(i32 %slice.off7)
  ret {<vscale x 4 x float>, <vscale x 4 x float>} %grp.off7
}

; Array-to-vector readz.x2 returning a pair of <vscale x 2 x double>.
; Reads at %slice and again at %slice + 7; the second pair is returned.
; The expected assembly uses the .d / vgx2 form with immediates 0 and 7.
define {<vscale x 2 x double>, <vscale x 2 x double>} @test_readz_z64_f64_x2(i32 %slice) #0 {
; CHECK-LABEL: test_readz_z64_f64_x2:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    movaz { z0.d, z1.d }, za.d[w8, 0, vgx2]
; CHECK-NEXT:    movaz { z0.d, z1.d }, za.d[w8, 7, vgx2]
; CHECK-NEXT:    ret
  %grp.base = call {<vscale x 2 x double>, <vscale x 2 x double>} @llvm.aarch64.sme.readz.x2.nxv2f64(i32 %slice)
  %slice.off7 = add i32 %slice, 7
  %grp.off7 = call {<vscale x 2 x double>, <vscale x 2 x double>} @llvm.aarch64.sme.readz.x2.nxv2f64(i32 %slice.off7)
  ret {<vscale x 2 x double>, <vscale x 2 x double>} %grp.off7
}

;;
; X4
;;

; Array-to-vector readz.x4 returning four <vscale x 16 x i8> vectors.
; Reads at %slice and again at %slice + 7; the second group is returned.
; The expected assembly uses the .d / vgx4 form with immediates 0 and 7.
define {<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>} @test_readz_z8_i8_x4(i32 %slice) #0 {
; CHECK-LABEL: test_readz_z8_i8_x4:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    movaz { z0.d - z3.d }, za.d[w8, 0, vgx4]
; CHECK-NEXT:    movaz { z0.d - z3.d }, za.d[w8, 7, vgx4]
; CHECK-NEXT:    ret
  %grp.base = call {<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>} @llvm.aarch64.sme.readz.x4.nxv16i8(i32 %slice)
  %slice.off7 = add i32 %slice, 7
  %grp.off7 = call {<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>} @llvm.aarch64.sme.readz.x4.nxv16i8(i32 %slice.off7)
  ret {<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>} %grp.off7
}

; Array-to-vector readz.x4 returning four <vscale x 8 x i16> vectors.
; Reads at %slice and again at %slice + 7; the second group is returned.
; The expected assembly uses the .d / vgx4 form with immediates 0 and 7.
define {<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>} @test_readz_z16_i16_x4(i32 %slice) #0 {
; CHECK-LABEL: test_readz_z16_i16_x4:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    movaz { z0.d - z3.d }, za.d[w8, 0, vgx4]
; CHECK-NEXT:    movaz { z0.d - z3.d }, za.d[w8, 7, vgx4]
; CHECK-NEXT:    ret
  %grp.base = call {<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>} @llvm.aarch64.sme.readz.x4.nxv8i16(i32 %slice)
  %slice.off7 = add i32 %slice, 7
  %grp.off7 = call {<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>} @llvm.aarch64.sme.readz.x4.nxv8i16(i32 %slice.off7)
  ret {<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>} %grp.off7
}

; Array-to-vector readz.x4 returning four <vscale x 4 x i32> vectors.
; Reads at %slice and again at %slice + 7; the second group is returned.
; The expected assembly uses the .d / vgx4 form with immediates 0 and 7.
define {<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>} @test_readz_z32_i32_x4(i32 %slice) #0 {
; CHECK-LABEL: test_readz_z32_i32_x4:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    movaz { z0.d - z3.d }, za.d[w8, 0, vgx4]
; CHECK-NEXT:    movaz { z0.d - z3.d }, za.d[w8, 7, vgx4]
; CHECK-NEXT:    ret
  %grp.base = call {<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.aarch64.sme.readz.x4.nxv4i32(i32 %slice)
  %slice.off7 = add i32 %slice, 7
  %grp.off7 = call {<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.aarch64.sme.readz.x4.nxv4i32(i32 %slice.off7)
  ret {<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>} %grp.off7
}

; Array-to-vector readz.x4 returning four <vscale x 2 x i64> vectors.
; Reads at %slice and again at %slice + 7; the second group is returned.
; The expected assembly uses the .d / vgx4 form with immediates 0 and 7.
define {<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>} @test_readz_z64_i64_x4(i32 %slice) #0 {
; CHECK-LABEL: test_readz_z64_i64_x4:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    movaz { z0.d - z3.d }, za.d[w8, 0, vgx4]
; CHECK-NEXT:    movaz { z0.d - z3.d }, za.d[w8, 7, vgx4]
; CHECK-NEXT:    ret
  %grp.base = call {<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>} @llvm.aarch64.sme.readz.x4.nxv2i64(i32 %slice)
  %slice.off7 = add i32 %slice, 7
  %grp.off7 = call {<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>} @llvm.aarch64.sme.readz.x4.nxv2i64(i32 %slice.off7)
  ret {<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>} %grp.off7
}

; Array-to-vector readz.x4 returning four <vscale x 8 x bfloat> vectors.
; Reads at %slice and again at %slice + 7; the second group is returned.
; The expected assembly uses the .d / vgx4 form with immediates 0 and 7.
define {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>} @test_readz_z16_bf16_x4(i32 %slice) #0 {
; CHECK-LABEL: test_readz_z16_bf16_x4:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    movaz { z0.d - z3.d }, za.d[w8, 0, vgx4]
; CHECK-NEXT:    movaz { z0.d - z3.d }, za.d[w8, 7, vgx4]
; CHECK-NEXT:    ret
  %grp.base = call {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>} @llvm.aarch64.sme.readz.x4.nxv8bf16(i32 %slice)
  %slice.off7 = add i32 %slice, 7
  %grp.off7 = call {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>} @llvm.aarch64.sme.readz.x4.nxv8bf16(i32 %slice.off7)
  ret {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>} %grp.off7
}

; Array-to-vector readz.x4 returning four <vscale x 8 x half> vectors.
; Reads at %slice and again at %slice + 7; the second group is returned.
; The expected assembly uses the .d / vgx4 form with immediates 0 and 7.
define {<vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>} @test_readz_z16_f16_x4(i32 %slice) #0 {
; CHECK-LABEL: test_readz_z16_f16_x4:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    movaz { z0.d - z3.d }, za.d[w8, 0, vgx4]
; CHECK-NEXT:    movaz { z0.d - z3.d }, za.d[w8, 7, vgx4]
; CHECK-NEXT:    ret
  %grp.base = call {<vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>} @llvm.aarch64.sme.readz.x4.nxv8f16(i32 %slice)
  %slice.off7 = add i32 %slice, 7
  %grp.off7 = call {<vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>} @llvm.aarch64.sme.readz.x4.nxv8f16(i32 %slice.off7)
  ret {<vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>} %grp.off7
}

; Array-to-vector readz.x4 returning four <vscale x 4 x float> vectors.
; Reads at %slice and again at %slice + 7; the second group is returned.
; The expected assembly uses the .d / vgx4 form with immediates 0 and 7.
define {<vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>} @test_readz_z32_f32_x4(i32 %slice) #0 {
; CHECK-LABEL: test_readz_z32_f32_x4:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    movaz { z0.d - z3.d }, za.d[w8, 0, vgx4]
; CHECK-NEXT:    movaz { z0.d - z3.d }, za.d[w8, 7, vgx4]
; CHECK-NEXT:    ret
  %grp.base = call {<vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>} @llvm.aarch64.sme.readz.x4.nxv4f32(i32 %slice)
  %slice.off7 = add i32 %slice, 7
  %grp.off7 = call {<vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>} @llvm.aarch64.sme.readz.x4.nxv4f32(i32 %slice.off7)
  ret {<vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>} %grp.off7
}

; Array-to-vector readz.x4 returning four <vscale x 2 x double> vectors.
; Reads at %slice and again at %slice + 7; the second group is returned.
; The expected assembly uses the .d / vgx4 form with immediates 0 and 7.
define {<vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>} @test_readz_z64_f64_x4(i32 %slice) #0 {
; CHECK-LABEL: test_readz_z64_f64_x4:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    movaz { z0.d - z3.d }, za.d[w8, 0, vgx4]
; CHECK-NEXT:    movaz { z0.d - z3.d }, za.d[w8, 7, vgx4]
; CHECK-NEXT:    ret
  %grp.base = call {<vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>} @llvm.aarch64.sme.readz.x4.nxv2f64(i32 %slice)
  %slice.off7 = add i32 %slice, 7
  %grp.off7 = call {<vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>} @llvm.aarch64.sme.readz.x4.nxv2f64(i32 %slice.off7)
  ret {<vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>} %grp.off7
}

; Attribute group #0, referenced by every test function in this part of the
; file: sets the "target-features" string to "+sme2p1".
attributes #0 = { "target-features"="+sme2p1" }