; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
; RUN: llc -mtriple=riscv32 -mattr=+v,+zfh,+zvfh -verify-machineinstrs < %s | FileCheck %s -check-prefixes=CHECK,CHECK-NO-MISALIGN,RV32
; RUN: llc -mtriple=riscv64 -mattr=+v,+zfh,+zvfh -verify-machineinstrs < %s | FileCheck %s -check-prefixes=CHECK,CHECK-NO-MISALIGN,RV64
; RUN: llc -mtriple=riscv64 -mattr=+v,+zfh,+zvfh,+unaligned-vector-mem -verify-machineinstrs < %s | FileCheck %s -check-prefixes=CHECK,RV64,RV64-MISALIGN

; RUN: llc -mtriple=riscv64 -mattr=+f,+zfh,+zve64f,+zvl128b,+zvfh -verify-machineinstrs < %s | FileCheck %s -check-prefixes=CHECK,CHECK-NO-MISALIGN,ZVE64F

; The two loads are contiguous and should be folded into one wide load
define void @widen_2xv4i16(ptr %x, ptr %z) {
; CHECK-LABEL: widen_2xv4i16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
; CHECK-NEXT:    vle64.v v8, (a0)
; CHECK-NEXT:    vse64.v v8, (a1)
; CHECK-NEXT:    ret
  %a = load <4 x i16>, ptr %x
  %b.gep = getelementptr i8, ptr %x, i64 8
  %b = load <4 x i16>, ptr %b.gep
  %c = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  store <8 x i16> %c, ptr %z
  ret void
}

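; Three contiguous loads; these currently stay separate and are concatenated
; with slideups rather than being folded into one wide load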
define void @widen_3xv4i16(ptr %x, ptr %z) {
; CHECK-LABEL: widen_3xv4i16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
; CHECK-NEXT:    vle16.v v8, (a0)
; CHECK-NEXT:    addi a2, a0, 8
; CHECK-NEXT:    vle16.v v9, (a2)
; CHECK-NEXT:    addi a0, a0, 16
; CHECK-NEXT:    vle16.v v10, (a0)
; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
; CHECK-NEXT:    vslideup.vi v8, v9, 4
; CHECK-NEXT:    vsetivli zero, 12, e16, m2, ta, ma
; CHECK-NEXT:    vslideup.vi v8, v10, 8
; CHECK-NEXT:    vse16.v v8, (a1)
; CHECK-NEXT:    ret
  %a = load <4 x i16>, ptr %x
  %b.gep = getelementptr i8, ptr %x, i64 8
  %b = load <4 x i16>, ptr %b.gep
  %c.gep = getelementptr i8, ptr %b.gep, i64 8
  %c = load <4 x i16>, ptr %c.gep
  %d.0 = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %d.1 = shufflevector <4 x i16> %c, <4 x i16> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
  %d.2 = shufflevector <8 x i16> %d.0, <8 x i16> %d.1, <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
  store <12 x i16> %d.2, ptr %z
  ret void
}

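; Four contiguous loads are folded into a single wide load, coerced to e64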
define void @widen_4xv4i16(ptr %x, ptr %z) {
; CHECK-LABEL: widen_4xv4i16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
; CHECK-NEXT:    vle64.v v8, (a0)
; CHECK-NEXT:    vse64.v v8, (a1)
; CHECK-NEXT:    ret
  %a = load <4 x i16>, ptr %x
  %b.gep = getelementptr i8, ptr %x, i64 8
  %b = load <4 x i16>, ptr %b.gep
  %c.gep = getelementptr i8, ptr %b.gep, i64 8
  %c = load <4 x i16>, ptr %c.gep
  %d.gep = getelementptr i8, ptr %c.gep, i64 8
  %d = load <4 x i16>, ptr %d.gep
  %e.0 = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %e.1 = shufflevector <4 x i16> %c, <4 x i16> %d, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %e.2 = shufflevector <8 x i16> %e.0, <8 x i16> %e.1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  store <16 x i16> %e.2, ptr %z
  ret void
}

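; The loads are contiguous but only byte-aligned, so folding them into one
; wide load requires +unaligned-vector-mem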
define void @widen_4xv4i16_unaligned(ptr %x, ptr %z) {
; CHECK-NO-MISALIGN-LABEL: widen_4xv4i16_unaligned:
; CHECK-NO-MISALIGN:       # %bb.0:
; CHECK-NO-MISALIGN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
; CHECK-NO-MISALIGN-NEXT:    vle8.v v8, (a0)
; CHECK-NO-MISALIGN-NEXT:    addi a2, a0, 8
; CHECK-NO-MISALIGN-NEXT:    addi a3, a0, 16
; CHECK-NO-MISALIGN-NEXT:    vle8.v v10, (a3)
; CHECK-NO-MISALIGN-NEXT:    addi a0, a0, 24
; CHECK-NO-MISALIGN-NEXT:    vle8.v v9, (a0)
; CHECK-NO-MISALIGN-NEXT:    vle8.v v11, (a2)
; CHECK-NO-MISALIGN-NEXT:    vslideup.vi v10, v9, 4
; CHECK-NO-MISALIGN-NEXT:    vslideup.vi v8, v11, 4
; CHECK-NO-MISALIGN-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
; CHECK-NO-MISALIGN-NEXT:    vslideup.vi v8, v10, 8
; CHECK-NO-MISALIGN-NEXT:    vse16.v v8, (a1)
; CHECK-NO-MISALIGN-NEXT:    ret
;
; RV64-MISALIGN-LABEL: widen_4xv4i16_unaligned:
; RV64-MISALIGN:       # %bb.0:
; RV64-MISALIGN-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
; RV64-MISALIGN-NEXT:    vle64.v v8, (a0)
; RV64-MISALIGN-NEXT:    vse64.v v8, (a1)
; RV64-MISALIGN-NEXT:    ret
  %a = load <4 x i16>, ptr %x, align 1
  %b.gep = getelementptr i8, ptr %x, i64 8
  %b = load <4 x i16>, ptr %b.gep, align 1
  %c.gep = getelementptr i8, ptr %b.gep, i64 8
  %c = load <4 x i16>, ptr %c.gep, align 1
  %d.gep = getelementptr i8, ptr %c.gep, i64 8
  %d = load <4 x i16>, ptr %d.gep, align 1
  %e.0 = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %e.1 = shufflevector <4 x i16> %c, <4 x i16> %d, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %e.2 = shufflevector <8 x i16> %e.0, <8 x i16> %e.1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  store <16 x i16> %e.2, ptr %z
  ret void
}

; Should be a strided load - with type coercion to i64
define void @strided_constant(ptr %x, ptr %z) {
; CHECK-LABEL: strided_constant:
; CHECK:       # %bb.0:
; CHECK-NEXT:    li a2, 16
; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
; CHECK-NEXT:    vlse64.v v8, (a0), a2
; CHECK-NEXT:    vse64.v v8, (a1)
; CHECK-NEXT:    ret
  %a = load <4 x i16>, ptr %x
  %b.gep = getelementptr i8, ptr %x, i64 16
  %b = load <4 x i16>, ptr %b.gep
  %c = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  store <8 x i16> %c, ptr %z
  ret void
}

; Should be a strided load
define void @strided_constant_64(ptr %x, ptr %z) {
; CHECK-LABEL: strided_constant_64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    li a2, 64
; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
; CHECK-NEXT:    vlse64.v v8, (a0), a2
; CHECK-NEXT:    vse64.v v8, (a1)
; CHECK-NEXT:    ret
  %a = load <4 x i16>, ptr %x
  %b.gep = getelementptr i8, ptr %x, i64 64
  %b = load <4 x i16>, ptr %b.gep
  %c = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  store <8 x i16> %c, ptr %z
  ret void
}

; Each <4 x i32> load is too large to be coerced to a single strided load
; element, so this can't be combined into one strided load
define void @strided_constant_v4i32(ptr %x, ptr %z) {
; CHECK-LABEL: strided_constant_v4i32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    addi a0, a0, 32
; CHECK-NEXT:    vle32.v v10, (a0)
; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
; CHECK-NEXT:    vslideup.vi v8, v10, 4
; CHECK-NEXT:    vse32.v v8, (a1)
; CHECK-NEXT:    ret
  %a = load <4 x i32>, ptr %x
  %b.gep = getelementptr i8, ptr %x, i64 32
  %b = load <4 x i32>, ptr %b.gep
  %c = shufflevector <4 x i32> %a, <4 x i32> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  store <8 x i32> %c, ptr %z
  ret void
}

; Interestingly, this could be a stride 0 load, but codegen currently reuses
; the single load and a slideup instead
define void @strided_constant_0(ptr %x, ptr %z) {
; CHECK-LABEL: strided_constant_0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
; CHECK-NEXT:    vle16.v v8, (a0)
; CHECK-NEXT:    vmv1r.v v9, v8
; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
; CHECK-NEXT:    vslideup.vi v9, v8, 4
; CHECK-NEXT:    vse16.v v9, (a1)
; CHECK-NEXT:    ret
  %a = load <4 x i16>, ptr %x
  %b = load <4 x i16>, ptr %x
  %c = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  store <8 x i16> %c, ptr %z
  ret void
}

; Stride isn't consistent, so shouldn't be combined
define void @strided_constant_mismatch_4xv4i16(ptr %x, ptr %z) {
; CHECK-LABEL: strided_constant_mismatch_4xv4i16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
; CHECK-NEXT:    vle16.v v8, (a0)
; CHECK-NEXT:    addi a2, a0, 2
; CHECK-NEXT:    addi a3, a0, 6
; CHECK-NEXT:    vle16.v v10, (a3)
; CHECK-NEXT:    addi a0, a0, 8
; CHECK-NEXT:    vle16.v v9, (a0)
; CHECK-NEXT:    vle16.v v11, (a2)
; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
; CHECK-NEXT:    vslideup.vi v10, v9, 4
; CHECK-NEXT:    vslideup.vi v8, v11, 4
; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
; CHECK-NEXT:    vslideup.vi v8, v10, 8
; CHECK-NEXT:    vse16.v v8, (a1)
; CHECK-NEXT:    ret
  %a = load <4 x i16>, ptr %x
  %b.gep = getelementptr i8, ptr %x, i64 2
  %b = load <4 x i16>, ptr %b.gep
  %c.gep = getelementptr i8, ptr %b.gep, i64 4
  %c = load <4 x i16>, ptr %c.gep
  %d.gep = getelementptr i8, ptr %c.gep, i64 2
  %d = load <4 x i16>, ptr %d.gep
  %e.0 = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %e.1 = shufflevector <4 x i16> %c, <4 x i16> %d, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %e.2 = shufflevector <8 x i16> %e.0, <8 x i16> %e.1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  store <16 x i16> %e.2, ptr %z
  ret void
}

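; Two loads separated by a runtime stride should become a single strided load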
define void @strided_runtime(ptr %x, ptr %z, i64 %s) {
; CHECK-LABEL: strided_runtime:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
; CHECK-NEXT:    vlse64.v v8, (a0), a2
; CHECK-NEXT:    vse64.v v8, (a1)
; CHECK-NEXT:    ret
  %a = load <4 x i16>, ptr %x
  %b.gep = getelementptr i8, ptr %x, i64 %s
  %b = load <4 x i16>, ptr %b.gep
  %c = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  store <8 x i16> %c, ptr %z
  ret void
}

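; Likewise with four loads sharing a single runtime stride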
define void @strided_runtime_4xv4i16(ptr %x, ptr %z, i64 %s) {
; CHECK-LABEL: strided_runtime_4xv4i16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
; CHECK-NEXT:    vlse64.v v8, (a0), a2
; CHECK-NEXT:    vse64.v v8, (a1)
; CHECK-NEXT:    ret
  %a = load <4 x i16>, ptr %x
  %b.gep = getelementptr i8, ptr %x, i64 %s
  %b = load <4 x i16>, ptr %b.gep
  %c.gep = getelementptr i8, ptr %b.gep, i64 %s
  %c = load <4 x i16>, ptr %c.gep
  %d.gep = getelementptr i8, ptr %c.gep, i64 %s
  %d = load <4 x i16>, ptr %d.gep
  %e.0 = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %e.1 = shufflevector <4 x i16> %c, <4 x i16> %d, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %e.2 = shufflevector <8 x i16> %e.0, <8 x i16> %e.1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  store <16 x i16> %e.2, ptr %z
  ret void
}

; Stride isn't consistent, so shouldn't be combined
define void @strided_runtime_mismatch_4xv4i16(ptr %x, ptr %z, i64 %s, i64 %t) {
; RV32-LABEL: strided_runtime_mismatch_4xv4i16:
; RV32:       # %bb.0:
; RV32-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
; RV32-NEXT:    vle16.v v8, (a0)
; RV32-NEXT:    add a0, a0, a2
; RV32-NEXT:    add a4, a0, a4
; RV32-NEXT:    vle16.v v10, (a4)
; RV32-NEXT:    add a2, a4, a2
; RV32-NEXT:    vle16.v v9, (a2)
; RV32-NEXT:    vle16.v v11, (a0)
; RV32-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
; RV32-NEXT:    vslideup.vi v10, v9, 4
; RV32-NEXT:    vslideup.vi v8, v11, 4
; RV32-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
; RV32-NEXT:    vslideup.vi v8, v10, 8
; RV32-NEXT:    vse16.v v8, (a1)
; RV32-NEXT:    ret
;
; RV64-LABEL: strided_runtime_mismatch_4xv4i16:
; RV64:       # %bb.0:
; RV64-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
; RV64-NEXT:    vle16.v v8, (a0)
; RV64-NEXT:    add a0, a0, a2
; RV64-NEXT:    add a3, a0, a3
; RV64-NEXT:    vle16.v v10, (a3)
; RV64-NEXT:    add a2, a3, a2
; RV64-NEXT:    vle16.v v9, (a2)
; RV64-NEXT:    vle16.v v11, (a0)
; RV64-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
; RV64-NEXT:    vslideup.vi v10, v9, 4
; RV64-NEXT:    vslideup.vi v8, v11, 4
; RV64-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
; RV64-NEXT:    vslideup.vi v8, v10, 8
; RV64-NEXT:    vse16.v v8, (a1)
; RV64-NEXT:    ret
;
; ZVE64F-LABEL: strided_runtime_mismatch_4xv4i16:
; ZVE64F:       # %bb.0:
; ZVE64F-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
; ZVE64F-NEXT:    vle16.v v8, (a0)
; ZVE64F-NEXT:    add a0, a0, a2
; ZVE64F-NEXT:    add a3, a0, a3
; ZVE64F-NEXT:    vle16.v v10, (a3)
; ZVE64F-NEXT:    add a2, a3, a2
; ZVE64F-NEXT:    vle16.v v9, (a2)
; ZVE64F-NEXT:    vle16.v v11, (a0)
; ZVE64F-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
; ZVE64F-NEXT:    vslideup.vi v10, v9, 4
; ZVE64F-NEXT:    vslideup.vi v8, v11, 4
; ZVE64F-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
; ZVE64F-NEXT:    vslideup.vi v8, v10, 8
; ZVE64F-NEXT:    vse16.v v8, (a1)
; ZVE64F-NEXT:    ret
  %a = load <4 x i16>, ptr %x
  %b.gep = getelementptr i8, ptr %x, i64 %s
  %b = load <4 x i16>, ptr %b.gep
  %c.gep = getelementptr i8, ptr %b.gep, i64 %t
  %c = load <4 x i16>, ptr %c.gep
  %d.gep = getelementptr i8, ptr %c.gep, i64 %s
  %d = load <4 x i16>, ptr %d.gep
  %e.0 = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %e.1 = shufflevector <4 x i16> %c, <4 x i16> %d, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %e.2 = shufflevector <8 x i16> %e.0, <8 x i16> %e.1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  store <16 x i16> %e.2, ptr %z
  ret void
}

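; The combine also applies to half elements; the loads are still coerced to i64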
define void @strided_runtime_4xv4f16(ptr %x, ptr %z, i64 %s) {
; CHECK-LABEL: strided_runtime_4xv4f16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
; CHECK-NEXT:    vlse64.v v8, (a0), a2
; CHECK-NEXT:    vse64.v v8, (a1)
; CHECK-NEXT:    ret
  %a = load <4 x half>, ptr %x
  %b.gep = getelementptr i8, ptr %x, i64 %s
  %b = load <4 x half>, ptr %b.gep
  %c.gep = getelementptr i8, ptr %b.gep, i64 %s
  %c = load <4 x half>, ptr %c.gep
  %d.gep = getelementptr i8, ptr %c.gep, i64 %s
  %d = load <4 x half>, ptr %d.gep
  %e.0 = shufflevector <4 x half> %a, <4 x half> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %e.1 = shufflevector <4 x half> %c, <4 x half> %d, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %e.2 = shufflevector <8 x half> %e.0, <8 x half> %e.1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  store <16 x half> %e.2, ptr %z
  ret void
}

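; Likewise for float elements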
define void @strided_runtime_4xv2f32(ptr %x, ptr %z, i64 %s) {
; CHECK-LABEL: strided_runtime_4xv2f32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
; CHECK-NEXT:    vlse64.v v8, (a0), a2
; CHECK-NEXT:    vse64.v v8, (a1)
; CHECK-NEXT:    ret
  %a = load <2 x float>, ptr %x
  %b.gep = getelementptr i8, ptr %x, i64 %s
  %b = load <2 x float>, ptr %b.gep
  %c.gep = getelementptr i8, ptr %b.gep, i64 %s
  %c = load <2 x float>, ptr %c.gep
  %d.gep = getelementptr i8, ptr %c.gep, i64 %s
  %d = load <2 x float>, ptr %d.gep
  %e.0 = shufflevector <2 x float> %a, <2 x float> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %e.1 = shufflevector <2 x float> %c, <2 x float> %d, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %e.2 = shufflevector <4 x float> %e.0, <4 x float> %e.1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  store <8 x float> %e.2, ptr %z
  ret void
}

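; Byte-aligned loads can only be combined into a strided load when misaligned
; vector accesses are allowed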
define void @strided_unaligned(ptr %x, ptr %z, i64 %s) {
; CHECK-NO-MISALIGN-LABEL: strided_unaligned:
; CHECK-NO-MISALIGN:       # %bb.0:
; CHECK-NO-MISALIGN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
; CHECK-NO-MISALIGN-NEXT:    vle8.v v8, (a0)
; CHECK-NO-MISALIGN-NEXT:    add a0, a0, a2
; CHECK-NO-MISALIGN-NEXT:    vle8.v v9, (a0)
; CHECK-NO-MISALIGN-NEXT:    vslideup.vi v8, v9, 4
; CHECK-NO-MISALIGN-NEXT:    vse16.v v8, (a1)
; CHECK-NO-MISALIGN-NEXT:    ret
;
; RV64-MISALIGN-LABEL: strided_unaligned:
; RV64-MISALIGN:       # %bb.0:
; RV64-MISALIGN-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
; RV64-MISALIGN-NEXT:    vlse64.v v8, (a0), a2
; RV64-MISALIGN-NEXT:    vse64.v v8, (a1)
; RV64-MISALIGN-NEXT:    ret
  %a = load <4 x i16>, ptr %x, align 1
  %b.gep = getelementptr i8, ptr %x, i64 %s
  %b = load <4 x i16>, ptr %b.gep, align 1
  %c = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  store <8 x i16> %c, ptr %z
  ret void
}

; Should use the most restrictive common alignment
define void @strided_mismatched_alignments(ptr %x, ptr %z, i64 %s) {
; CHECK-LABEL: strided_mismatched_alignments:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
; CHECK-NEXT:    vlse64.v v8, (a0), a2
; CHECK-NEXT:    vse64.v v8, (a1)
; CHECK-NEXT:    ret
  %a = load <4 x i16>, ptr %x, align 8
  %b.gep = getelementptr i8, ptr %x, i64 %s
  %b = load <4 x i16>, ptr %b.gep, align 16
  %c = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  store <8 x i16> %c, ptr %z
  ret void
}

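; 8-byte alignment is sufficient for the coerced e64 strided load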
define void @strided_ok_alignments_8(ptr %x, ptr %z, i64 %s) {
; CHECK-LABEL: strided_ok_alignments_8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
; CHECK-NEXT:    vlse64.v v8, (a0), a2
; CHECK-NEXT:    vse64.v v8, (a1)
; CHECK-NEXT:    ret
  %a = load <4 x i16>, ptr %x, align 8
  %b.gep = getelementptr i8, ptr %x, i64 %s
  %b = load <4 x i16>, ptr %b.gep, align 8
  %c = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  store <8 x i16> %c, ptr %z
  ret void
}

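; As is 16-byte alignment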
define void @strided_ok_alignments_16(ptr %x, ptr %z, i64 %s) {
; CHECK-LABEL: strided_ok_alignments_16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
; CHECK-NEXT:    vlse64.v v8, (a0), a2
; CHECK-NEXT:    vse64.v v8, (a1)
; CHECK-NEXT:    ret
  %a = load <4 x i16>, ptr %x, align 16
  %b.gep = getelementptr i8, ptr %x, i64 %s
  %b = load <4 x i16>, ptr %b.gep, align 16
  %c = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  store <8 x i16> %c, ptr %z
  ret void
}

; Shouldn't be combined because one of the loads is not simple
define void @strided_non_simple_load(ptr %x, ptr %z, i64 %s) {
; CHECK-LABEL: strided_non_simple_load:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
; CHECK-NEXT:    vle16.v v8, (a0)
; CHECK-NEXT:    add a0, a0, a2
; CHECK-NEXT:    vle16.v v9, (a0)
; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
; CHECK-NEXT:    vslideup.vi v8, v9, 4
; CHECK-NEXT:    vse16.v v8, (a1)
; CHECK-NEXT:    ret
  %a = load <4 x i16>, ptr %x
  %b.gep = getelementptr i8, ptr %x, i64 %s
  %b = load volatile <4 x i16>, ptr %b.gep
  %c = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  store <8 x i16> %c, ptr %z
  ret void
}

; Shouldn't be combined because one of the operands is not a load
define void @strided_non_load(ptr %x, ptr %z, <4 x i16> %b) {
; CHECK-LABEL: strided_non_load:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
; CHECK-NEXT:    vle16.v v9, (a0)
; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
; CHECK-NEXT:    vslideup.vi v9, v8, 4
; CHECK-NEXT:    vse16.v v9, (a1)
; CHECK-NEXT:    ret
  %a = load <4 x i16>, ptr %x
  %c = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  store <8 x i16> %c, ptr %z
  ret void
}

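; Should be a strided load with a constant negative stride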
define void @strided_constant_neg_4xv2f32(ptr %x, ptr %z, i64 %s) {
; CHECK-LABEL: strided_constant_neg_4xv2f32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    li a2, -64
; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
; CHECK-NEXT:    vlse64.v v8, (a0), a2
; CHECK-NEXT:    vse64.v v8, (a1)
; CHECK-NEXT:    ret
  %a = load <2 x float>, ptr %x
  %b.gep = getelementptr i8, ptr %x, i64 -64
  %b = load <2 x float>, ptr %b.gep
  %c.gep = getelementptr i8, ptr %b.gep, i64 -64
  %c = load <2 x float>, ptr %c.gep
  %d.gep = getelementptr i8, ptr %c.gep, i64 -64
  %d = load <2 x float>, ptr %d.gep
  %e.0 = shufflevector <2 x float> %a, <2 x float> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %e.1 = shufflevector <2 x float> %c, <2 x float> %d, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %e.2 = shufflevector <4 x float> %e.0, <4 x float> %e.1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  store <8 x float> %e.2, ptr %z
  ret void
}

; Loads at ascending constant offsets used in reverse order become a strided
; load with a negative stride
define void @reverse_strided_constant_pos_4xv2f32(ptr %x, ptr %z, i64 %s) {
; CHECK-LABEL: reverse_strided_constant_pos_4xv2f32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    addi a0, a0, 192
; CHECK-NEXT:    li a2, -64
; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
; CHECK-NEXT:    vlse64.v v8, (a0), a2
; CHECK-NEXT:    vse64.v v8, (a1)
; CHECK-NEXT:    ret
  %x.1 = getelementptr i8, ptr %x, i64 64
  %x.2 = getelementptr i8, ptr %x.1, i64 64
  %x.3 = getelementptr i8, ptr %x.2, i64 64
  %a = load <2 x float>, ptr %x.3
  %b = load <2 x float>, ptr %x.2
  %c = load <2 x float>, ptr %x.1
  %d = load <2 x float>, ptr %x
  %e.0 = shufflevector <2 x float> %a, <2 x float> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %e.1 = shufflevector <2 x float> %c, <2 x float> %d, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %e.2 = shufflevector <4 x float> %e.0, <4 x float> %e.1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  store <8 x float> %e.2, ptr %z
  ret void
}

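; Reversed loads at descending constant offsets combine into a positive-stride
; load from the lowest address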
define void @reverse_strided_constant_neg_4xv2f32(ptr %x, ptr %z, i64 %s) {
; CHECK-LABEL: reverse_strided_constant_neg_4xv2f32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    addi a0, a0, -192
; CHECK-NEXT:    li a2, 64
; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
; CHECK-NEXT:    vlse64.v v8, (a0), a2
; CHECK-NEXT:    vse64.v v8, (a1)
; CHECK-NEXT:    ret
  %x.1 = getelementptr i8, ptr %x, i64 -64
  %x.2 = getelementptr i8, ptr %x.1, i64 -64
  %x.3 = getelementptr i8, ptr %x.2, i64 -64
  %a = load <2 x float>, ptr %x.3
  %b = load <2 x float>, ptr %x.2
  %c = load <2 x float>, ptr %x.1
  %d = load <2 x float>, ptr %x
  %e.0 = shufflevector <2 x float> %a, <2 x float> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %e.1 = shufflevector <2 x float> %c, <2 x float> %d, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %e.2 = shufflevector <4 x float> %e.0, <4 x float> %e.1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  store <8 x float> %e.2, ptr %z
  ret void
}

; Likewise with a runtime stride: the stride is negated and the base is
; adjusted to point at the last load
define void @reverse_strided_runtime_4xv2f32(ptr %x, ptr %z, i64 %s) {
; CHECK-LABEL: reverse_strided_runtime_4xv2f32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    add a0, a0, a2
; CHECK-NEXT:    add a3, a2, a2
; CHECK-NEXT:    add a0, a0, a3
; CHECK-NEXT:    neg a2, a2
; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
; CHECK-NEXT:    vlse64.v v8, (a0), a2
; CHECK-NEXT:    vse64.v v8, (a1)
; CHECK-NEXT:    ret
  %x.1 = getelementptr i8, ptr %x, i64 %s
  %x.2 = getelementptr i8, ptr %x.1, i64 %s
  %x.3 = getelementptr i8, ptr %x.2, i64 %s
  %a = load <2 x float>, ptr %x.3
  %b = load <2 x float>, ptr %x.2
  %c = load <2 x float>, ptr %x.1
  %d = load <2 x float>, ptr %x
  %e.0 = shufflevector <2 x float> %a, <2 x float> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %e.1 = shufflevector <2 x float> %c, <2 x float> %d, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %e.2 = shufflevector <4 x float> %e.0, <4 x float> %e.1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  store <8 x float> %e.2, ptr %z
  ret void
}

; The middle end sometimes produces this pattern of shuffles, where the
; intermediate shuffles are the full result vector size padded with poison
; elements.
define <16 x i8> @widen_4xv4i8_immediate_expand(ptr %p, i64 %s) {
; CHECK-LABEL: widen_4xv4i8_immediate_expand:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT:    vlse32.v v8, (a0), a1
; CHECK-NEXT:    ret
  %a = load <4 x i8>, ptr %p
  %b.ptr = getelementptr i8, ptr %p, i64 %s
  %b = load <4 x i8>, ptr %b.ptr
  %c.ptr = getelementptr i8, ptr %b.ptr, i64 %s
  %c = load <4 x i8>, ptr %c.ptr
  %d.ptr = getelementptr i8, ptr %c.ptr, i64 %s
  %d = load <4 x i8>, ptr %d.ptr

  %ab = shufflevector <4 x i8> %a, <4 x i8> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
  %cx = shufflevector <4 x i8> %c, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
  %dx = shufflevector <4 x i8> %d, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
  %abcx = shufflevector <16 x i8> %ab, <16 x i8> %cx, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison>
  %abcd = shufflevector <16 x i8> %abcx, <16 x i8> %dx, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
  ret <16 x i8> %abcd
}