llvm/llvm/test/CodeGen/AArch64/shufflevector.ll

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc -mtriple=aarch64-none-linux-gnu %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD
; RUN: llc -mtriple=aarch64-none-linux-gnu -global-isel -global-isel-abort=2 %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI

; ===== Legal Vector Types =====

define <8 x i8> @shufflevector_v8i8(<8 x i8> %a, <8 x i8> %b) {
; CHECK-SD-LABEL: shufflevector_v8i8:
; CHECK-SD:       // %bb.0:
; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
; CHECK-SD-NEXT:    // kill: def $d1 killed $d1 def $q1
; CHECK-SD-NEXT:    adrp x8, .LCPI0_0
; CHECK-SD-NEXT:    mov v0.d[1], v1.d[0]
; CHECK-SD-NEXT:    ldr d1, [x8, :lo12:.LCPI0_0]
; CHECK-SD-NEXT:    tbl v0.8b, { v0.16b }, v1.8b
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: shufflevector_v8i8:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 def $q1
; CHECK-GI-NEXT:    adrp x8, .LCPI0_0
; CHECK-GI-NEXT:    mov v0.d[1], v1.d[0]
; CHECK-GI-NEXT:    ldr d1, [x8, :lo12:.LCPI0_0]
; CHECK-GI-NEXT:    tbl v0.16b, { v0.16b }, v1.16b
; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
; CHECK-GI-NEXT:    ret
    %c = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 8, i32 10, i32 12, i32 15>
    ret <8 x i8> %c
}

define <16 x i8> @shufflevector_v16i8(<16 x i8> %a, <16 x i8> %b) {
; CHECK-SD-LABEL: shufflevector_v16i8:
; CHECK-SD:       // %bb.0:
; CHECK-SD-NEXT:    adrp x8, .LCPI1_0
; CHECK-SD-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
; CHECK-SD-NEXT:    ldr q2, [x8, :lo12:.LCPI1_0]
; CHECK-SD-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
; CHECK-SD-NEXT:    tbl v0.16b, { v0.16b, v1.16b }, v2.16b
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: shufflevector_v16i8:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    adrp x8, .LCPI1_0
; CHECK-GI-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
; CHECK-GI-NEXT:    ldr q2, [x8, :lo12:.LCPI1_0]
; CHECK-GI-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
; CHECK-GI-NEXT:    tbl v0.16b, { v0.16b, v1.16b }, v2.16b
; CHECK-GI-NEXT:    ret
    %c = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 8, i32 10, i32 12, i32 15, i32 2, i32 4, i32 6, i32 8, i32 25, i32 30, i32 31, i32 31>
    ret <16 x i8> %c
}

define <4 x i16> @shufflevector_v4i16(<4 x i16> %a, <4 x i16> %b) {
; CHECK-LABEL: shufflevector_v4i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    uzp2 v0.4h, v0.4h, v1.4h
; CHECK-NEXT:    ret
    %c = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
    ret <4 x i16> %c
}

define <8 x i16> @shufflevector_v8i16(<8 x i16> %a, <8 x i16> %b) {
; CHECK-SD-LABEL: shufflevector_v8i16:
; CHECK-SD:       // %bb.0:
; CHECK-SD-NEXT:    adrp x8, .LCPI3_0
; CHECK-SD-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
; CHECK-SD-NEXT:    ldr q2, [x8, :lo12:.LCPI3_0]
; CHECK-SD-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
; CHECK-SD-NEXT:    tbl v0.16b, { v0.16b, v1.16b }, v2.16b
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: shufflevector_v8i16:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    adrp x8, .LCPI3_0
; CHECK-GI-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
; CHECK-GI-NEXT:    ldr q2, [x8, :lo12:.LCPI3_0]
; CHECK-GI-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
; CHECK-GI-NEXT:    tbl v0.16b, { v0.16b, v1.16b }, v2.16b
; CHECK-GI-NEXT:    ret
    %c = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 8, i32 10, i32 12, i32 15>
    ret <8 x i16> %c
}

define <2 x i32> @shufflevector_v2i32(<2 x i32> %a, <2 x i32> %b) {
; CHECK-LABEL: shufflevector_v2i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    zip2 v0.2s, v0.2s, v1.2s
; CHECK-NEXT:    ret
    %c = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
    ret <2 x i32> %c
}

define <4 x i32> @shufflevector_v4i32(<4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: shufflevector_v4i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    uzp2 v0.4s, v0.4s, v1.4s
; CHECK-NEXT:    ret
    %c = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
    ret <4 x i32> %c
}

define <2 x i64> @shufflevector_v2i64(<2 x i64> %a, <2 x i64> %b) {
; CHECK-LABEL: shufflevector_v2i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    zip2 v0.2d, v0.2d, v1.2d
; CHECK-NEXT:    ret
    %c = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 3>
    ret <2 x i64> %c
}

; ===== Legal Vector Types with Zero Masks =====

define <8 x i8> @shufflevector_v8i8_zeroes(<8 x i8> %a, <8 x i8> %b) {
; CHECK-LABEL: shufflevector_v8i8_zeroes:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT:    dup v0.8b, v0.b[0]
; CHECK-NEXT:    ret
    %c = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
    ret <8 x i8> %c
}

define <16 x i8> @shufflevector_v16i8_zeroes(<16 x i8> %a, <16 x i8> %b) {
; CHECK-LABEL: shufflevector_v16i8_zeroes:
; CHECK:       // %bb.0:
; CHECK-NEXT:    dup v0.16b, v0.b[0]
; CHECK-NEXT:    ret
    %c = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
    ret <16 x i8> %c
}

define <4 x i16> @shufflevector_v4i16_zeroes(<4 x i16> %a, <4 x i16> %b) {
; CHECK-LABEL: shufflevector_v4i16_zeroes:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT:    dup v0.4h, v0.h[0]
; CHECK-NEXT:    ret
    %c = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
    ret <4 x i16> %c
}

define <8 x i16> @shufflevector_v8i16_zeroes(<8 x i16> %a, <8 x i16> %b) {
; CHECK-LABEL: shufflevector_v8i16_zeroes:
; CHECK:       // %bb.0:
; CHECK-NEXT:    dup v0.8h, v0.h[0]
; CHECK-NEXT:    ret
    %c = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
    ret <8 x i16> %c
}

define <2 x i32> @shufflevector_v2i32_zeroes(<2 x i32> %a, <2 x i32> %b) {
; CHECK-LABEL: shufflevector_v2i32_zeroes:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT:    dup v0.2s, v0.s[0]
; CHECK-NEXT:    ret
    %c = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 0>
    ret <2 x i32> %c
}

define <4 x i32> @shufflevector_v4i32_zeroes(<4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: shufflevector_v4i32_zeroes:
; CHECK:       // %bb.0:
; CHECK-NEXT:    dup v0.4s, v0.s[0]
; CHECK-NEXT:    ret
    %c = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
    ret <4 x i32> %c
}

define <2 x i64> @shufflevector_v2i64_zeroes(<2 x i64> %a, <2 x i64> %b) {
; CHECK-LABEL: shufflevector_v2i64_zeroes:
; CHECK:       // %bb.0:
; CHECK-NEXT:    dup v0.2d, v0.d[0]
; CHECK-NEXT:    ret
    %c = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 0>
    ret <2 x i64> %c
}

; ===== Smaller/Larger Width Vectors with Legal Element Sizes =====

define <2 x i1> @shufflevector_v2i1(<2 x i1> %a, <2 x i1> %b){
; CHECK-SD-LABEL: shufflevector_v2i1:
; CHECK-SD:       // %bb.0:
; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
; CHECK-SD-NEXT:    // kill: def $d1 killed $d1 def $q1
; CHECK-SD-NEXT:    mov v0.s[1], v1.s[1]
; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 killed $q0
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: shufflevector_v2i1:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 def $q1
; CHECK-GI-NEXT:    mov w8, v1.s[1]
; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
; CHECK-GI-NEXT:    mov w9, v0.s[1]
; CHECK-GI-NEXT:    mov v1.b[1], w8
; CHECK-GI-NEXT:    mov v0.b[1], w9
; CHECK-GI-NEXT:    mov b1, v1.b[1]
; CHECK-GI-NEXT:    mov v0.b[1], v1.b[0]
; CHECK-GI-NEXT:    umov w8, v0.b[0]
; CHECK-GI-NEXT:    umov w9, v0.b[1]
; CHECK-GI-NEXT:    mov v0.s[0], w8
; CHECK-GI-NEXT:    mov v0.s[1], w9
; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
; CHECK-GI-NEXT:    ret
    %c = shufflevector <2 x i1> %a, <2 x i1> %b, <2 x i32> <i32 0, i32 3>
    ret <2 x i1> %c
}

define i32 @shufflevector_v4i8(<4 x i8> %a, <4 x i8> %b){
; CHECK-SD-LABEL: shufflevector_v4i8:
; CHECK-SD:       // %bb.0:
; CHECK-SD-NEXT:    sub sp, sp, #16
; CHECK-SD-NEXT:    .cfi_def_cfa_offset 16
; CHECK-SD-NEXT:    ext v0.8b, v1.8b, v0.8b, #6
; CHECK-SD-NEXT:    zip1 v1.4h, v1.4h, v0.4h
; CHECK-SD-NEXT:    ext v0.8b, v0.8b, v1.8b, #4
; CHECK-SD-NEXT:    uzp1 v0.8b, v0.8b, v0.8b
; CHECK-SD-NEXT:    fmov w0, s0
; CHECK-SD-NEXT:    add sp, sp, #16
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: shufflevector_v4i8:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    uzp1 v0.8b, v0.8b, v0.8b
; CHECK-GI-NEXT:    adrp x8, .LCPI15_0
; CHECK-GI-NEXT:    uzp1 v1.8b, v1.8b, v0.8b
; CHECK-GI-NEXT:    mov v0.d[1], v1.d[0]
; CHECK-GI-NEXT:    ldr d1, [x8, :lo12:.LCPI15_0]
; CHECK-GI-NEXT:    tbl v0.16b, { v0.16b }, v1.16b
; CHECK-GI-NEXT:    fmov w0, s0
; CHECK-GI-NEXT:    ret
    %c = shufflevector <4 x i8> %a, <4 x i8> %b, <4 x i32> <i32 1, i32 2, i32 4, i32 7>
    %d = bitcast <4 x i8> %c to i32
    ret i32 %d
}

define <32 x i8> @shufflevector_v32i8(<32 x i8> %a, <32 x i8> %b){
; CHECK-SD-LABEL: shufflevector_v32i8:
; CHECK-SD:       // %bb.0:
; CHECK-SD-NEXT:    // kill: def $q2 killed $q2 def $q1_q2
; CHECK-SD-NEXT:    adrp x8, .LCPI16_0
; CHECK-SD-NEXT:    adrp x9, .LCPI16_1
; CHECK-SD-NEXT:    mov v1.16b, v0.16b
; CHECK-SD-NEXT:    ldr q3, [x8, :lo12:.LCPI16_0]
; CHECK-SD-NEXT:    ldr q4, [x9, :lo12:.LCPI16_1]
; CHECK-SD-NEXT:    tbl v0.16b, { v1.16b, v2.16b }, v3.16b
; CHECK-SD-NEXT:    tbl v1.16b, { v1.16b, v2.16b }, v4.16b
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: shufflevector_v32i8:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    mov v3.16b, v0.16b
; CHECK-GI-NEXT:    adrp x8, .LCPI16_1
; CHECK-GI-NEXT:    adrp x9, .LCPI16_0
; CHECK-GI-NEXT:    mov v4.16b, v2.16b
; CHECK-GI-NEXT:    ldr q0, [x8, :lo12:.LCPI16_1]
; CHECK-GI-NEXT:    ldr q1, [x9, :lo12:.LCPI16_0]
; CHECK-GI-NEXT:    tbl v0.16b, { v3.16b, v4.16b }, v0.16b
; CHECK-GI-NEXT:    tbl v1.16b, { v3.16b, v4.16b }, v1.16b
; CHECK-GI-NEXT:    ret
    %c = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 32, i32 32, i32 32, i32 1, i32 32, i32 32, i32 32, i32 2, i32 32, i32 32, i32 32, i32 3, i32 32, i32 32, i32 32, i32 4, i32 32, i32 32, i32 32, i32 5, i32 32, i32 32, i32 32, i32 6, i32 32, i32 32, i32 32, i32 7, i32 32, i32 32, i32 32>
    ret <32 x i8> %c
}

define i32 @shufflevector_v2i16(<2 x i16> %a, <2 x i16> %b){
; CHECK-SD-LABEL: shufflevector_v2i16:
; CHECK-SD:       // %bb.0:
; CHECK-SD-NEXT:    sub sp, sp, #16
; CHECK-SD-NEXT:    .cfi_def_cfa_offset 16
; CHECK-SD-NEXT:    ext v0.8b, v0.8b, v1.8b, #4
; CHECK-SD-NEXT:    mov w8, v0.s[1]
; CHECK-SD-NEXT:    fmov w9, s0
; CHECK-SD-NEXT:    strh w9, [sp, #12]
; CHECK-SD-NEXT:    strh w8, [sp, #14]
; CHECK-SD-NEXT:    ldr w0, [sp, #12]
; CHECK-SD-NEXT:    add sp, sp, #16
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: shufflevector_v2i16:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    uzp1 v0.4h, v0.4h, v0.4h
; CHECK-GI-NEXT:    adrp x8, .LCPI17_0
; CHECK-GI-NEXT:    uzp1 v1.4h, v1.4h, v0.4h
; CHECK-GI-NEXT:    mov v0.d[1], v1.d[0]
; CHECK-GI-NEXT:    ldr d1, [x8, :lo12:.LCPI17_0]
; CHECK-GI-NEXT:    tbl v0.16b, { v0.16b }, v1.16b
; CHECK-GI-NEXT:    fmov w0, s0
; CHECK-GI-NEXT:    ret
    %c = shufflevector <2 x i16> %a, <2 x i16> %b, <2 x i32> <i32 1, i32 2>
    %d = bitcast <2 x i16> %c to i32
    ret i32 %d
}

define <16 x i16> @shufflevector_v16i16(<16 x i16> %a, <16 x i16> %b){
; CHECK-SD-LABEL: shufflevector_v16i16:
; CHECK-SD:       // %bb.0:
; CHECK-SD-NEXT:    // kill: def $q2 killed $q2 def $q1_q2
; CHECK-SD-NEXT:    adrp x8, .LCPI18_0
; CHECK-SD-NEXT:    adrp x9, .LCPI18_1
; CHECK-SD-NEXT:    mov v1.16b, v0.16b
; CHECK-SD-NEXT:    ldr q3, [x8, :lo12:.LCPI18_0]
; CHECK-SD-NEXT:    ldr q4, [x9, :lo12:.LCPI18_1]
; CHECK-SD-NEXT:    tbl v0.16b, { v1.16b, v2.16b }, v3.16b
; CHECK-SD-NEXT:    tbl v1.16b, { v1.16b, v2.16b }, v4.16b
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: shufflevector_v16i16:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    mov v3.16b, v0.16b
; CHECK-GI-NEXT:    adrp x8, .LCPI18_1
; CHECK-GI-NEXT:    adrp x9, .LCPI18_0
; CHECK-GI-NEXT:    mov v4.16b, v2.16b
; CHECK-GI-NEXT:    ldr q0, [x8, :lo12:.LCPI18_1]
; CHECK-GI-NEXT:    ldr q1, [x9, :lo12:.LCPI18_0]
; CHECK-GI-NEXT:    tbl v0.16b, { v3.16b, v4.16b }, v0.16b
; CHECK-GI-NEXT:    tbl v1.16b, { v3.16b, v4.16b }, v1.16b
; CHECK-GI-NEXT:    ret
    %c = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 16, i32 16, i32 16, i32 1, i32 16, i32 16, i32 16, i32 1, i32 16, i32 16, i32 16, i32 3, i32 16, i32 16, i32 16>
    ret <16 x i16> %c
}

define <1 x i32> @shufflevector_v1i32(<1 x i32> %a, <1 x i32> %b) {
; CHECK-LABEL: shufflevector_v1i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    fmov d0, d1
; CHECK-NEXT:    ret
    %c = shufflevector <1 x i32> %a, <1 x i32> %b, <1 x i32> <i32 1>
    ret <1 x i32> %c
}

define <8 x i32> @shufflevector_v8i32(<8 x i32> %a, <8 x i32> %b) {
; CHECK-SD-LABEL: shufflevector_v8i32:
; CHECK-SD:       // %bb.0:
; CHECK-SD-NEXT:    uzp1 v2.4s, v2.4s, v3.4s
; CHECK-SD-NEXT:    uzp2 v0.4s, v0.4s, v1.4s
; CHECK-SD-NEXT:    mov v2.s[3], v3.s[3]
; CHECK-SD-NEXT:    mov v1.16b, v2.16b
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: shufflevector_v8i32:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    adrp x8, .LCPI20_0
; CHECK-GI-NEXT:    // kill: def $q2 killed $q2 killed $q2_q3 def $q2_q3
; CHECK-GI-NEXT:    uzp2 v0.4s, v0.4s, v1.4s
; CHECK-GI-NEXT:    ldr q4, [x8, :lo12:.LCPI20_0]
; CHECK-GI-NEXT:    // kill: def $q3 killed $q3 killed $q2_q3 def $q2_q3
; CHECK-GI-NEXT:    tbl v1.16b, { v2.16b, v3.16b }, v4.16b
; CHECK-GI-NEXT:    ret
    %c = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 8, i32 10, i32 12, i32 15>
    ret <8 x i32> %c
}

define <4 x i64> @shufflevector_v4i64(<4 x i64> %a, <4 x i64> %b) {
; CHECK-SD-LABEL: shufflevector_v4i64:
; CHECK-SD:       // %bb.0:
; CHECK-SD-NEXT:    zip2 v2.2d, v2.2d, v3.2d
; CHECK-SD-NEXT:    zip2 v0.2d, v0.2d, v1.2d
; CHECK-SD-NEXT:    mov v1.16b, v2.16b
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: shufflevector_v4i64:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    zip2 v0.2d, v0.2d, v1.2d
; CHECK-GI-NEXT:    zip2 v1.2d, v2.2d, v3.2d
; CHECK-GI-NEXT:    ret
    %c = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
    ret <4 x i64> %c
}

; ===== Smaller/Larger Width Vectors with Zero Masks =====

define <2 x i1> @shufflevector_v2i1_zeroes(<2 x i1> %a, <2 x i1> %b){
; CHECK-SD-LABEL: shufflevector_v2i1_zeroes:
; CHECK-SD:       // %bb.0:
; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
; CHECK-SD-NEXT:    dup v0.2s, v0.s[0]
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: shufflevector_v2i1_zeroes:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
; CHECK-GI-NEXT:    mov w8, v0.s[1]
; CHECK-GI-NEXT:    mov v0.b[1], w8
; CHECK-GI-NEXT:    dup v0.8b, v0.b[0]
; CHECK-GI-NEXT:    umov w8, v0.b[0]
; CHECK-GI-NEXT:    umov w9, v0.b[1]
; CHECK-GI-NEXT:    mov v0.s[0], w8
; CHECK-GI-NEXT:    mov v0.s[1], w9
; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
; CHECK-GI-NEXT:    ret
    %c = shufflevector <2 x i1> %a, <2 x i1> %b, <2 x i32> <i32 0, i32 0>
    ret <2 x i1> %c
}

define i32 @shufflevector_v4i8_zeroes(<4 x i8> %a, <4 x i8> %b){
; CHECK-SD-LABEL: shufflevector_v4i8_zeroes:
; CHECK-SD:       // %bb.0:
; CHECK-SD-NEXT:    sub sp, sp, #16
; CHECK-SD-NEXT:    .cfi_def_cfa_offset 16
; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
; CHECK-SD-NEXT:    dup v0.4h, v0.h[0]
; CHECK-SD-NEXT:    uzp1 v0.8b, v0.8b, v0.8b
; CHECK-SD-NEXT:    fmov w0, s0
; CHECK-SD-NEXT:    add sp, sp, #16
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: shufflevector_v4i8_zeroes:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    uzp1 v0.8b, v0.8b, v0.8b
; CHECK-GI-NEXT:    dup v0.8b, v0.b[0]
; CHECK-GI-NEXT:    fmov w0, s0
; CHECK-GI-NEXT:    ret
    %c = shufflevector <4 x i8> %a, <4 x i8> %b, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
    %d = bitcast <4 x i8> %c to i32
    ret i32 %d
}

define <32 x i8> @shufflevector_v32i8_zeroes(<32 x i8> %a, <32 x i8> %b){
; CHECK-LABEL: shufflevector_v32i8_zeroes:
; CHECK:       // %bb.0:
; CHECK-NEXT:    dup v0.16b, v0.b[0]
; CHECK-NEXT:    mov v1.16b, v0.16b
; CHECK-NEXT:    ret
    %c = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
    ret <32 x i8> %c
}

define i32 @shufflevector_v2i16_zeroes(<2 x i16> %a, <2 x i16> %b){
; CHECK-SD-LABEL: shufflevector_v2i16_zeroes:
; CHECK-SD:       // %bb.0:
; CHECK-SD-NEXT:    sub sp, sp, #16
; CHECK-SD-NEXT:    .cfi_def_cfa_offset 16
; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
; CHECK-SD-NEXT:    dup v1.2s, v0.s[0]
; CHECK-SD-NEXT:    fmov w9, s0
; CHECK-SD-NEXT:    strh w9, [sp, #12]
; CHECK-SD-NEXT:    mov w8, v1.s[1]
; CHECK-SD-NEXT:    strh w8, [sp, #14]
; CHECK-SD-NEXT:    ldr w0, [sp, #12]
; CHECK-SD-NEXT:    add sp, sp, #16
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: shufflevector_v2i16_zeroes:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    uzp1 v0.4h, v0.4h, v0.4h
; CHECK-GI-NEXT:    dup v0.4h, v0.h[0]
; CHECK-GI-NEXT:    fmov w0, s0
; CHECK-GI-NEXT:    ret
    %c = shufflevector <2 x i16> %a, <2 x i16> %b, <2 x i32> <i32 0, i32 0>
    %d = bitcast <2 x i16> %c to i32
    ret i32 %d
}

define <16 x i16> @shufflevector_v16i16_zeroes(<16 x i16> %a, <16 x i16> %b){
; CHECK-LABEL: shufflevector_v16i16_zeroes:
; CHECK:       // %bb.0:
; CHECK-NEXT:    dup v0.8h, v0.h[0]
; CHECK-NEXT:    mov v1.16b, v0.16b
; CHECK-NEXT:    ret
    %c = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
    ret <16 x i16> %c
}

define <1 x i32> @shufflevector_v1i32_zeroes(<1 x i32> %a, <1 x i32> %b) {
; CHECK-LABEL: shufflevector_v1i32_zeroes:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ret
    %c = shufflevector <1 x i32> %a, <1 x i32> %b, <1 x i32> <i32 0>
    ret <1 x i32> %c
}

define <8 x i32> @shufflevector_v8i32_zeroes(<8 x i32> %a, <8 x i32> %b) {
; CHECK-LABEL: shufflevector_v8i32_zeroes:
; CHECK:       // %bb.0:
; CHECK-NEXT:    dup v0.4s, v0.s[0]
; CHECK-NEXT:    mov v1.16b, v0.16b
; CHECK-NEXT:    ret
    %c = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
    ret <8 x i32> %c
}

define <4 x i64> @shufflevector_v4i64_zeroes(<4 x i64> %a, <4 x i64> %b) {
; CHECK-LABEL: shufflevector_v4i64_zeroes:
; CHECK:       // %bb.0:
; CHECK-NEXT:    dup v0.2d, v0.d[0]
; CHECK-NEXT:    mov v1.16b, v0.16b
; CHECK-NEXT:    ret
    %c = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
    ret <4 x i64> %c
}

; ===== Vectors with Non-Pow 2 Widths =====

define <3 x i8> @shufflevector_v3i8(<3 x i8> %a, <3 x i8> %b) {
; CHECK-SD-LABEL: shufflevector_v3i8:
; CHECK-SD:       // %bb.0:
; CHECK-SD-NEXT:    mov w0, w1
; CHECK-SD-NEXT:    mov w1, w2
; CHECK-SD-NEXT:    mov w2, w4
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: shufflevector_v3i8:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    fmov s0, w0
; CHECK-GI-NEXT:    fmov s1, w3
; CHECK-GI-NEXT:    adrp x8, .LCPI30_0
; CHECK-GI-NEXT:    mov v0.b[1], w1
; CHECK-GI-NEXT:    mov v1.b[1], w4
; CHECK-GI-NEXT:    mov v0.b[2], w2
; CHECK-GI-NEXT:    mov v1.b[2], w5
; CHECK-GI-NEXT:    mov v0.d[1], v1.d[0]
; CHECK-GI-NEXT:    ldr d1, [x8, :lo12:.LCPI30_0]
; CHECK-GI-NEXT:    tbl v0.16b, { v0.16b }, v1.16b
; CHECK-GI-NEXT:    umov w0, v0.b[0]
; CHECK-GI-NEXT:    umov w1, v0.b[1]
; CHECK-GI-NEXT:    umov w2, v0.b[2]
; CHECK-GI-NEXT:    ret
    %c = shufflevector <3 x i8> %a, <3 x i8> %b, <3 x i32> <i32 1, i32 2, i32 4>
    ret <3 x i8> %c
}

define <7 x i8> @shufflevector_v7i8(<7 x i8> %a, <7 x i8> %b) {
; CHECK-SD-LABEL: shufflevector_v7i8:
; CHECK-SD:       // %bb.0:
; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
; CHECK-SD-NEXT:    // kill: def $d1 killed $d1 def $q1
; CHECK-SD-NEXT:    adrp x8, .LCPI31_0
; CHECK-SD-NEXT:    mov v0.d[1], v1.d[0]
; CHECK-SD-NEXT:    ldr d1, [x8, :lo12:.LCPI31_0]
; CHECK-SD-NEXT:    tbl v0.8b, { v0.16b }, v1.8b
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: shufflevector_v7i8:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 def $q1
; CHECK-GI-NEXT:    adrp x8, .LCPI31_0
; CHECK-GI-NEXT:    mov v0.d[1], v1.d[0]
; CHECK-GI-NEXT:    ldr d1, [x8, :lo12:.LCPI31_0]
; CHECK-GI-NEXT:    tbl v0.16b, { v0.16b }, v1.16b
; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
; CHECK-GI-NEXT:    ret
    %c = shufflevector <7 x i8> %a, <7 x i8> %b, <7 x i32> <i32 1, i32 3, i32 5, i32 7, i32 8, i32 10, i32 12>
    ret <7 x i8> %c
}

define <3 x i16> @shufflevector_v3i16(<3 x i16> %a, <3 x i16> %b) {
; CHECK-SD-LABEL: shufflevector_v3i16:
; CHECK-SD:       // %bb.0:
; CHECK-SD-NEXT:    zip1 v1.4h, v0.4h, v1.4h
; CHECK-SD-NEXT:    zip2 v0.4h, v1.4h, v0.4h
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: shufflevector_v3i16:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 def $q1
; CHECK-GI-NEXT:    adrp x8, .LCPI32_0
; CHECK-GI-NEXT:    mov v0.d[1], v1.d[0]
; CHECK-GI-NEXT:    ldr d1, [x8, :lo12:.LCPI32_0]
; CHECK-GI-NEXT:    tbl v0.16b, { v0.16b }, v1.16b
; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
; CHECK-GI-NEXT:    ret
    %c = shufflevector <3 x i16> %a, <3 x i16> %b, <3 x i32> <i32 1, i32 2, i32 4>
    ret <3 x i16> %c
}

define <7 x i16> @shufflevector_v7i16(<7 x i16> %a, <7 x i16> %b) {
; CHECK-SD-LABEL: shufflevector_v7i16:
; CHECK-SD:       // %bb.0:
; CHECK-SD-NEXT:    adrp x8, .LCPI33_0
; CHECK-SD-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
; CHECK-SD-NEXT:    ldr q2, [x8, :lo12:.LCPI33_0]
; CHECK-SD-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
; CHECK-SD-NEXT:    tbl v0.16b, { v0.16b, v1.16b }, v2.16b
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: shufflevector_v7i16:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    adrp x8, .LCPI33_0
; CHECK-GI-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
; CHECK-GI-NEXT:    ldr q2, [x8, :lo12:.LCPI33_0]
; CHECK-GI-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
; CHECK-GI-NEXT:    tbl v0.16b, { v0.16b, v1.16b }, v2.16b
; CHECK-GI-NEXT:    ret
    %c = shufflevector <7 x i16> %a, <7 x i16> %b, <7 x i32> <i32 1, i32 3, i32 5, i32 7, i32 8, i32 10, i32 12>
    ret <7 x i16> %c
}

define <3 x i32> @shufflevector_v3i32(<3 x i32> %a, <3 x i32> %b) {
; CHECK-SD-LABEL: shufflevector_v3i32:
; CHECK-SD:       // %bb.0:
; CHECK-SD-NEXT:    zip1 v1.4s, v0.4s, v1.4s
; CHECK-SD-NEXT:    zip2 v0.4s, v1.4s, v0.4s
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: shufflevector_v3i32:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    adrp x8, .LCPI34_0
; CHECK-GI-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
; CHECK-GI-NEXT:    ldr q2, [x8, :lo12:.LCPI34_0]
; CHECK-GI-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
; CHECK-GI-NEXT:    tbl v0.16b, { v0.16b, v1.16b }, v2.16b
; CHECK-GI-NEXT:    ret
    %c = shufflevector <3 x i32> %a, <3 x i32> %b, <3 x i32> <i32 1, i32 2, i32 4>
    ret <3 x i32> %c
}

; ===== Vectors with Non-Pow 2 Widths with Zero Masks =====

define <3 x i8> @shufflevector_v3i8_zeroes(<3 x i8> %a, <3 x i8> %b) {
; CHECK-SD-LABEL: shufflevector_v3i8_zeroes:
; CHECK-SD:       // %bb.0:
; CHECK-SD-NEXT:    mov w1, w0
; CHECK-SD-NEXT:    mov w2, w0
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: shufflevector_v3i8_zeroes:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    fmov s0, w0
; CHECK-GI-NEXT:    mov v0.b[1], w1
; CHECK-GI-NEXT:    mov v0.b[2], w2
; CHECK-GI-NEXT:    dup v0.8b, v0.b[0]
; CHECK-GI-NEXT:    umov w0, v0.b[0]
; CHECK-GI-NEXT:    umov w1, v0.b[1]
; CHECK-GI-NEXT:    umov w2, v0.b[2]
; CHECK-GI-NEXT:    ret
    %c = shufflevector <3 x i8> %a, <3 x i8> %b, <3 x i32> <i32 0, i32 0, i32 0>
    ret <3 x i8> %c
}

define <7 x i8> @shufflevector_v7i8_zeroes(<7 x i8> %a, <7 x i8> %b) {
; CHECK-LABEL: shufflevector_v7i8_zeroes:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT:    dup v0.8b, v0.b[0]
; CHECK-NEXT:    ret
    %c = shufflevector <7 x i8> %a, <7 x i8> %b, <7 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
    ret <7 x i8> %c
}

define <3 x i16> @shufflevector_v3i16_zeroes(<3 x i16> %a, <3 x i16> %b) {
; CHECK-LABEL: shufflevector_v3i16_zeroes:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT:    dup v0.4h, v0.h[0]
; CHECK-NEXT:    ret
    %c = shufflevector <3 x i16> %a, <3 x i16> %b, <3 x i32> <i32 0, i32 0, i32 0>
    ret <3 x i16> %c
}

define <7 x i16> @shufflevector_v7i16_zeroes(<7 x i16> %a, <7 x i16> %b) {
; CHECK-LABEL: shufflevector_v7i16_zeroes:
; CHECK:       // %bb.0:
; CHECK-NEXT:    dup v0.8h, v0.h[0]
; CHECK-NEXT:    ret
    %c = shufflevector <7 x i16> %a, <7 x i16> %b, <7 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
    ret <7 x i16> %c
}

define <3 x i32> @shufflevector_v3i32_zeroes(<3 x i32> %a, <3 x i32> %b) {
; CHECK-LABEL: shufflevector_v3i32_zeroes:
; CHECK:       // %bb.0:
; CHECK-NEXT:    dup v0.4s, v0.s[0]
; CHECK-NEXT:    ret
    %c = shufflevector <3 x i32> %a, <3 x i32> %b, <3 x i32> <i32 0, i32 0, i32 0>
    ret <3 x i32> %c
}