; llvm/llvm/test/CodeGen/AArch64/zext-to-tbl.ll

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=arm64-apple-ios -mattr=+sve -o - %s | FileCheck %s
; RUN: llc -mtriple=aarch64_be-unknown-linux -mattr=+sve -o - %s | FileCheck --check-prefix=CHECK-BE %s

; CHECK-LABEL: lCPI0_0:
; CHECK-NEXT:    .byte   0                               ; 0x0
; CHECK-NEXT:    .byte   255                             ; 0xff
; CHECK-NEXT:    .byte   255                             ; 0xff
; CHECK-NEXT:    .byte   255                             ; 0xff
; CHECK-NEXT:    .byte   1                               ; 0x1
; CHECK-NEXT:    .byte   255                             ; 0xff
; CHECK-NEXT:    .byte   255                             ; 0xff
; CHECK-NEXT:    .byte   255                             ; 0xff
; CHECK-NEXT:    .byte   2                               ; 0x2
; CHECK-NEXT:    .byte   255                             ; 0xff
; CHECK-NEXT:    .byte   255                             ; 0xff
; CHECK-NEXT:    .byte   255                             ; 0xff
; CHECK-NEXT:    .byte   3                               ; 0x3
; CHECK-NEXT:    .byte   255                             ; 0xff
; CHECK-NEXT:    .byte   255                             ; 0xff
; CHECK-NEXT:    .byte   255                             ; 0xff
; CHECK-NEXT:lCPI0_1:
; CHECK-NEXT:    .byte   4                               ; 0x4
; CHECK-NEXT:    .byte   255                             ; 0xff
; CHECK-NEXT:    .byte   255                             ; 0xff
; CHECK-NEXT:    .byte   255                             ; 0xff
; CHECK-NEXT:    .byte   5                               ; 0x5
; CHECK-NEXT:    .byte   255                             ; 0xff
; CHECK-NEXT:    .byte   255                             ; 0xff
; CHECK-NEXT:    .byte   255                             ; 0xff
; CHECK-NEXT:    .byte   6                               ; 0x6
; CHECK-NEXT:    .byte   255                             ; 0xff
; CHECK-NEXT:    .byte   255                             ; 0xff
; CHECK-NEXT:    .byte   255                             ; 0xff
; CHECK-NEXT:    .byte   7                               ; 0x7
; CHECK-NEXT:    .byte   255                             ; 0xff
; CHECK-NEXT:    .byte   255                             ; 0xff
; CHECK-NEXT:    .byte   255                             ; 0xff
; CHECK-NEXT:lCPI0_2:
; CHECK-NEXT:    .byte   8                               ; 0x8
; CHECK-NEXT:    .byte   255                             ; 0xff
; CHECK-NEXT:    .byte   255                             ; 0xff
; CHECK-NEXT:    .byte   255                             ; 0xff
; CHECK-NEXT:    .byte   9                               ; 0x9
; CHECK-NEXT:    .byte   255                             ; 0xff
; CHECK-NEXT:    .byte   255                             ; 0xff
; CHECK-NEXT:    .byte   255                             ; 0xff
; CHECK-NEXT:    .byte   10                              ; 0xa
; CHECK-NEXT:    .byte   255                             ; 0xff
; CHECK-NEXT:    .byte   255                             ; 0xff
; CHECK-NEXT:    .byte   255                             ; 0xff
; CHECK-NEXT:    .byte   11                              ; 0xb
; CHECK-NEXT:    .byte   255                             ; 0xff
; CHECK-NEXT:    .byte   255                             ; 0xff
; CHECK-NEXT:    .byte   255                             ; 0xff
; CHECK-NEXT:lCPI0_3:
; CHECK-NEXT:    .byte   12                              ; 0xc
; CHECK-NEXT:    .byte   255                             ; 0xff
; CHECK-NEXT:    .byte   255                             ; 0xff
; CHECK-NEXT:    .byte   255                             ; 0xff
; CHECK-NEXT:    .byte   13                              ; 0xd
; CHECK-NEXT:    .byte   255                             ; 0xff
; CHECK-NEXT:    .byte   255                             ; 0xff
; CHECK-NEXT:    .byte   255                             ; 0xff
; CHECK-NEXT:    .byte   14                              ; 0xe
; CHECK-NEXT:    .byte   255                             ; 0xff
; CHECK-NEXT:    .byte   255                             ; 0xff
; CHECK-NEXT:    .byte   255                             ; 0xff
; CHECK-NEXT:    .byte   15                              ; 0xf
; CHECK-NEXT:    .byte   255                             ; 0xff
; CHECK-NEXT:    .byte   255                             ; 0xff
; CHECK-NEXT:    .byte   255                             ; 0xff

; CHECK-BE: .LCPI0_0:
; CHECK-BE-NEXT: 	.byte	255                             // 0xff
; CHECK-BE-NEXT: 	.byte	255                             // 0xff
; CHECK-BE-NEXT: 	.byte	255                             // 0xff
; CHECK-BE-NEXT: 	.byte	0                               // 0x0
; CHECK-BE-NEXT: 	.byte	255                             // 0xff
; CHECK-BE-NEXT: 	.byte	255                             // 0xff
; CHECK-BE-NEXT: 	.byte	255                             // 0xff
; CHECK-BE-NEXT: 	.byte	1                               // 0x1
; CHECK-BE-NEXT: 	.byte	255                             // 0xff
; CHECK-BE-NEXT: 	.byte	255                             // 0xff
; CHECK-BE-NEXT: 	.byte	255                             // 0xff
; CHECK-BE-NEXT: 	.byte	2                               // 0x2
; CHECK-BE-NEXT: 	.byte	255                             // 0xff
; CHECK-BE-NEXT: 	.byte	255                             // 0xff
; CHECK-BE-NEXT: 	.byte	255                             // 0xff
; CHECK-BE-NEXT: 	.byte	3                               // 0x3
; CHECK-BE-NEXT: .LCPI0_1:
; CHECK-BE-NEXT: 	.byte	255                             // 0xff
; CHECK-BE-NEXT: 	.byte	255                             // 0xff
; CHECK-BE-NEXT: 	.byte	255                             // 0xff
; CHECK-BE-NEXT: 	.byte	4                               // 0x4
; CHECK-BE-NEXT: 	.byte	255                             // 0xff
; CHECK-BE-NEXT: 	.byte	255                             // 0xff
; CHECK-BE-NEXT: 	.byte	255                             // 0xff
; CHECK-BE-NEXT: 	.byte	5                               // 0x5
; CHECK-BE-NEXT: 	.byte	255                             // 0xff
; CHECK-BE-NEXT: 	.byte	255                             // 0xff
; CHECK-BE-NEXT: 	.byte	255                             // 0xff
; CHECK-BE-NEXT: 	.byte	6                               // 0x6
; CHECK-BE-NEXT: 	.byte	255                             // 0xff
; CHECK-BE-NEXT: 	.byte	255                             // 0xff
; CHECK-BE-NEXT: 	.byte	255                             // 0xff
; CHECK-BE-NEXT: 	.byte	7                               // 0x7
; CHECK-BE-NEXT: .LCPI0_2:
; CHECK-BE-NEXT: 	.byte	255                             // 0xff
; CHECK-BE-NEXT: 	.byte	255                             // 0xff
; CHECK-BE-NEXT: 	.byte	255                             // 0xff
; CHECK-BE-NEXT: 	.byte	8                               // 0x8
; CHECK-BE-NEXT: 	.byte	255                             // 0xff
; CHECK-BE-NEXT: 	.byte	255                             // 0xff
; CHECK-BE-NEXT: 	.byte	255                             // 0xff
; CHECK-BE-NEXT: 	.byte	9                               // 0x9
; CHECK-BE-NEXT: 	.byte	255                             // 0xff
; CHECK-BE-NEXT: 	.byte	255                             // 0xff
; CHECK-BE-NEXT: 	.byte	255                             // 0xff
; CHECK-BE-NEXT: 	.byte	10                              // 0xa
; CHECK-BE-NEXT: 	.byte	255                             // 0xff
; CHECK-BE-NEXT: 	.byte	255                             // 0xff
; CHECK-BE-NEXT: 	.byte	255                             // 0xff
; CHECK-BE-NEXT: 	.byte	11                              // 0xb
; CHECK-BE-NEXT: .LCPI0_3:
; CHECK-BE-NEXT: 	.byte	255                             // 0xff
; CHECK-BE-NEXT: 	.byte	255                             // 0xff
; CHECK-BE-NEXT: 	.byte	255                             // 0xff
; CHECK-BE-NEXT: 	.byte	12                              // 0xc
; CHECK-BE-NEXT: 	.byte	255                             // 0xff
; CHECK-BE-NEXT: 	.byte	255                             // 0xff
; CHECK-BE-NEXT: 	.byte	255                             // 0xff
; CHECK-BE-NEXT: 	.byte	13                              // 0xd
; CHECK-BE-NEXT: 	.byte	255                             // 0xff
; CHECK-BE-NEXT: 	.byte	255                             // 0xff
; CHECK-BE-NEXT: 	.byte	255                             // 0xff
; CHECK-BE-NEXT: 	.byte	14                              // 0xe
; CHECK-BE-NEXT: 	.byte	255                             // 0xff
; CHECK-BE-NEXT: 	.byte	255                             // 0xff
; CHECK-BE-NEXT: 	.byte	255                             // 0xff
; CHECK-BE-NEXT: 	.byte	15                              // 0xf

; It's profitable to convert the zext to a shuffle, which in turn will be
; lowered to 4 tbl instructions. The masks are materialized outside the loop.
; Arguments: %src is read 16 bytes at a time; %dst receives the widened
; <16 x i32> values. The loop is a single block that branches back to itself,
; so the zext sits in the loop header.
define void @zext_v16i8_to_v16i32_in_loop(ptr %src, ptr %dst) {
; CHECK-LABEL: zext_v16i8_to_v16i32_in_loop:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:  Lloh0:
; CHECK-NEXT:    adrp x8, lCPI0_0@PAGE
; CHECK-NEXT:  Lloh1:
; CHECK-NEXT:    adrp x9, lCPI0_1@PAGE
; CHECK-NEXT:  Lloh2:
; CHECK-NEXT:    adrp x10, lCPI0_2@PAGE
; CHECK-NEXT:  Lloh3:
; CHECK-NEXT:    ldr q0, [x8, lCPI0_0@PAGEOFF]
; CHECK-NEXT:  Lloh4:
; CHECK-NEXT:    adrp x8, lCPI0_3@PAGE
; CHECK-NEXT:  Lloh5:
; CHECK-NEXT:    ldr q1, [x9, lCPI0_1@PAGEOFF]
; CHECK-NEXT:  Lloh6:
; CHECK-NEXT:    ldr q2, [x10, lCPI0_2@PAGEOFF]
; CHECK-NEXT:  Lloh7:
; CHECK-NEXT:    ldr q3, [x8, lCPI0_3@PAGEOFF]
; CHECK-NEXT:    mov x8, xzr
; CHECK-NEXT:  LBB0_1: ; %loop
; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    ldr q4, [x0, x8]
; CHECK-NEXT:    add x8, x8, #16
; CHECK-NEXT:    cmp x8, #128
; CHECK-NEXT:    tbl.16b v5, { v4 }, v3
; CHECK-NEXT:    tbl.16b v6, { v4 }, v2
; CHECK-NEXT:    tbl.16b v7, { v4 }, v1
; CHECK-NEXT:    tbl.16b v4, { v4 }, v0
; CHECK-NEXT:    stp q6, q5, [x1, #32]
; CHECK-NEXT:    stp q4, q7, [x1], #64
; CHECK-NEXT:    b.ne LBB0_1
; CHECK-NEXT:  ; %bb.2: ; %exit
; CHECK-NEXT:    ret
; CHECK-NEXT:    .loh AdrpLdr Lloh4, Lloh7
; CHECK-NEXT:    .loh AdrpLdr Lloh2, Lloh6
; CHECK-NEXT:    .loh AdrpLdr Lloh1, Lloh5
; CHECK-NEXT:    .loh AdrpAdrp Lloh0, Lloh4
; CHECK-NEXT:    .loh AdrpLdr Lloh0, Lloh3
;
; CHECK-BE-LABEL: zext_v16i8_to_v16i32_in_loop:
; CHECK-BE:       // %bb.0: // %entry
; CHECK-BE-NEXT:    adrp x8, .LCPI0_0
; CHECK-BE-NEXT:    add x8, x8, :lo12:.LCPI0_0
; CHECK-BE-NEXT:    ld1 { v0.16b }, [x8]
; CHECK-BE-NEXT:    adrp x8, .LCPI0_1
; CHECK-BE-NEXT:    add x8, x8, :lo12:.LCPI0_1
; CHECK-BE-NEXT:    ld1 { v1.16b }, [x8]
; CHECK-BE-NEXT:    adrp x8, .LCPI0_2
; CHECK-BE-NEXT:    add x8, x8, :lo12:.LCPI0_2
; CHECK-BE-NEXT:    ld1 { v2.16b }, [x8]
; CHECK-BE-NEXT:    adrp x8, .LCPI0_3
; CHECK-BE-NEXT:    add x8, x8, :lo12:.LCPI0_3
; CHECK-BE-NEXT:    ld1 { v3.16b }, [x8]
; CHECK-BE-NEXT:    mov x8, xzr
; CHECK-BE-NEXT:  .LBB0_1: // %loop
; CHECK-BE-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-BE-NEXT:    add x9, x0, x8
; CHECK-BE-NEXT:    add x8, x8, #16
; CHECK-BE-NEXT:    ld1 { v4.16b }, [x9]
; CHECK-BE-NEXT:    add x9, x1, #48
; CHECK-BE-NEXT:    cmp x8, #128
; CHECK-BE-NEXT:    tbl v5.16b, { v4.16b }, v3.16b
; CHECK-BE-NEXT:    tbl v6.16b, { v4.16b }, v2.16b
; CHECK-BE-NEXT:    tbl v7.16b, { v4.16b }, v1.16b
; CHECK-BE-NEXT:    tbl v4.16b, { v4.16b }, v0.16b
; CHECK-BE-NEXT:    st1 { v5.16b }, [x9]
; CHECK-BE-NEXT:    add x9, x1, #32
; CHECK-BE-NEXT:    st1 { v6.16b }, [x9]
; CHECK-BE-NEXT:    add x9, x1, #16
; CHECK-BE-NEXT:    st1 { v4.16b }, [x1]
; CHECK-BE-NEXT:    add x1, x1, #64
; CHECK-BE-NEXT:    st1 { v7.16b }, [x9]
; CHECK-BE-NEXT:    b.ne .LBB0_1
; CHECK-BE-NEXT:  // %bb.2: // %exit
; CHECK-BE-NEXT:    ret
entry:
  br label %loop

loop:
  ; %iv starts at 0 and advances by 16; the loop exits once it reaches 128,
  ; i.e. after 8 iterations.
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
  %src.gep = getelementptr i8, ptr %src, i64 %iv
  %load = load <16 x i8>, ptr %src.gep
  ; The zext under test: every i8 lane is widened to i32.
  %ext = zext <16 x i8> %load to <16 x i32>
  ; Indexing i32 elements by %iv moves the destination 64 bytes per iteration,
  ; which is the #64 stride seen in the expected stores.
  %dst.gep = getelementptr i32, ptr %dst, i64 %iv
  store <16 x i32> %ext, ptr %dst.gep
  %iv.next = add nuw i64 %iv, 16
  %ec = icmp eq i64 %iv.next, 128
  br i1 %ec, label %exit, label %loop

exit:
  ret void
}

; The same <16 x i8> -> <16 x i32> zext inside a loop, but placed in %then,
; which only executes when %c is true, instead of in the loop header block.
; The expected output for both targets widens with ushll/ushll2 and contains
; no tbl and no mask constant loads.
; NOTE(review): presumably the tbl conversion is limited to extends located in
; the loop header -- confirm against the AArch64 lowering code.
define void @zext_v16i8_to_v16i32_in_loop_not_header(ptr %src, ptr %dst, i1 %c) {
; CHECK-LABEL: zext_v16i8_to_v16i32_in_loop_not_header:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    mov x8, xzr
; CHECK-NEXT:    b LBB1_2
; CHECK-NEXT:  LBB1_1: ; %loop.latch
; CHECK-NEXT:    ; in Loop: Header=BB1_2 Depth=1
; CHECK-NEXT:    add x8, x8, #16
; CHECK-NEXT:    add x1, x1, #64
; CHECK-NEXT:    cmp x8, #128
; CHECK-NEXT:    b.eq LBB1_4
; CHECK-NEXT:  LBB1_2: ; %loop
; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    tbz w2, #0, LBB1_1
; CHECK-NEXT:  ; %bb.3: ; %then
; CHECK-NEXT:    ; in Loop: Header=BB1_2 Depth=1
; CHECK-NEXT:    ldr q0, [x0, x8]
; CHECK-NEXT:    ushll2.8h v1, v0, #0
; CHECK-NEXT:    ushll.8h v0, v0, #0
; CHECK-NEXT:    ushll2.4s v2, v1, #0
; CHECK-NEXT:    ushll.4s v1, v1, #0
; CHECK-NEXT:    ushll2.4s v3, v0, #0
; CHECK-NEXT:    ushll.4s v0, v0, #0
; CHECK-NEXT:    stp q1, q2, [x1, #32]
; CHECK-NEXT:    stp q0, q3, [x1]
; CHECK-NEXT:    b LBB1_1
; CHECK-NEXT:  LBB1_4: ; %exit
; CHECK-NEXT:    ret
;
; CHECK-BE-LABEL: zext_v16i8_to_v16i32_in_loop_not_header:
; CHECK-BE:       // %bb.0: // %entry
; CHECK-BE-NEXT:    mov x8, xzr
; CHECK-BE-NEXT:    b .LBB1_2
; CHECK-BE-NEXT:  .LBB1_1: // %loop.latch
; CHECK-BE-NEXT:    // in Loop: Header=BB1_2 Depth=1
; CHECK-BE-NEXT:    add x8, x8, #16
; CHECK-BE-NEXT:    add x1, x1, #64
; CHECK-BE-NEXT:    cmp x8, #128
; CHECK-BE-NEXT:    b.eq .LBB1_4
; CHECK-BE-NEXT:  .LBB1_2: // %loop
; CHECK-BE-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-BE-NEXT:    tbz w2, #0, .LBB1_1
; CHECK-BE-NEXT:  // %bb.3: // %then
; CHECK-BE-NEXT:    // in Loop: Header=BB1_2 Depth=1
; CHECK-BE-NEXT:    add x9, x0, x8
; CHECK-BE-NEXT:    add x10, x1, #32
; CHECK-BE-NEXT:    ld1 { v0.16b }, [x9]
; CHECK-BE-NEXT:    add x9, x1, #48
; CHECK-BE-NEXT:    ushll2 v1.8h, v0.16b, #0
; CHECK-BE-NEXT:    ushll v0.8h, v0.8b, #0
; CHECK-BE-NEXT:    ushll2 v2.4s, v1.8h, #0
; CHECK-BE-NEXT:    ushll v1.4s, v1.4h, #0
; CHECK-BE-NEXT:    ushll2 v3.4s, v0.8h, #0
; CHECK-BE-NEXT:    ushll v0.4s, v0.4h, #0
; CHECK-BE-NEXT:    st1 { v2.4s }, [x9]
; CHECK-BE-NEXT:    add x9, x1, #16
; CHECK-BE-NEXT:    st1 { v1.4s }, [x10]
; CHECK-BE-NEXT:    st1 { v3.4s }, [x9]
; CHECK-BE-NEXT:    st1 { v0.4s }, [x1]
; CHECK-BE-NEXT:    b .LBB1_1
; CHECK-BE-NEXT:  .LBB1_4: // %exit
; CHECK-BE-NEXT:    ret
entry:
  br label %loop

loop:
  ; Loop header: only the induction phi and the branch on the loop-invariant
  ; flag %c live here.
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ]
  br i1 %c, label %then, label %loop.latch

then:
  ; Conditionally executed body holding the load, the zext under test and the
  ; store.
  %src.gep = getelementptr i8, ptr %src, i64 %iv
  %load = load <16 x i8>, ptr %src.gep
  %ext = zext <16 x i8> %load to <16 x i32>
  %dst.gep = getelementptr i32, ptr %dst, i64 %iv
  store <16 x i32> %ext, ptr %dst.gep
  br label %loop.latch

loop.latch:
  ; %iv advances by 16 and the loop exits when it reaches 128 (8 iterations).
  %iv.next = add nuw i64 %iv, 16
  %ec = icmp eq i64 %iv.next, 128
  br i1 %ec, label %exit, label %loop

exit:
  ret void
}

; Not profitable to use shuffle/tbl, as 4 tbls + materializing the masks
; require more instructions than lowering zext directly.
; Straight-line variant: one load from %src, one zext, one store to %dst and
; no loop. The expected output for both targets widens with ushll/ushll2.
define void @zext_v16i8_to_v16i32_no_loop(ptr %src, ptr %dst) {
; CHECK-LABEL: zext_v16i8_to_v16i32_no_loop:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ushll2.8h v1, v0, #0
; CHECK-NEXT:    ushll.8h v0, v0, #0
; CHECK-NEXT:    ushll2.4s v2, v1, #0
; CHECK-NEXT:    ushll.4s v1, v1, #0
; CHECK-NEXT:    ushll2.4s v3, v0, #0
; CHECK-NEXT:    ushll.4s v0, v0, #0
; CHECK-NEXT:    stp q1, q2, [x1, #32]
; CHECK-NEXT:    stp q0, q3, [x1]
; CHECK-NEXT:    ret
;
; CHECK-BE-LABEL: zext_v16i8_to_v16i32_no_loop:
; CHECK-BE:       // %bb.0: // %entry
; CHECK-BE-NEXT:    ld1 { v0.16b }, [x0]
; CHECK-BE-NEXT:    add x8, x1, #48
; CHECK-BE-NEXT:    ushll2 v1.8h, v0.16b, #0
; CHECK-BE-NEXT:    ushll v0.8h, v0.8b, #0
; CHECK-BE-NEXT:    ushll2 v2.4s, v1.8h, #0
; CHECK-BE-NEXT:    ushll v1.4s, v1.4h, #0
; CHECK-BE-NEXT:    ushll2 v3.4s, v0.8h, #0
; CHECK-BE-NEXT:    ushll v0.4s, v0.4h, #0
; CHECK-BE-NEXT:    st1 { v2.4s }, [x8]
; CHECK-BE-NEXT:    add x8, x1, #32
; CHECK-BE-NEXT:    st1 { v1.4s }, [x8]
; CHECK-BE-NEXT:    add x8, x1, #16
; CHECK-BE-NEXT:    st1 { v3.4s }, [x8]
; CHECK-BE-NEXT:    st1 { v0.4s }, [x1]
; CHECK-BE-NEXT:    ret
entry:
  %load = load <16 x i8>, ptr %src
  ; The zext under test, executed exactly once.
  %ext = zext <16 x i8> %load to <16 x i32>
  store <16 x i32> %ext, ptr %dst
  ret void
}

; Avoid using tbl when optimizing for size.
; Single-block loop with the zext in the header; the only distinguishing
; feature is the optsize function attribute. The expected output for both
; targets widens with ushll/ushll2 and loads no mask constants.
define void @zext_v16i8_to_v16i32_in_loop_optsize(ptr %src, ptr %dst) optsize {
; CHECK-LABEL: zext_v16i8_to_v16i32_in_loop_optsize:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    mov x8, xzr
; CHECK-NEXT:  LBB3_1: ; %loop
; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    ldr q0, [x0, x8]
; CHECK-NEXT:    add x8, x8, #16
; CHECK-NEXT:    cmp x8, #128
; CHECK-NEXT:    ushll2.8h v1, v0, #0
; CHECK-NEXT:    ushll.8h v0, v0, #0
; CHECK-NEXT:    ushll2.4s v2, v1, #0
; CHECK-NEXT:    ushll.4s v1, v1, #0
; CHECK-NEXT:    ushll2.4s v3, v0, #0
; CHECK-NEXT:    ushll.4s v0, v0, #0
; CHECK-NEXT:    stp q1, q2, [x1, #32]
; CHECK-NEXT:    stp q0, q3, [x1], #64
; CHECK-NEXT:    b.ne LBB3_1
; CHECK-NEXT:  ; %bb.2: ; %exit
; CHECK-NEXT:    ret
;
; CHECK-BE-LABEL: zext_v16i8_to_v16i32_in_loop_optsize:
; CHECK-BE:       // %bb.0: // %entry
; CHECK-BE-NEXT:    mov x8, xzr
; CHECK-BE-NEXT:  .LBB3_1: // %loop
; CHECK-BE-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-BE-NEXT:    add x9, x0, x8
; CHECK-BE-NEXT:    add x8, x8, #16
; CHECK-BE-NEXT:    ld1 { v0.16b }, [x9]
; CHECK-BE-NEXT:    add x9, x1, #48
; CHECK-BE-NEXT:    cmp x8, #128
; CHECK-BE-NEXT:    ushll2 v1.8h, v0.16b, #0
; CHECK-BE-NEXT:    ushll v0.8h, v0.8b, #0
; CHECK-BE-NEXT:    ushll2 v2.4s, v1.8h, #0
; CHECK-BE-NEXT:    ushll v1.4s, v1.4h, #0
; CHECK-BE-NEXT:    ushll2 v3.4s, v0.8h, #0
; CHECK-BE-NEXT:    ushll v0.4s, v0.4h, #0
; CHECK-BE-NEXT:    st1 { v2.4s }, [x9]
; CHECK-BE-NEXT:    add x9, x1, #32
; CHECK-BE-NEXT:    st1 { v1.4s }, [x9]
; CHECK-BE-NEXT:    add x9, x1, #16
; CHECK-BE-NEXT:    st1 { v0.4s }, [x1]
; CHECK-BE-NEXT:    add x1, x1, #64
; CHECK-BE-NEXT:    st1 { v3.4s }, [x9]
; CHECK-BE-NEXT:    b.ne .LBB3_1
; CHECK-BE-NEXT:  // %bb.2: // %exit
; CHECK-BE-NEXT:    ret
entry:
  br label %loop

loop:
  ; %iv runs 0, 16, ..., 112; the loop exits when the next value is 128.
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
  %src.gep = getelementptr i8, ptr %src, i64 %iv
  %load = load <16 x i8>, ptr %src.gep
  ; The zext under test.
  %ext = zext <16 x i8> %load to <16 x i32>
  ; i32 indexing by %iv gives a 64-byte destination stride per iteration.
  %dst.gep = getelementptr i32, ptr %dst, i64 %iv
  store <16 x i32> %ext, ptr %dst.gep
  %iv.next = add nuw i64 %iv, 16
  %ec = icmp eq i64 %iv.next, 128
  br i1 %ec, label %exit, label %loop

exit:
  ret void
}

; Avoid using tbl when optimizing for minimum size (minsize).
; Single-block loop with the zext in the header; the only distinguishing
; feature is the minsize function attribute. The expected output for both
; targets widens with ushll/ushll2 and loads no mask constants.
define void @zext_v16i8_to_v16i32_in_loop_minsize(ptr %src, ptr %dst) minsize {
; CHECK-LABEL: zext_v16i8_to_v16i32_in_loop_minsize:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    mov x8, xzr
; CHECK-NEXT:  LBB4_1: ; %loop
; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    ldr q0, [x0, x8]
; CHECK-NEXT:    add x8, x8, #16
; CHECK-NEXT:    cmp x8, #128
; CHECK-NEXT:    ushll2.8h v1, v0, #0
; CHECK-NEXT:    ushll.8h v0, v0, #0
; CHECK-NEXT:    ushll2.4s v2, v1, #0
; CHECK-NEXT:    ushll.4s v1, v1, #0
; CHECK-NEXT:    ushll2.4s v3, v0, #0
; CHECK-NEXT:    ushll.4s v0, v0, #0
; CHECK-NEXT:    stp q1, q2, [x1, #32]
; CHECK-NEXT:    stp q0, q3, [x1], #64
; CHECK-NEXT:    b.ne LBB4_1
; CHECK-NEXT:  ; %bb.2: ; %exit
; CHECK-NEXT:    ret
;
; CHECK-BE-LABEL: zext_v16i8_to_v16i32_in_loop_minsize:
; CHECK-BE:       // %bb.0: // %entry
; CHECK-BE-NEXT:    mov x8, xzr
; CHECK-BE-NEXT:  .LBB4_1: // %loop
; CHECK-BE-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-BE-NEXT:    add x9, x0, x8
; CHECK-BE-NEXT:    add x8, x8, #16
; CHECK-BE-NEXT:    ld1 { v0.16b }, [x9]
; CHECK-BE-NEXT:    add x9, x1, #48
; CHECK-BE-NEXT:    cmp x8, #128
; CHECK-BE-NEXT:    ushll2 v1.8h, v0.16b, #0
; CHECK-BE-NEXT:    ushll v0.8h, v0.8b, #0
; CHECK-BE-NEXT:    ushll2 v2.4s, v1.8h, #0
; CHECK-BE-NEXT:    ushll v1.4s, v1.4h, #0
; CHECK-BE-NEXT:    ushll2 v3.4s, v0.8h, #0
; CHECK-BE-NEXT:    ushll v0.4s, v0.4h, #0
; CHECK-BE-NEXT:    st1 { v2.4s }, [x9]
; CHECK-BE-NEXT:    add x9, x1, #32
; CHECK-BE-NEXT:    st1 { v1.4s }, [x9]
; CHECK-BE-NEXT:    add x9, x1, #16
; CHECK-BE-NEXT:    st1 { v0.4s }, [x1]
; CHECK-BE-NEXT:    add x1, x1, #64
; CHECK-BE-NEXT:    st1 { v3.4s }, [x9]
; CHECK-BE-NEXT:    b.ne .LBB4_1
; CHECK-BE-NEXT:  // %bb.2: // %exit
; CHECK-BE-NEXT:    ret
entry:
  br label %loop

loop:
  ; %iv runs 0, 16, ..., 112; the loop exits when the next value is 128.
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
  %src.gep = getelementptr i8, ptr %src, i64 %iv
  %load = load <16 x i8>, ptr %src.gep
  ; The zext under test.
  %ext = zext <16 x i8> %load to <16 x i32>
  ; i32 indexing by %iv gives a 64-byte destination stride per iteration.
  %dst.gep = getelementptr i32, ptr %dst, i64 %iv
  store <16 x i32> %ext, ptr %dst.gep
  %iv.next = add nuw i64 %iv, 16
  %ec = icmp eq i64 %iv.next, 128
  br i1 %ec, label %exit, label %loop

exit:
  ret void
}

; zext <16 x i8> to <16 x i16> in a single-block loop. The expected output for
; both targets widens with one ushll/ushll2 pair and uses no tbl and no mask
; constants.
define void @zext_v16i8_to_v16i16_in_loop(ptr %src, ptr %dst) {
; CHECK-LABEL: zext_v16i8_to_v16i16_in_loop:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    mov x8, xzr
; CHECK-NEXT:  LBB5_1: ; %loop
; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    ldr q0, [x0, x8]
; CHECK-NEXT:    add x8, x8, #16
; CHECK-NEXT:    cmp x8, #128
; CHECK-NEXT:    ushll2.8h v1, v0, #0
; CHECK-NEXT:    ushll.8h v0, v0, #0
; CHECK-NEXT:    stp q0, q1, [x1], #32
; CHECK-NEXT:    b.ne LBB5_1
; CHECK-NEXT:  ; %bb.2: ; %exit
; CHECK-NEXT:    ret
;
; CHECK-BE-LABEL: zext_v16i8_to_v16i16_in_loop:
; CHECK-BE:       // %bb.0: // %entry
; CHECK-BE-NEXT:    mov x8, xzr
; CHECK-BE-NEXT:  .LBB5_1: // %loop
; CHECK-BE-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-BE-NEXT:    add x9, x0, x8
; CHECK-BE-NEXT:    add x8, x8, #16
; CHECK-BE-NEXT:    ld1 { v0.16b }, [x9]
; CHECK-BE-NEXT:    add x9, x1, #16
; CHECK-BE-NEXT:    cmp x8, #128
; CHECK-BE-NEXT:    ushll2 v1.8h, v0.16b, #0
; CHECK-BE-NEXT:    ushll v0.8h, v0.8b, #0
; CHECK-BE-NEXT:    st1 { v0.8h }, [x1]
; CHECK-BE-NEXT:    add x1, x1, #32
; CHECK-BE-NEXT:    st1 { v1.8h }, [x9]
; CHECK-BE-NEXT:    b.ne .LBB5_1
; CHECK-BE-NEXT:  // %bb.2: // %exit
; CHECK-BE-NEXT:    ret


entry:
  br label %loop

loop:
  ; %iv runs 0, 16, ..., 112; the loop exits when the next value is 128.
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
  %src.gep = getelementptr i8, ptr %src, i64 %iv
  %load = load <16 x i8>, ptr %src.gep
  ; The zext under test: a single doubling of the element width.
  %ext = zext <16 x i8> %load to <16 x i16>
  ; i16 indexing by %iv gives a 32-byte destination stride per iteration.
  %dst.gep = getelementptr i16, ptr %dst, i64 %iv
  store <16 x i16> %ext, ptr %dst.gep
  %iv.next = add nuw i64 %iv, 16
  %ec = icmp eq i64 %iv.next, 128
  br i1 %ec, label %exit, label %loop

exit:
  ret void
}

; CHECK-LABEL: lCPI6_0:
; CHECK-NEXT:     .byte   0                               ; 0x0
; CHECK-NEXT:     .byte   255                             ; 0xff
; CHECK-NEXT:     .byte   255                             ; 0xff
; CHECK-NEXT:     .byte   255                             ; 0xff
; CHECK-NEXT:     .byte   1                               ; 0x1
; CHECK-NEXT:     .byte   255                             ; 0xff
; CHECK-NEXT:     .byte   255                             ; 0xff
; CHECK-NEXT:     .byte   255                             ; 0xff
; CHECK-NEXT:     .byte   2                               ; 0x2
; CHECK-NEXT:     .byte   255                             ; 0xff
; CHECK-NEXT:     .byte   255                             ; 0xff
; CHECK-NEXT:     .byte   255                             ; 0xff
; CHECK-NEXT:     .byte   3                               ; 0x3
; CHECK-NEXT:     .byte   255                             ; 0xff
; CHECK-NEXT:     .byte   255                             ; 0xff
; CHECK-NEXT:     .byte   255                             ; 0xff
; CHECK-NEXT: lCPI6_1:
; CHECK-NEXT:     .byte   4                               ; 0x4
; CHECK-NEXT:     .byte   255                             ; 0xff
; CHECK-NEXT:     .byte   255                             ; 0xff
; CHECK-NEXT:     .byte   255                             ; 0xff
; CHECK-NEXT:     .byte   5                               ; 0x5
; CHECK-NEXT:     .byte   255                             ; 0xff
; CHECK-NEXT:     .byte   255                             ; 0xff
; CHECK-NEXT:     .byte   255                             ; 0xff
; CHECK-NEXT:     .byte   6                               ; 0x6
; CHECK-NEXT:     .byte   255                             ; 0xff
; CHECK-NEXT:     .byte   255                             ; 0xff
; CHECK-NEXT:     .byte   255                             ; 0xff
; CHECK-NEXT:     .byte   7                               ; 0x7
; CHECK-NEXT:     .byte   255                             ; 0xff
; CHECK-NEXT:     .byte   255                             ; 0xff
; CHECK-NEXT:     .byte   255                             ; 0xff

; CHECK-BE:       .LCPI6_0:
; CHECK-BE-NEXT: 	.byte	255                             // 0xff
; CHECK-BE-NEXT: 	.byte	255                             // 0xff
; CHECK-BE-NEXT: 	.byte	255                             // 0xff
; CHECK-BE-NEXT: 	.byte	0                               // 0x0
; CHECK-BE-NEXT: 	.byte	255                             // 0xff
; CHECK-BE-NEXT: 	.byte	255                             // 0xff
; CHECK-BE-NEXT: 	.byte	255                             // 0xff
; CHECK-BE-NEXT: 	.byte	1                               // 0x1
; CHECK-BE-NEXT: 	.byte	255                             // 0xff
; CHECK-BE-NEXT: 	.byte	255                             // 0xff
; CHECK-BE-NEXT: 	.byte	255                             // 0xff
; CHECK-BE-NEXT: 	.byte	2                               // 0x2
; CHECK-BE-NEXT: 	.byte	255                             // 0xff
; CHECK-BE-NEXT: 	.byte	255                             // 0xff
; CHECK-BE-NEXT: 	.byte	255                             // 0xff
; CHECK-BE-NEXT: 	.byte	3                               // 0x3
; CHECK-BE-NEXT: .LCPI6_1:
; CHECK-BE-NEXT: 	.byte	255                             // 0xff
; CHECK-BE-NEXT: 	.byte	255                             // 0xff
; CHECK-BE-NEXT: 	.byte	255                             // 0xff
; CHECK-BE-NEXT: 	.byte	4                               // 0x4
; CHECK-BE-NEXT: 	.byte	255                             // 0xff
; CHECK-BE-NEXT: 	.byte	255                             // 0xff
; CHECK-BE-NEXT: 	.byte	255                             // 0xff
; CHECK-BE-NEXT: 	.byte	5                               // 0x5
; CHECK-BE-NEXT: 	.byte	255                             // 0xff
; CHECK-BE-NEXT: 	.byte	255                             // 0xff
; CHECK-BE-NEXT: 	.byte	255                             // 0xff
; CHECK-BE-NEXT: 	.byte	6                               // 0x6
; CHECK-BE-NEXT: 	.byte	255                             // 0xff
; CHECK-BE-NEXT: 	.byte	255                             // 0xff
; CHECK-BE-NEXT: 	.byte	255                             // 0xff
; CHECK-BE-NEXT: 	.byte	7                               // 0x7

; zext <8 x i8> to <8 x i32> in a single-block loop. The expected output for
; both targets loads two mask constants before the loop and performs the
; widening with two tbl instructions inside it.
define void @zext_v8i8_to_v8i32_in_loop(ptr %src, ptr %dst) {
; CHECK-LABEL: zext_v8i8_to_v8i32_in_loop:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:  Lloh8:
; CHECK-NEXT:    adrp x8, lCPI6_0@PAGE
; CHECK-NEXT:  Lloh9:
; CHECK-NEXT:    adrp x9, lCPI6_1@PAGE
; CHECK-NEXT:  Lloh10:
; CHECK-NEXT:    ldr q0, [x8, lCPI6_0@PAGEOFF]
; CHECK-NEXT:  Lloh11:
; CHECK-NEXT:    ldr q1, [x9, lCPI6_1@PAGEOFF]
; CHECK-NEXT:    mov x8, xzr
; CHECK-NEXT:  LBB6_1: ; %loop
; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    ldr d2, [x0, x8]
; CHECK-NEXT:    add x8, x8, #16
; CHECK-NEXT:    cmp x8, #128
; CHECK-NEXT:    tbl.16b v3, { v2 }, v1
; CHECK-NEXT:    tbl.16b v2, { v2 }, v0
; CHECK-NEXT:    stp q2, q3, [x1], #64
; CHECK-NEXT:    b.ne LBB6_1
; CHECK-NEXT:  ; %bb.2: ; %exit
; CHECK-NEXT:    ret
; CHECK-NEXT:    .loh AdrpLdr Lloh9, Lloh11
; CHECK-NEXT:    .loh AdrpLdr Lloh8, Lloh10
;
; CHECK-BE-LABEL: zext_v8i8_to_v8i32_in_loop:
; CHECK-BE:       // %bb.0: // %entry
; CHECK-BE-NEXT:    adrp x8, .LCPI6_0
; CHECK-BE-NEXT:    add x8, x8, :lo12:.LCPI6_0
; CHECK-BE-NEXT:    ld1 { v0.16b }, [x8]
; CHECK-BE-NEXT:    adrp x8, .LCPI6_1
; CHECK-BE-NEXT:    add x8, x8, :lo12:.LCPI6_1
; CHECK-BE-NEXT:    ld1 { v1.16b }, [x8]
; CHECK-BE-NEXT:    mov x8, xzr
; CHECK-BE-NEXT:  .LBB6_1: // %loop
; CHECK-BE-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-BE-NEXT:    add x9, x0, x8
; CHECK-BE-NEXT:    add x8, x8, #16
; CHECK-BE-NEXT:    ld1 { v2.8b }, [x9]
; CHECK-BE-NEXT:    add x9, x1, #16
; CHECK-BE-NEXT:    cmp x8, #128
; CHECK-BE-NEXT:    tbl v3.16b, { v2.16b }, v1.16b
; CHECK-BE-NEXT:    tbl v2.16b, { v2.16b }, v0.16b
; CHECK-BE-NEXT:    st1 { v2.16b }, [x1]
; CHECK-BE-NEXT:    add x1, x1, #64
; CHECK-BE-NEXT:    st1 { v3.16b }, [x9]
; CHECK-BE-NEXT:    b.ne .LBB6_1
; CHECK-BE-NEXT:  // %bb.2: // %exit
; CHECK-BE-NEXT:    ret
entry:
  br label %loop

loop:
  ; %iv runs 0, 16, ..., 112; the loop exits when the next value is 128.
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
  %src.gep = getelementptr i8, ptr %src, i64 %iv
  ; Only 8 bytes are read per iteration even though %iv steps by 16.
  ; NOTE(review): the step of 16 looks carried over from the 16-lane tests;
  ; it leaves gaps in both buffers. The expected output encodes the resulting
  ; #64 stride, so changing the step requires regenerating the assertions.
  %load = load <8 x i8>, ptr %src.gep
  ; The zext under test.
  %ext = zext <8 x i8> %load to <8 x i32>
  ; i32 indexing by %iv gives a 64-byte destination stride, while the store
  ; below writes 32 bytes.
  %dst.gep = getelementptr i32, ptr %dst, i64 %iv
  store <8 x i32> %ext, ptr %dst.gep
  %iv.next = add nuw i64 %iv, 16
  %ec = icmp eq i64 %iv.next, 128
  br i1 %ec, label %exit, label %loop

exit:
  ret void
}

; zext <16 x i8> to <16 x i64> in a single-block loop. The expected output for
; both targets widens in three ushll/ushll2 stages (8h, then 4s, then 2d) and
; uses no tbl and no mask constants.
define void @zext_v16i8_to_v16i64_in_loop(ptr %src, ptr %dst) {
; CHECK-LABEL: zext_v16i8_to_v16i64_in_loop:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    mov x8, xzr
; CHECK-NEXT:  LBB7_1: ; %loop
; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    ldr q0, [x0, x8]
; CHECK-NEXT:    add x8, x8, #16
; CHECK-NEXT:    cmp x8, #128
; CHECK-NEXT:    ushll2.8h v1, v0, #0
; CHECK-NEXT:    ushll.8h v0, v0, #0
; CHECK-NEXT:    ushll2.4s v2, v1, #0
; CHECK-NEXT:    ushll.4s v1, v1, #0
; CHECK-NEXT:    ushll2.4s v4, v0, #0
; CHECK-NEXT:    ushll.4s v0, v0, #0
; CHECK-NEXT:    ushll2.2d v3, v2, #0
; CHECK-NEXT:    ushll.2d v2, v2, #0
; CHECK-NEXT:    ushll2.2d v5, v1, #0
; CHECK-NEXT:    ushll.2d v1, v1, #0
; CHECK-NEXT:    stp q2, q3, [x1, #96]
; CHECK-NEXT:    ushll2.2d v3, v4, #0
; CHECK-NEXT:    ushll.2d v2, v4, #0
; CHECK-NEXT:    ushll2.2d v4, v0, #0
; CHECK-NEXT:    ushll.2d v0, v0, #0
; CHECK-NEXT:    stp q1, q5, [x1, #64]
; CHECK-NEXT:    stp q2, q3, [x1, #32]
; CHECK-NEXT:    stp q0, q4, [x1], #128
; CHECK-NEXT:    b.ne LBB7_1
; CHECK-NEXT:  ; %bb.2: ; %exit
; CHECK-NEXT:    ret
;
; CHECK-BE-LABEL: zext_v16i8_to_v16i64_in_loop:
; CHECK-BE:       // %bb.0: // %entry
; CHECK-BE-NEXT:    mov x8, xzr
; CHECK-BE-NEXT:  .LBB7_1: // %loop
; CHECK-BE-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-BE-NEXT:    add x9, x0, x8
; CHECK-BE-NEXT:    add x8, x8, #16
; CHECK-BE-NEXT:    ld1 { v0.16b }, [x9]
; CHECK-BE-NEXT:    add x9, x1, #112
; CHECK-BE-NEXT:    cmp x8, #128
; CHECK-BE-NEXT:    ushll2 v1.8h, v0.16b, #0
; CHECK-BE-NEXT:    ushll v0.8h, v0.8b, #0
; CHECK-BE-NEXT:    ushll2 v2.4s, v1.8h, #0
; CHECK-BE-NEXT:    ushll v1.4s, v1.4h, #0
; CHECK-BE-NEXT:    ushll2 v4.4s, v0.8h, #0
; CHECK-BE-NEXT:    ushll v0.4s, v0.4h, #0
; CHECK-BE-NEXT:    ushll2 v3.2d, v2.4s, #0
; CHECK-BE-NEXT:    ushll v2.2d, v2.2s, #0
; CHECK-BE-NEXT:    ushll2 v5.2d, v1.4s, #0
; CHECK-BE-NEXT:    ushll v1.2d, v1.2s, #0
; CHECK-BE-NEXT:    st1 { v3.2d }, [x9]
; CHECK-BE-NEXT:    add x9, x1, #96
; CHECK-BE-NEXT:    ushll2 v3.2d, v4.4s, #0
; CHECK-BE-NEXT:    st1 { v2.2d }, [x9]
; CHECK-BE-NEXT:    add x9, x1, #80
; CHECK-BE-NEXT:    ushll v2.2d, v4.2s, #0
; CHECK-BE-NEXT:    st1 { v5.2d }, [x9]
; CHECK-BE-NEXT:    add x9, x1, #48
; CHECK-BE-NEXT:    st1 { v3.2d }, [x9]
; CHECK-BE-NEXT:    ushll2 v3.2d, v0.4s, #0
; CHECK-BE-NEXT:    ushll v0.2d, v0.2s, #0
; CHECK-BE-NEXT:    add x9, x1, #64
; CHECK-BE-NEXT:    st1 { v1.2d }, [x9]
; CHECK-BE-NEXT:    add x9, x1, #32
; CHECK-BE-NEXT:    st1 { v2.2d }, [x9]
; CHECK-BE-NEXT:    add x9, x1, #16
; CHECK-BE-NEXT:    st1 { v0.2d }, [x1]
; CHECK-BE-NEXT:    add x1, x1, #128
; CHECK-BE-NEXT:    st1 { v3.2d }, [x9]
; CHECK-BE-NEXT:    b.ne .LBB7_1
; CHECK-BE-NEXT:  // %bb.2: // %exit
; CHECK-BE-NEXT:    ret


entry:
  br label %loop

loop:
  ; %iv runs 0, 16, ..., 112; the loop exits when the next value is 128.
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
  %src.gep = getelementptr i8, ptr %src, i64 %iv
  %load = load <16 x i8>, ptr %src.gep
  ; The zext under test: an 8x widening of every lane.
  %ext = zext <16 x i8> %load to <16 x i64>
  ; i64 indexing by %iv gives a 128-byte destination stride per iteration.
  %dst.gep = getelementptr i64, ptr %dst, i64 %iv
  store <16 x i64> %ext, ptr %dst.gep
  %iv.next = add nuw i64 %iv, 16
  %ec = icmp eq i64 %iv.next, 128
  br i1 %ec, label %exit, label %loop

exit:
  ret void
}

; Loop test: every iteration loads <8 x i8> from %src + %iv, zero-extends it
; to <8 x i64> and stores the 64-byte result at %dst + %iv * 8. %iv steps by 16
; and the loop exits once it reaches 128, i.e. after 8 iterations.
; Expected lowering on both targets (arm64-apple-ios and aarch64_be): a chain
; of ushll/ushll2 widening steps (i8 -> i16 -> i32 -> i64); no tbl is expected.
define void @zext_v8i8_to_v8i64_in_loop(ptr %src, ptr %dst) {
; CHECK-LABEL: zext_v8i8_to_v8i64_in_loop:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    mov x8, xzr
; CHECK-NEXT:  LBB8_1: ; %loop
; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    ldr d0, [x0, x8]
; CHECK-NEXT:    add x8, x8, #16
; CHECK-NEXT:    cmp x8, #128
; CHECK-NEXT:    ushll.8h v0, v0, #0
; CHECK-NEXT:    ushll2.4s v1, v0, #0
; CHECK-NEXT:    ushll.4s v0, v0, #0
; CHECK-NEXT:    ushll2.2d v2, v1, #0
; CHECK-NEXT:    ushll.2d v1, v1, #0
; CHECK-NEXT:    ushll2.2d v3, v0, #0
; CHECK-NEXT:    ushll.2d v0, v0, #0
; CHECK-NEXT:    stp q1, q2, [x1, #32]
; CHECK-NEXT:    stp q0, q3, [x1], #128
; CHECK-NEXT:    b.ne LBB8_1
; CHECK-NEXT:  ; %bb.2: ; %exit
; CHECK-NEXT:    ret
;
; CHECK-BE-LABEL: zext_v8i8_to_v8i64_in_loop:
; CHECK-BE:       // %bb.0: // %entry
; CHECK-BE-NEXT:    mov x8, xzr
; CHECK-BE-NEXT:  .LBB8_1: // %loop
; CHECK-BE-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-BE-NEXT:    add x9, x0, x8
; CHECK-BE-NEXT:    add x8, x8, #16
; CHECK-BE-NEXT:    ld1 { v0.8b }, [x9]
; CHECK-BE-NEXT:    add x9, x1, #48
; CHECK-BE-NEXT:    cmp x8, #128
; CHECK-BE-NEXT:    ushll v0.8h, v0.8b, #0
; CHECK-BE-NEXT:    ushll2 v1.4s, v0.8h, #0
; CHECK-BE-NEXT:    ushll v0.4s, v0.4h, #0
; CHECK-BE-NEXT:    ushll2 v2.2d, v1.4s, #0
; CHECK-BE-NEXT:    ushll v1.2d, v1.2s, #0
; CHECK-BE-NEXT:    ushll2 v3.2d, v0.4s, #0
; CHECK-BE-NEXT:    ushll v0.2d, v0.2s, #0
; CHECK-BE-NEXT:    st1 { v2.2d }, [x9]
; CHECK-BE-NEXT:    add x9, x1, #32
; CHECK-BE-NEXT:    st1 { v1.2d }, [x9]
; CHECK-BE-NEXT:    add x9, x1, #16
; CHECK-BE-NEXT:    st1 { v0.2d }, [x1]
; CHECK-BE-NEXT:    add x1, x1, #128
; CHECK-BE-NEXT:    st1 { v3.2d }, [x9]
; CHECK-BE-NEXT:    b.ne .LBB8_1
; CHECK-BE-NEXT:  // %bb.2: // %exit
; CHECK-BE-NEXT:    ret


entry:
  br label %loop

loop:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
  ; i8-typed GEP: %iv is a plain byte offset. Only 8 bytes are loaded although
  ; %iv advances by 16 each iteration.
  %src.gep = getelementptr i8, ptr %src, i64 %iv
  %load = load <8 x i8>, ptr %src.gep
  %ext = zext <8 x i8> %load to <8 x i64>
  ; i64-typed GEP: the destination byte offset is %iv * 8, so the store address
  ; moves 128 bytes per iteration (the #128 pointer bump in the expected
  ; assembly above).
  %dst.gep = getelementptr i64, ptr %dst, i64 %iv
  store <8 x i64> %ext, ptr %dst.gep
  %iv.next = add nuw i64 %iv, 16
  ; Exit after the eighth iteration (%iv.next == 128).
  %ec = icmp eq i64 %iv.next, 128
  br i1 %ec, label %exit, label %loop

exit:
  ret void
}

; Loop test: every iteration loads <8 x i8> from %src + %iv, zero-extends it
; to <8 x i16> and stores the 16-byte result at %dst + %iv * 2. %iv steps by 16
; and the loop exits once it reaches 128, i.e. after 8 iterations.
; Expected lowering on both targets: a single ushll from 8 x i8 to 8 x i16.
define void @zext_v8i8_to_v8i16_in_loop(ptr %src, ptr %dst) {
; CHECK-LABEL: zext_v8i8_to_v8i16_in_loop:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    mov x8, xzr
; CHECK-NEXT:  LBB9_1: ; %loop
; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    ldr d0, [x0, x8]
; CHECK-NEXT:    add x8, x8, #16
; CHECK-NEXT:    cmp x8, #128
; CHECK-NEXT:    ushll.8h v0, v0, #0
; CHECK-NEXT:    str q0, [x1], #32
; CHECK-NEXT:    b.ne LBB9_1
; CHECK-NEXT:  ; %bb.2: ; %exit
; CHECK-NEXT:    ret
;
; CHECK-BE-LABEL: zext_v8i8_to_v8i16_in_loop:
; CHECK-BE:       // %bb.0: // %entry
; CHECK-BE-NEXT:    mov x8, xzr
; CHECK-BE-NEXT:  .LBB9_1: // %loop
; CHECK-BE-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-BE-NEXT:    add x9, x0, x8
; CHECK-BE-NEXT:    add x8, x8, #16
; CHECK-BE-NEXT:    ld1 { v0.8b }, [x9]
; CHECK-BE-NEXT:    cmp x8, #128
; CHECK-BE-NEXT:    ushll v0.8h, v0.8b, #0
; CHECK-BE-NEXT:    st1 { v0.8h }, [x1]
; CHECK-BE-NEXT:    add x1, x1, #32
; CHECK-BE-NEXT:    b.ne .LBB9_1
; CHECK-BE-NEXT:  // %bb.2: // %exit
; CHECK-BE-NEXT:    ret



entry:
  br label %loop

loop:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
  ; i8-typed GEP: %iv is a plain byte offset into the source.
  %src.gep = getelementptr i8, ptr %src, i64 %iv
  %load = load <8 x i8>, ptr %src.gep
  %ext = zext <8 x i8> %load to <8 x i16>
  ; i16-typed GEP: the destination byte offset is %iv * 2, so the store address
  ; moves 32 bytes per iteration (the #32 pointer bump in the expected
  ; assembly above).
  %dst.gep = getelementptr i16, ptr %dst, i64 %iv
  store <8 x i16> %ext, ptr %dst.gep
  %iv.next = add nuw i64 %iv, 16
  ; Exit after the eighth iteration (%iv.next == 128).
  %ec = icmp eq i64 %iv.next, 128
  br i1 %ec, label %exit, label %loop

exit:
  ret void
}

; Loop test with a non-power-of-two destination element width: every
; iteration loads <8 x i8> from %src + %iv, zero-extends it to <8 x i20> and
; stores the result through an i20-typed GEP on %dst. %iv steps by 16 and the
; loop exits once it reaches 128, i.e. after 8 iterations.
; Expected lowering on both targets: the bytes are widened to 32-bit lanes with
; ushll/ushll2, the lanes are moved to general-purpose registers, packed with
; shifts and orr (plus bfi on aarch64_be), and written with scalar stores that
; together cover 20 bytes (8 at offset 0, 2 at offset 8, 8 at offset 10 and
; 2 at offset 18).
define void @zext_v8i8_to_v8i20_in_loop(ptr %src, ptr %dst) {
; CHECK-LABEL: zext_v8i8_to_v8i20_in_loop:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    mov x8, xzr
; CHECK-NEXT:  LBB10_1: ; %loop
; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    ldr d0, [x0, x8]
; CHECK-NEXT:    add x8, x8, #16
; CHECK-NEXT:    cmp x8, #128
; CHECK-NEXT:    ushll.8h v0, v0, #0
; CHECK-NEXT:    ushll2.4s v1, v0, #0
; CHECK-NEXT:    ushll.4s v0, v0, #0
; CHECK-NEXT:    mov.s w9, v1[1]
; CHECK-NEXT:    mov.s w11, v0[1]
; CHECK-NEXT:    fmov w10, s1
; CHECK-NEXT:    fmov w14, s0
; CHECK-NEXT:    mov.s w13, v1[2]
; CHECK-NEXT:    mov.s w16, v0[2]
; CHECK-NEXT:    mov.s w12, v1[3]
; CHECK-NEXT:    mov.s w15, v0[3]
; CHECK-NEXT:    orr x9, x10, x9, lsl #20
; CHECK-NEXT:    orr x10, x14, x11, lsl #20
; CHECK-NEXT:    orr x9, x9, x13, lsl #40
; CHECK-NEXT:    orr x10, x10, x16, lsl #40
; CHECK-NEXT:    lsr w11, w12, #4
; CHECK-NEXT:    lsr w13, w15, #4
; CHECK-NEXT:    orr x9, x9, x12, lsl #60
; CHECK-NEXT:    orr x10, x10, x15, lsl #60
; CHECK-NEXT:    strh w11, [x1, #18]
; CHECK-NEXT:    strh w13, [x1, #8]
; CHECK-NEXT:    stur x9, [x1, #10]
; CHECK-NEXT:    str x10, [x1], #64
; CHECK-NEXT:    b.ne LBB10_1
; CHECK-NEXT:  ; %bb.2: ; %exit
; CHECK-NEXT:    ret
;
; CHECK-BE-LABEL: zext_v8i8_to_v8i20_in_loop:
; CHECK-BE:       // %bb.0: // %entry
; CHECK-BE-NEXT:    mov x8, xzr
; CHECK-BE-NEXT:  .LBB10_1: // %loop
; CHECK-BE-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-BE-NEXT:    add x9, x0, x8
; CHECK-BE-NEXT:    add x8, x8, #16
; CHECK-BE-NEXT:    ld1 { v0.8b }, [x9]
; CHECK-BE-NEXT:    cmp x8, #128
; CHECK-BE-NEXT:    ushll v0.8h, v0.8b, #0
; CHECK-BE-NEXT:    ushll2 v1.4s, v0.8h, #0
; CHECK-BE-NEXT:    ushll v0.4s, v0.4h, #0
; CHECK-BE-NEXT:    mov w9, v1.s[1]
; CHECK-BE-NEXT:    mov w10, v0.s[1]
; CHECK-BE-NEXT:    fmov w12, s1
; CHECK-BE-NEXT:    fmov w14, s0
; CHECK-BE-NEXT:    mov w11, v1.s[2]
; CHECK-BE-NEXT:    mov w13, v0.s[2]
; CHECK-BE-NEXT:    mov w15, v1.s[3]
; CHECK-BE-NEXT:    lsl x9, x9, #40
; CHECK-BE-NEXT:    lsl x10, x10, #40
; CHECK-BE-NEXT:    orr x9, x9, x12, lsl #60
; CHECK-BE-NEXT:    orr x10, x10, x14, lsl #60
; CHECK-BE-NEXT:    lsr x12, x12, #4
; CHECK-BE-NEXT:    strh w15, [x1, #18]
; CHECK-BE-NEXT:    orr x9, x9, x11, lsl #20
; CHECK-BE-NEXT:    orr x10, x10, x13, lsl #20
; CHECK-BE-NEXT:    mov w11, v0.s[3]
; CHECK-BE-NEXT:    lsr x13, x14, #4
; CHECK-BE-NEXT:    lsr x9, x9, #16
; CHECK-BE-NEXT:    lsr x10, x10, #16
; CHECK-BE-NEXT:    bfi x9, x12, #48, #4
; CHECK-BE-NEXT:    bfi x10, x13, #48, #4
; CHECK-BE-NEXT:    strh w11, [x1, #8]
; CHECK-BE-NEXT:    stur x9, [x1, #10]
; CHECK-BE-NEXT:    str x10, [x1], #64
; CHECK-BE-NEXT:    b.ne .LBB10_1
; CHECK-BE-NEXT:  // %bb.2: // %exit
; CHECK-BE-NEXT:    ret


entry:
  br label %loop

loop:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
  ; i8-typed GEP: %iv is a plain byte offset into the source.
  %src.gep = getelementptr i8, ptr %src, i64 %iv
  %load = load <8 x i8>, ptr %src.gep
  %ext = zext <8 x i8> %load to <8 x i20>
  ; i20-typed GEP: the expected assembly above advances the destination pointer
  ; by 64 bytes per iteration for the %iv step of 16.
  %dst.gep = getelementptr i20, ptr %dst, i64 %iv
  store <8 x i20> %ext, ptr %dst.gep
  %iv.next = add nuw i64 %iv, 16
  ; Exit after the eighth iteration (%iv.next == 128).
  %ec = icmp eq i64 %iv.next, 128
  br i1 %ec, label %exit, label %loop

exit:
  ret void
}

; Loop test: every iteration loads <4 x i8> from %src + %iv, zero-extends it
; to <4 x i32> and stores the 16-byte result at %dst + %iv * 4. %iv steps by 16
; and the loop exits once it reaches 128, i.e. after 8 iterations.
; The two targets are expected to lower this differently:
;  * arm64-apple-ios: two ushll steps (i8 -> i16 -> i32).
;  * aarch64_be: rev32 followed by a tbl whose mask is loaded from .LCPI11_0
;    once, before the loop.
define void @zext_v4i8_to_v4i32_in_loop(ptr %src, ptr %dst) {
; CHECK-LABEL: zext_v4i8_to_v4i32_in_loop:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    mov x8, xzr
; CHECK-NEXT:  LBB11_1: ; %loop
; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    ldr s0, [x0, x8]
; CHECK-NEXT:    add x8, x8, #16
; CHECK-NEXT:    cmp x8, #128
; CHECK-NEXT:    ushll.8h v0, v0, #0
; CHECK-NEXT:    ushll.4s v0, v0, #0
; CHECK-NEXT:    str q0, [x1], #64
; CHECK-NEXT:    b.ne LBB11_1
; CHECK-NEXT:  ; %bb.2: ; %exit
; CHECK-NEXT:    ret
;
; CHECK-BE-LABEL: zext_v4i8_to_v4i32_in_loop:
; CHECK-BE:       // %bb.0: // %entry
; CHECK-BE-NEXT:    adrp x8, .LCPI11_0
; CHECK-BE-NEXT:    add x8, x8, :lo12:.LCPI11_0
; CHECK-BE-NEXT:    ld1 { v0.16b }, [x8]
; CHECK-BE-NEXT:    mov x8, xzr
; CHECK-BE-NEXT:  .LBB11_1: // %loop
; CHECK-BE-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-BE-NEXT:    ldr s1, [x0, x8]
; CHECK-BE-NEXT:    add x8, x8, #16
; CHECK-BE-NEXT:    cmp x8, #128
; CHECK-BE-NEXT:    rev32 v1.16b, v1.16b
; CHECK-BE-NEXT:    tbl v1.16b, { v1.16b }, v0.16b
; CHECK-BE-NEXT:    st1 { v1.16b }, [x1]
; CHECK-BE-NEXT:    add x1, x1, #64
; CHECK-BE-NEXT:    b.ne .LBB11_1
; CHECK-BE-NEXT:  // %bb.2: // %exit
; CHECK-BE-NEXT:    ret


entry:
  br label %loop

loop:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
  ; i8-typed GEP: %iv is a plain byte offset. Only 4 bytes are loaded although
  ; %iv advances by 16 each iteration.
  %src.gep = getelementptr i8, ptr %src, i64 %iv
  %load = load <4 x i8>, ptr %src.gep
  %ext = zext <4 x i8> %load to <4 x i32>
  ; i32-typed GEP: the destination byte offset is %iv * 4, so the store address
  ; moves 64 bytes per iteration (the #64 pointer bump in the expected
  ; assembly above).
  %dst.gep = getelementptr i32, ptr %dst, i64 %iv
  store <4 x i32> %ext, ptr %dst.gep
  %iv.next = add nuw i64 %iv, 16
  ; Exit after the eighth iteration (%iv.next == 128).
  %ec = icmp eq i64 %iv.next, 128
  br i1 %ec, label %exit, label %loop

exit:
  ret void
}

; CHECK-LABEL: lCPI12_0:
; CHECK-NEXT: 	.byte	0                               ; 0x0
; CHECK-NEXT: 	.byte	255                             ; 0xff
; CHECK-NEXT: 	.byte	255                             ; 0xff
; CHECK-NEXT: 	.byte	255                             ; 0xff
; CHECK-NEXT: 	.byte	1                               ; 0x1
; CHECK-NEXT: 	.byte	255                             ; 0xff
; CHECK-NEXT: 	.byte	255                             ; 0xff
; CHECK-NEXT: 	.byte	255                             ; 0xff
; CHECK-NEXT: 	.byte	2                               ; 0x2
; CHECK-NEXT: 	.byte	255                             ; 0xff
; CHECK-NEXT: 	.byte	255                             ; 0xff
; CHECK-NEXT: 	.byte	255                             ; 0xff
; CHECK-NEXT: 	.byte	3                               ; 0x3
; CHECK-NEXT: 	.byte	255                             ; 0xff
; CHECK-NEXT: 	.byte	255                             ; 0xff
; CHECK-NEXT: 	.byte	255                             ; 0xff
; CHECK-NEXT: lCPI12_1:
; CHECK-NEXT: 	.byte	4                               ; 0x4
; CHECK-NEXT: 	.byte	255                             ; 0xff
; CHECK-NEXT: 	.byte	255                             ; 0xff
; CHECK-NEXT: 	.byte	255                             ; 0xff
; CHECK-NEXT: 	.byte	5                               ; 0x5
; CHECK-NEXT: 	.byte	255                             ; 0xff
; CHECK-NEXT: 	.byte	255                             ; 0xff
; CHECK-NEXT: 	.byte	255                             ; 0xff
; CHECK-NEXT: 	.byte	6                               ; 0x6
; CHECK-NEXT: 	.byte	255                             ; 0xff
; CHECK-NEXT: 	.byte	255                             ; 0xff
; CHECK-NEXT: 	.byte	255                             ; 0xff
; CHECK-NEXT: 	.byte	7                               ; 0x7
; CHECK-NEXT: 	.byte	255                             ; 0xff
; CHECK-NEXT: 	.byte	255                             ; 0xff
; CHECK-NEXT: 	.byte	255                             ; 0xff
; CHECK-NEXT: lCPI12_2:
; CHECK-NEXT: 	.byte	8                               ; 0x8
; CHECK-NEXT: 	.byte	255                             ; 0xff
; CHECK-NEXT: 	.byte	255                             ; 0xff
; CHECK-NEXT: 	.byte	255                             ; 0xff
; CHECK-NEXT: 	.byte	9                               ; 0x9
; CHECK-NEXT: 	.byte	255                             ; 0xff
; CHECK-NEXT: 	.byte	255                             ; 0xff
; CHECK-NEXT: 	.byte	255                             ; 0xff
; CHECK-NEXT: 	.byte	10                              ; 0xa
; CHECK-NEXT: 	.byte	255                             ; 0xff
; CHECK-NEXT: 	.byte	255                             ; 0xff
; CHECK-NEXT: 	.byte	255                             ; 0xff
; CHECK-NEXT: 	.byte	11                              ; 0xb
; CHECK-NEXT: 	.byte	255                             ; 0xff
; CHECK-NEXT: 	.byte	255                             ; 0xff
; CHECK-NEXT: 	.byte	255                             ; 0xff

; CHECK-BE-LABEL: .LCPI12_0:
; CHECK-BE-NEXT: 	.byte	255                             // 0xff
; CHECK-BE-NEXT: 	.byte	255                             // 0xff
; CHECK-BE-NEXT: 	.byte	255                             // 0xff
; CHECK-BE-NEXT: 	.byte	0                               // 0x0
; CHECK-BE-NEXT: 	.byte	255                             // 0xff
; CHECK-BE-NEXT: 	.byte	255                             // 0xff
; CHECK-BE-NEXT: 	.byte	255                             // 0xff
; CHECK-BE-NEXT: 	.byte	1                               // 0x1
; CHECK-BE-NEXT: 	.byte	255                             // 0xff
; CHECK-BE-NEXT: 	.byte	255                             // 0xff
; CHECK-BE-NEXT: 	.byte	255                             // 0xff
; CHECK-BE-NEXT: 	.byte	2                               // 0x2
; CHECK-BE-NEXT: 	.byte	255                             // 0xff
; CHECK-BE-NEXT: 	.byte	255                             // 0xff
; CHECK-BE-NEXT: 	.byte	255                             // 0xff
; CHECK-BE-NEXT: 	.byte	3                               // 0x3
; CHECK-BE-NEXT: .LCPI12_1:
; CHECK-BE-NEXT: 	.byte	255                             // 0xff
; CHECK-BE-NEXT: 	.byte	255                             // 0xff
; CHECK-BE-NEXT: 	.byte	255                             // 0xff
; CHECK-BE-NEXT: 	.byte	4                               // 0x4
; CHECK-BE-NEXT: 	.byte	255                             // 0xff
; CHECK-BE-NEXT: 	.byte	255                             // 0xff
; CHECK-BE-NEXT: 	.byte	255                             // 0xff
; CHECK-BE-NEXT: 	.byte	5                               // 0x5
; CHECK-BE-NEXT: 	.byte	255                             // 0xff
; CHECK-BE-NEXT: 	.byte	255                             // 0xff
; CHECK-BE-NEXT: 	.byte	255                             // 0xff
; CHECK-BE-NEXT: 	.byte	6                               // 0x6
; CHECK-BE-NEXT: 	.byte	255                             // 0xff
; CHECK-BE-NEXT: 	.byte	255                             // 0xff
; CHECK-BE-NEXT: 	.byte	255                             // 0xff
; CHECK-BE-NEXT: 	.byte	7                               // 0x7
; CHECK-BE-NEXT: .LCPI12_2:
; CHECK-BE-NEXT: 	.byte	255                             // 0xff
; CHECK-BE-NEXT: 	.byte	255                             // 0xff
; CHECK-BE-NEXT: 	.byte	255                             // 0xff
; CHECK-BE-NEXT: 	.byte	8                               // 0x8
; CHECK-BE-NEXT: 	.byte	255                             // 0xff
; CHECK-BE-NEXT: 	.byte	255                             // 0xff
; CHECK-BE-NEXT: 	.byte	255                             // 0xff
; CHECK-BE-NEXT: 	.byte	9                               // 0x9
; CHECK-BE-NEXT: 	.byte	255                             // 0xff
; CHECK-BE-NEXT: 	.byte	255                             // 0xff
; CHECK-BE-NEXT: 	.byte	255                             // 0xff
; CHECK-BE-NEXT: 	.byte	10                              // 0xa
; CHECK-BE-NEXT: 	.byte	255                             // 0xff
; CHECK-BE-NEXT: 	.byte	255                             // 0xff
; CHECK-BE-NEXT: 	.byte	255                             // 0xff
; CHECK-BE-NEXT: 	.byte	11                              // 0xb

; Loop test with a non-power-of-two element count: every iteration loads
; <12 x i8> from %src + %iv, zero-extends it to <12 x i32> and stores the
; 48-byte result at %dst + %iv * 4. %iv steps by 16 and the loop exits once it
; reaches 128, i.e. after 8 iterations.
; Expected lowering on both targets: three tbl instructions per iteration, one
; per 128-bit output register, using the three masks from constant-pool
; entries 12_0, 12_1 and 12_2 (listed before this function). The masks are
; loaded once, before the loop.
define void @zext_v12i8_to_v12i32_in_loop(ptr %src, ptr %dst) {
; CHECK-LABEL: zext_v12i8_to_v12i32_in_loop:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:  Lloh12:
; CHECK-NEXT:    adrp x8, lCPI12_0@PAGE
; CHECK-NEXT:  Lloh13:
; CHECK-NEXT:    adrp x9, lCPI12_1@PAGE
; CHECK-NEXT:  Lloh14:
; CHECK-NEXT:    adrp x10, lCPI12_2@PAGE
; CHECK-NEXT:  Lloh15:
; CHECK-NEXT:    ldr q0, [x8, lCPI12_0@PAGEOFF]
; CHECK-NEXT:  Lloh16:
; CHECK-NEXT:    ldr q1, [x9, lCPI12_1@PAGEOFF]
; CHECK-NEXT:  Lloh17:
; CHECK-NEXT:    ldr q2, [x10, lCPI12_2@PAGEOFF]
; CHECK-NEXT:    mov x8, xzr
; CHECK-NEXT:  LBB12_1: ; %loop
; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    ldr q3, [x0, x8]
; CHECK-NEXT:    add x8, x8, #16
; CHECK-NEXT:    cmp x8, #128
; CHECK-NEXT:    tbl.16b v4, { v3 }, v2
; CHECK-NEXT:    tbl.16b v5, { v3 }, v1
; CHECK-NEXT:    tbl.16b v3, { v3 }, v0
; CHECK-NEXT:    stp q5, q4, [x1, #16]
; CHECK-NEXT:    str q3, [x1], #64
; CHECK-NEXT:    b.ne LBB12_1
; CHECK-NEXT:  ; %bb.2: ; %exit
; CHECK-NEXT:    ret
; CHECK-NEXT:    .loh AdrpLdr Lloh14, Lloh17
; CHECK-NEXT:    .loh AdrpLdr Lloh13, Lloh16
; CHECK-NEXT:    .loh AdrpLdr Lloh12, Lloh15
;
; CHECK-BE-LABEL: zext_v12i8_to_v12i32_in_loop:
; CHECK-BE:       // %bb.0: // %entry
; CHECK-BE-NEXT:    adrp x8, .LCPI12_0
; CHECK-BE-NEXT:    add x8, x8, :lo12:.LCPI12_0
; CHECK-BE-NEXT:    ld1 { v0.16b }, [x8]
; CHECK-BE-NEXT:    adrp x8, .LCPI12_1
; CHECK-BE-NEXT:    add x8, x8, :lo12:.LCPI12_1
; CHECK-BE-NEXT:    ld1 { v1.16b }, [x8]
; CHECK-BE-NEXT:    adrp x8, .LCPI12_2
; CHECK-BE-NEXT:    add x8, x8, :lo12:.LCPI12_2
; CHECK-BE-NEXT:    ld1 { v2.16b }, [x8]
; CHECK-BE-NEXT:    mov x8, xzr
; CHECK-BE-NEXT:  .LBB12_1: // %loop
; CHECK-BE-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-BE-NEXT:    add x9, x0, x8
; CHECK-BE-NEXT:    add x8, x8, #16
; CHECK-BE-NEXT:    add x10, x1, #16
; CHECK-BE-NEXT:    ld1 { v3.16b }, [x9]
; CHECK-BE-NEXT:    add x9, x1, #32
; CHECK-BE-NEXT:    cmp x8, #128
; CHECK-BE-NEXT:    tbl v4.16b, { v3.16b }, v2.16b
; CHECK-BE-NEXT:    tbl v5.16b, { v3.16b }, v1.16b
; CHECK-BE-NEXT:    tbl v3.16b, { v3.16b }, v0.16b
; CHECK-BE-NEXT:    st1 { v3.16b }, [x1]
; CHECK-BE-NEXT:    add x1, x1, #64
; CHECK-BE-NEXT:    st1 { v4.16b }, [x9]
; CHECK-BE-NEXT:    st1 { v5.16b }, [x10]
; CHECK-BE-NEXT:    b.ne .LBB12_1
; CHECK-BE-NEXT:  // %bb.2: // %exit
; CHECK-BE-NEXT:    ret


entry:
  br label %loop

loop:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
  ; i8-typed GEP: %iv is a plain byte offset. The expected assembly above reads
  ; the 12 source bytes with a full 16-byte vector load.
  %src.gep = getelementptr i8, ptr %src, i64 %iv
  %load = load <12 x i8>, ptr %src.gep
  %ext = zext <12 x i8> %load to <12 x i32>
  ; i32-typed GEP: the destination byte offset is %iv * 4, so the store address
  ; moves 64 bytes per iteration while only 48 bytes are written.
  %dst.gep = getelementptr i32, ptr %dst, i64 %iv
  store <12 x i32> %ext, ptr %dst.gep
  %iv.next = add nuw i64 %iv, 16
  ; Exit after the eighth iteration (%iv.next == 128).
  %ec = icmp eq i64 %iv.next, 128
  br i1 %ec, label %exit, label %loop

exit:
  ret void
}

; Loop test with a sub-byte source element type: every iteration loads
; <16 x i4> through an i4-typed GEP on %src, zero-extends it to <16 x i32> and
; stores the 64-byte result at %dst + %iv * 4. %iv steps by 16 and the loop
; exits once it reaches 128, i.e. after 8 iterations.
; Expected lowering on both targets: the 16 nibbles are read with one 64-bit
; scalar load, each 4-bit field is extracted with ubfx/and/lsr and inserted
; into a byte lane of a vector register, and the bytes are then widened with
; zip1/zip2 + ushll and masked with a splat of 15 (the movi before the loop).
; arm64-apple-ios fills the lanes starting from the lowest nibble; aarch64_be
; starts from the highest nibble.
define void @zext_v16i4_to_v16i32_in_loop(ptr %src, ptr %dst) {
; CHECK-LABEL: zext_v16i4_to_v16i32_in_loop:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    movi.4s v0, #15
; CHECK-NEXT:    mov x8, xzr
; CHECK-NEXT:  LBB13_1: ; %loop
; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    ldr x9, [x0, x8]
; CHECK-NEXT:    add x8, x8, #16
; CHECK-NEXT:    cmp x8, #128
; CHECK-NEXT:    and w11, w9, #0xf
; CHECK-NEXT:    ubfx w10, w9, #4, #4
; CHECK-NEXT:    fmov s1, w11
; CHECK-NEXT:    mov.b v1[1], w10
; CHECK-NEXT:    ubfx w10, w9, #8, #4
; CHECK-NEXT:    mov.b v1[2], w10
; CHECK-NEXT:    ubfx w10, w9, #12, #4
; CHECK-NEXT:    mov.b v1[3], w10
; CHECK-NEXT:    ubfx w10, w9, #16, #4
; CHECK-NEXT:    mov.b v1[4], w10
; CHECK-NEXT:    ubfx w10, w9, #20, #4
; CHECK-NEXT:    mov.b v1[5], w10
; CHECK-NEXT:    ubfx w10, w9, #24, #4
; CHECK-NEXT:    mov.b v1[6], w10
; CHECK-NEXT:    ubfx x10, x9, #28, #4
; CHECK-NEXT:    mov.b v1[7], w10
; CHECK-NEXT:    ubfx x10, x9, #32, #4
; CHECK-NEXT:    mov.b v1[8], w10
; CHECK-NEXT:    ubfx x10, x9, #36, #4
; CHECK-NEXT:    mov.b v1[9], w10
; CHECK-NEXT:    ubfx x10, x9, #40, #4
; CHECK-NEXT:    mov.b v1[10], w10
; CHECK-NEXT:    ubfx x10, x9, #44, #4
; CHECK-NEXT:    mov.b v1[11], w10
; CHECK-NEXT:    ubfx x10, x9, #48, #4
; CHECK-NEXT:    mov.b v1[12], w10
; CHECK-NEXT:    ubfx x10, x9, #52, #4
; CHECK-NEXT:    mov.b v1[13], w10
; CHECK-NEXT:    ubfx x10, x9, #56, #4
; CHECK-NEXT:    lsr x9, x9, #60
; CHECK-NEXT:    mov.b v1[14], w10
; CHECK-NEXT:    mov.b v1[15], w9
; CHECK-NEXT:    ext.16b v2, v1, v1, #8
; CHECK-NEXT:    zip2.8b v3, v1, v0
; CHECK-NEXT:    zip1.8b v1, v1, v0
; CHECK-NEXT:    zip2.8b v4, v2, v0
; CHECK-NEXT:    zip1.8b v2, v2, v0
; CHECK-NEXT:    ushll.4s v3, v3, #0
; CHECK-NEXT:    ushll.4s v1, v1, #0
; CHECK-NEXT:    and.16b v3, v3, v0
; CHECK-NEXT:    ushll.4s v4, v4, #0
; CHECK-NEXT:    ushll.4s v2, v2, #0
; CHECK-NEXT:    and.16b v1, v1, v0
; CHECK-NEXT:    and.16b v4, v4, v0
; CHECK-NEXT:    and.16b v2, v2, v0
; CHECK-NEXT:    stp q1, q3, [x1]
; CHECK-NEXT:    stp q2, q4, [x1, #32]
; CHECK-NEXT:    add x1, x1, #64
; CHECK-NEXT:    b.ne LBB13_1
; CHECK-NEXT:  ; %bb.2: ; %exit
; CHECK-NEXT:    ret
;
; CHECK-BE-LABEL: zext_v16i4_to_v16i32_in_loop:
; CHECK-BE:       // %bb.0: // %entry
; CHECK-BE-NEXT:    movi v0.4s, #15
; CHECK-BE-NEXT:    mov x8, xzr
; CHECK-BE-NEXT:  .LBB13_1: // %loop
; CHECK-BE-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-BE-NEXT:    ldr x9, [x0, x8]
; CHECK-BE-NEXT:    add x8, x8, #16
; CHECK-BE-NEXT:    cmp x8, #128
; CHECK-BE-NEXT:    lsr x10, x9, #60
; CHECK-BE-NEXT:    ubfx x11, x9, #56, #4
; CHECK-BE-NEXT:    fmov s1, w10
; CHECK-BE-NEXT:    ubfx x10, x9, #52, #4
; CHECK-BE-NEXT:    mov v1.b[1], w11
; CHECK-BE-NEXT:    mov v1.b[2], w10
; CHECK-BE-NEXT:    ubfx x10, x9, #48, #4
; CHECK-BE-NEXT:    mov v1.b[3], w10
; CHECK-BE-NEXT:    ubfx x10, x9, #44, #4
; CHECK-BE-NEXT:    mov v1.b[4], w10
; CHECK-BE-NEXT:    ubfx x10, x9, #40, #4
; CHECK-BE-NEXT:    mov v1.b[5], w10
; CHECK-BE-NEXT:    ubfx x10, x9, #36, #4
; CHECK-BE-NEXT:    mov v1.b[6], w10
; CHECK-BE-NEXT:    ubfx x10, x9, #32, #4
; CHECK-BE-NEXT:    mov v1.b[7], w10
; CHECK-BE-NEXT:    ubfx x10, x9, #28, #4
; CHECK-BE-NEXT:    mov v1.b[8], w10
; CHECK-BE-NEXT:    ubfx w10, w9, #24, #4
; CHECK-BE-NEXT:    mov v1.b[9], w10
; CHECK-BE-NEXT:    ubfx w10, w9, #20, #4
; CHECK-BE-NEXT:    mov v1.b[10], w10
; CHECK-BE-NEXT:    ubfx w10, w9, #16, #4
; CHECK-BE-NEXT:    mov v1.b[11], w10
; CHECK-BE-NEXT:    ubfx w10, w9, #12, #4
; CHECK-BE-NEXT:    mov v1.b[12], w10
; CHECK-BE-NEXT:    ubfx w10, w9, #8, #4
; CHECK-BE-NEXT:    mov v1.b[13], w10
; CHECK-BE-NEXT:    ubfx w10, w9, #4, #4
; CHECK-BE-NEXT:    and w9, w9, #0xf
; CHECK-BE-NEXT:    mov v1.b[14], w10
; CHECK-BE-NEXT:    add x10, x1, #32
; CHECK-BE-NEXT:    mov v1.b[15], w9
; CHECK-BE-NEXT:    add x9, x1, #16
; CHECK-BE-NEXT:    ext v2.16b, v1.16b, v1.16b, #8
; CHECK-BE-NEXT:    zip2 v3.8b, v1.8b, v0.8b
; CHECK-BE-NEXT:    zip1 v1.8b, v1.8b, v0.8b
; CHECK-BE-NEXT:    zip2 v4.8b, v2.8b, v0.8b
; CHECK-BE-NEXT:    zip1 v2.8b, v2.8b, v0.8b
; CHECK-BE-NEXT:    ushll v3.4s, v3.4h, #0
; CHECK-BE-NEXT:    ushll v1.4s, v1.4h, #0
; CHECK-BE-NEXT:    and v3.16b, v3.16b, v0.16b
; CHECK-BE-NEXT:    ushll v4.4s, v4.4h, #0
; CHECK-BE-NEXT:    ushll v2.4s, v2.4h, #0
; CHECK-BE-NEXT:    and v1.16b, v1.16b, v0.16b
; CHECK-BE-NEXT:    st1 { v3.4s }, [x9]
; CHECK-BE-NEXT:    add x9, x1, #48
; CHECK-BE-NEXT:    and v4.16b, v4.16b, v0.16b
; CHECK-BE-NEXT:    and v2.16b, v2.16b, v0.16b
; CHECK-BE-NEXT:    st1 { v1.4s }, [x1]
; CHECK-BE-NEXT:    add x1, x1, #64
; CHECK-BE-NEXT:    st1 { v4.4s }, [x9]
; CHECK-BE-NEXT:    st1 { v2.4s }, [x10]
; CHECK-BE-NEXT:    b.ne .LBB13_1
; CHECK-BE-NEXT:  // %bb.2: // %exit
; CHECK-BE-NEXT:    ret


entry:
  br label %loop

loop:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
  ; i4-typed GEP: the expected assembly above advances the source offset by 16
  ; bytes per iteration and reads 8 bytes (16 nibbles) from it.
  %src.gep = getelementptr i4, ptr %src, i64 %iv
  %load = load <16 x i4>, ptr %src.gep
  %ext = zext <16 x i4> %load to <16 x i32>
  ; i32-typed GEP: the destination byte offset is %iv * 4, so the store address
  ; moves 64 bytes per iteration, exactly the size of one <16 x i32> result.
  %dst.gep = getelementptr i32, ptr %dst, i64 %iv
  store <16 x i32> %ext, ptr %dst.gep
  %iv.next = add nuw i64 %iv, 16
  ; Exit after the eighth iteration (%iv.next == 128).
  %ec = icmp eq i64 %iv.next, 128
  br i1 %ec, label %exit, label %loop

exit:
  ret void
}

; Loop test: every iteration loads <16 x i16> (32 bytes) from %src + %iv * 2,
; zero-extends it to <16 x i64> and stores the 128-byte result at
; %dst + %iv * 8. %iv steps by 16 and the loop exits once it reaches 128, i.e.
; after 8 iterations.
; Expected lowering on both targets: two levels of ushll/ushll2 widening
; (i16 -> i32 -> i64). arm64-apple-ios pairs the memory accesses with ldp/stp;
; aarch64_be uses individual ld1/st1 with explicitly computed addresses.
define void @zext_v16i16_to_v16i64_in_loop(ptr %src, ptr %dst) {
; CHECK-LABEL: zext_v16i16_to_v16i64_in_loop:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    mov x8, xzr
; CHECK-NEXT:  LBB14_1: ; %loop
; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    add x9, x0, x8
; CHECK-NEXT:    add x8, x8, #32
; CHECK-NEXT:    ldp q1, q0, [x9]
; CHECK-NEXT:    cmp x8, #256
; CHECK-NEXT:    ushll2.4s v2, v0, #0
; CHECK-NEXT:    ushll2.4s v3, v1, #0
; CHECK-NEXT:    ushll.4s v0, v0, #0
; CHECK-NEXT:    ushll.4s v1, v1, #0
; CHECK-NEXT:    ushll2.2d v4, v2, #0
; CHECK-NEXT:    ushll.2d v2, v2, #0
; CHECK-NEXT:    ushll2.2d v5, v3, #0
; CHECK-NEXT:    ushll.2d v3, v3, #0
; CHECK-NEXT:    stp q2, q4, [x1, #96]
; CHECK-NEXT:    ushll2.2d v4, v0, #0
; CHECK-NEXT:    ushll.2d v0, v0, #0
; CHECK-NEXT:    ushll2.2d v2, v1, #0
; CHECK-NEXT:    ushll.2d v1, v1, #0
; CHECK-NEXT:    stp q3, q5, [x1, #32]
; CHECK-NEXT:    stp q0, q4, [x1, #64]
; CHECK-NEXT:    stp q1, q2, [x1], #128
; CHECK-NEXT:    b.ne LBB14_1
; CHECK-NEXT:  ; %bb.2: ; %exit
; CHECK-NEXT:    ret
;
; CHECK-BE-LABEL: zext_v16i16_to_v16i64_in_loop:
; CHECK-BE:       // %bb.0: // %entry
; CHECK-BE-NEXT:    mov x8, xzr
; CHECK-BE-NEXT:  .LBB14_1: // %loop
; CHECK-BE-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-BE-NEXT:    add x9, x0, x8
; CHECK-BE-NEXT:    add x8, x8, #32
; CHECK-BE-NEXT:    ld1 { v0.8h }, [x9]
; CHECK-BE-NEXT:    add x9, x9, #16
; CHECK-BE-NEXT:    cmp x8, #256
; CHECK-BE-NEXT:    ld1 { v1.8h }, [x9]
; CHECK-BE-NEXT:    add x9, x1, #48
; CHECK-BE-NEXT:    ushll2 v2.4s, v0.8h, #0
; CHECK-BE-NEXT:    ushll v0.4s, v0.4h, #0
; CHECK-BE-NEXT:    ushll2 v3.4s, v1.8h, #0
; CHECK-BE-NEXT:    ushll v1.4s, v1.4h, #0
; CHECK-BE-NEXT:    ushll2 v4.2d, v2.4s, #0
; CHECK-BE-NEXT:    ushll v2.2d, v2.2s, #0
; CHECK-BE-NEXT:    ushll2 v5.2d, v0.4s, #0
; CHECK-BE-NEXT:    ushll v0.2d, v0.2s, #0
; CHECK-BE-NEXT:    st1 { v4.2d }, [x9]
; CHECK-BE-NEXT:    add x9, x1, #32
; CHECK-BE-NEXT:    ushll2 v4.2d, v3.4s, #0
; CHECK-BE-NEXT:    st1 { v2.2d }, [x9]
; CHECK-BE-NEXT:    add x9, x1, #16
; CHECK-BE-NEXT:    ushll v2.2d, v3.2s, #0
; CHECK-BE-NEXT:    st1 { v5.2d }, [x9]
; CHECK-BE-NEXT:    add x9, x1, #112
; CHECK-BE-NEXT:    ushll2 v3.2d, v1.4s, #0
; CHECK-BE-NEXT:    st1 { v4.2d }, [x9]
; CHECK-BE-NEXT:    add x9, x1, #96
; CHECK-BE-NEXT:    ushll v1.2d, v1.2s, #0
; CHECK-BE-NEXT:    st1 { v2.2d }, [x9]
; CHECK-BE-NEXT:    add x9, x1, #80
; CHECK-BE-NEXT:    st1 { v3.2d }, [x9]
; CHECK-BE-NEXT:    add x9, x1, #64
; CHECK-BE-NEXT:    st1 { v0.2d }, [x1]
; CHECK-BE-NEXT:    add x1, x1, #128
; CHECK-BE-NEXT:    st1 { v1.2d }, [x9]
; CHECK-BE-NEXT:    b.ne .LBB14_1
; CHECK-BE-NEXT:  // %bb.2: // %exit
; CHECK-BE-NEXT:    ret


entry:
  br label %loop

loop:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
  ; i16-typed GEP: the source byte offset is %iv * 2, which is why the expected
  ; assembly above counts the offset in steps of 32 up to 256.
  %src.gep = getelementptr i16, ptr %src, i64 %iv
  %load = load <16 x i16>, ptr %src.gep
  %ext = zext <16 x i16> %load to <16 x i64>
  ; i64-typed GEP: the destination byte offset is %iv * 8, so the store address
  ; moves 128 bytes per iteration, exactly the size of one <16 x i64> result.
  %dst.gep = getelementptr i64, ptr %dst, i64 %iv
  store <16 x i64> %ext, ptr %dst.gep
  %iv.next = add nuw i64 %iv, 16
  ; Exit after the eighth iteration (%iv.next == 128).
  %ec = icmp eq i64 %iv.next, 128
  br i1 %ec, label %exit, label %loop

exit:
  ret void
}

; Loop test: every iteration loads <16 x i32> (64 bytes) from %src + %iv * 4,
; zero-extends it to <16 x i64> and stores the 128-byte result at
; %dst + %iv * 8. %iv steps by 16 and the loop exits once it reaches 128, i.e.
; after 8 iterations.
; Expected lowering on both targets: a single widening level, one
; ushll/ushll2 pair per 128-bit source register. arm64-apple-ios pairs the
; memory accesses with ldp/stp; aarch64_be uses individual ld1/st1 with
; explicitly computed addresses.
define void @zext_v16i32_to_v16i64_in_loop(ptr %src, ptr %dst) {
; CHECK-LABEL: zext_v16i32_to_v16i64_in_loop:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    mov x8, xzr
; CHECK-NEXT:  LBB15_1: ; %loop
; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    add x9, x0, x8
; CHECK-NEXT:    add x8, x8, #64
; CHECK-NEXT:    ldp q1, q0, [x9, #32]
; CHECK-NEXT:    cmp x8, #512
; CHECK-NEXT:    ldp q5, q4, [x9]
; CHECK-NEXT:    ushll2.2d v2, v0, #0
; CHECK-NEXT:    ushll.2d v0, v0, #0
; CHECK-NEXT:    ushll2.2d v3, v1, #0
; CHECK-NEXT:    ushll.2d v1, v1, #0
; CHECK-NEXT:    stp q0, q2, [x1, #96]
; CHECK-NEXT:    ushll2.2d v2, v4, #0
; CHECK-NEXT:    ushll.2d v0, v4, #0
; CHECK-NEXT:    stp q1, q3, [x1, #64]
; CHECK-NEXT:    ushll2.2d v3, v5, #0
; CHECK-NEXT:    ushll.2d v1, v5, #0
; CHECK-NEXT:    stp q0, q2, [x1, #32]
; CHECK-NEXT:    stp q1, q3, [x1], #128
; CHECK-NEXT:    b.ne LBB15_1
; CHECK-NEXT:  ; %bb.2: ; %exit
; CHECK-NEXT:    ret
;
; CHECK-BE-LABEL: zext_v16i32_to_v16i64_in_loop:
; CHECK-BE:       // %bb.0: // %entry
; CHECK-BE-NEXT:    mov x8, xzr
; CHECK-BE-NEXT:  .LBB15_1: // %loop
; CHECK-BE-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-BE-NEXT:    add x9, x0, x8
; CHECK-BE-NEXT:    add x8, x8, #64
; CHECK-BE-NEXT:    ld1 { v0.4s }, [x9]
; CHECK-BE-NEXT:    add x10, x9, #48
; CHECK-BE-NEXT:    cmp x8, #512
; CHECK-BE-NEXT:    ld1 { v1.4s }, [x10]
; CHECK-BE-NEXT:    add x10, x9, #32
; CHECK-BE-NEXT:    add x9, x9, #16
; CHECK-BE-NEXT:    ld1 { v4.4s }, [x9]
; CHECK-BE-NEXT:    ld1 { v2.4s }, [x10]
; CHECK-BE-NEXT:    add x9, x1, #16
; CHECK-BE-NEXT:    ushll2 v3.2d, v0.4s, #0
; CHECK-BE-NEXT:    ushll v0.2d, v0.2s, #0
; CHECK-BE-NEXT:    add x10, x1, #80
; CHECK-BE-NEXT:    ushll2 v5.2d, v1.4s, #0
; CHECK-BE-NEXT:    ushll2 v6.2d, v2.4s, #0
; CHECK-BE-NEXT:    st1 { v3.2d }, [x9]
; CHECK-BE-NEXT:    ushll2 v3.2d, v4.4s, #0
; CHECK-BE-NEXT:    add x9, x1, #112
; CHECK-BE-NEXT:    st1 { v0.2d }, [x1]
; CHECK-BE-NEXT:    ushll v0.2d, v1.2s, #0
; CHECK-BE-NEXT:    ushll v1.2d, v2.2s, #0
; CHECK-BE-NEXT:    st1 { v5.2d }, [x9]
; CHECK-BE-NEXT:    add x9, x1, #48
; CHECK-BE-NEXT:    ushll v2.2d, v4.2s, #0
; CHECK-BE-NEXT:    st1 { v3.2d }, [x9]
; CHECK-BE-NEXT:    add x9, x1, #64
; CHECK-BE-NEXT:    st1 { v6.2d }, [x10]
; CHECK-BE-NEXT:    add x10, x1, #96
; CHECK-BE-NEXT:    st1 { v1.2d }, [x9]
; CHECK-BE-NEXT:    add x9, x1, #32
; CHECK-BE-NEXT:    add x1, x1, #128
; CHECK-BE-NEXT:    st1 { v0.2d }, [x10]
; CHECK-BE-NEXT:    st1 { v2.2d }, [x9]
; CHECK-BE-NEXT:    b.ne .LBB15_1
; CHECK-BE-NEXT:  // %bb.2: // %exit
; CHECK-BE-NEXT:    ret


entry:
  br label %loop

loop:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
  ; i32-typed GEP: the source byte offset is %iv * 4, which is why the expected
  ; assembly above counts the offset in steps of 64 up to 512.
  %src.gep = getelementptr i32, ptr %src, i64 %iv
  %load = load <16 x i32>, ptr %src.gep
  %ext = zext <16 x i32> %load to <16 x i64>
  ; i64-typed GEP: the destination byte offset is %iv * 8, so the store address
  ; moves 128 bytes per iteration, exactly the size of one <16 x i64> result.
  %dst.gep = getelementptr i64, ptr %dst, i64 %iv
  store <16 x i64> %ext, ptr %dst.gep
  %iv.next = add nuw i64 %iv, 16
  ; Exit after the eighth iteration (%iv.next == 128).
  %ec = icmp eq i64 %iv.next, 128
  br i1 %ec, label %exit, label %loop

exit:
  ret void
}

; Loop test with a destination element wider than any vector lane: every
; iteration loads <8 x i8> from %src + %iv, zero-extends it to <8 x i128> and
; stores the 128-byte result at %dst + %iv * 16. %iv steps by 16 and the loop
; exits once it reaches 128, i.e. after 8 iterations.
; Expected lowering on both targets: the bytes are widened to 64-bit lanes with
; an ushll/ushll2 chain, each 64-bit lane is stored as one half of its 128-bit
; element (str d / st1 lane 1), and xzr is stored to the other half.
;  * arm64-apple-ios: data at offsets 0, 16, ..., 112; zeros at 8, 24, ..., 120.
;  * aarch64_be: zeros at offsets 0, 16, ..., 112; data at 8, 24, ..., 120.
define void @zext_v8i8_to_v8i128_in_loop(ptr %src, ptr %dst) {
; CHECK-LABEL: zext_v8i8_to_v8i128_in_loop:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    mov x8, xzr
; CHECK-NEXT:  LBB16_1: ; %loop
; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    ldr d0, [x0, x8]
; CHECK-NEXT:    add x9, x1, #112
; CHECK-NEXT:    add x8, x8, #16
; CHECK-NEXT:    str xzr, [x1, #120]
; CHECK-NEXT:    cmp x8, #128
; CHECK-NEXT:    ushll.8h v0, v0, #0
; CHECK-NEXT:    str xzr, [x1, #104]
; CHECK-NEXT:    str xzr, [x1, #88]
; CHECK-NEXT:    str xzr, [x1, #72]
; CHECK-NEXT:    ushll2.4s v1, v0, #0
; CHECK-NEXT:    ushll.4s v0, v0, #0
; CHECK-NEXT:    str xzr, [x1, #56]
; CHECK-NEXT:    str xzr, [x1, #40]
; CHECK-NEXT:    str xzr, [x1, #24]
; CHECK-NEXT:    ushll2.2d v2, v1, #0
; CHECK-NEXT:    ushll.2d v1, v1, #0
; CHECK-NEXT:    ushll2.2d v3, v0, #0
; CHECK-NEXT:    ushll.2d v0, v0, #0
; CHECK-NEXT:    str xzr, [x1, #8]
; CHECK-NEXT:    st1.d { v2 }[1], [x9]
; CHECK-NEXT:    add x9, x1, #80
; CHECK-NEXT:    st1.d { v1 }[1], [x9]
; CHECK-NEXT:    add x9, x1, #48
; CHECK-NEXT:    str d2, [x1, #96]
; CHECK-NEXT:    st1.d { v3 }[1], [x9]
; CHECK-NEXT:    add x9, x1, #16
; CHECK-NEXT:    str d1, [x1, #64]
; CHECK-NEXT:    str d3, [x1, #32]
; CHECK-NEXT:    str d0, [x1]
; CHECK-NEXT:    add x1, x1, #256
; CHECK-NEXT:    st1.d { v0 }[1], [x9]
; CHECK-NEXT:    b.ne LBB16_1
; CHECK-NEXT:  ; %bb.2: ; %exit
; CHECK-NEXT:    ret
;
; CHECK-BE-LABEL: zext_v8i8_to_v8i128_in_loop:
; CHECK-BE:       // %bb.0: // %entry
; CHECK-BE-NEXT:    mov x8, xzr
; CHECK-BE-NEXT:  .LBB16_1: // %loop
; CHECK-BE-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-BE-NEXT:    add x9, x0, x8
; CHECK-BE-NEXT:    add x8, x8, #16
; CHECK-BE-NEXT:    ld1 { v0.8b }, [x9]
; CHECK-BE-NEXT:    add x9, x1, #120
; CHECK-BE-NEXT:    str xzr, [x1, #112]
; CHECK-BE-NEXT:    str xzr, [x1, #96]
; CHECK-BE-NEXT:    cmp x8, #128
; CHECK-BE-NEXT:    str xzr, [x1, #80]
; CHECK-BE-NEXT:    ushll v0.8h, v0.8b, #0
; CHECK-BE-NEXT:    str xzr, [x1, #64]
; CHECK-BE-NEXT:    str xzr, [x1, #48]
; CHECK-BE-NEXT:    str xzr, [x1, #32]
; CHECK-BE-NEXT:    ushll2 v1.4s, v0.8h, #0
; CHECK-BE-NEXT:    ushll v0.4s, v0.4h, #0
; CHECK-BE-NEXT:    str xzr, [x1, #16]
; CHECK-BE-NEXT:    str xzr, [x1]
; CHECK-BE-NEXT:    ushll2 v2.2d, v1.4s, #0
; CHECK-BE-NEXT:    ushll v1.2d, v1.2s, #0
; CHECK-BE-NEXT:    ushll2 v3.2d, v0.4s, #0
; CHECK-BE-NEXT:    ushll v0.2d, v0.2s, #0
; CHECK-BE-NEXT:    st1 { v2.d }[1], [x9]
; CHECK-BE-NEXT:    add x9, x1, #88
; CHECK-BE-NEXT:    st1 { v1.d }[1], [x9]
; CHECK-BE-NEXT:    add x9, x1, #56
; CHECK-BE-NEXT:    str d2, [x1, #104]
; CHECK-BE-NEXT:    st1 { v3.d }[1], [x9]
; CHECK-BE-NEXT:    add x9, x1, #24
; CHECK-BE-NEXT:    str d1, [x1, #72]
; CHECK-BE-NEXT:    str d3, [x1, #40]
; CHECK-BE-NEXT:    str d0, [x1, #8]
; CHECK-BE-NEXT:    add x1, x1, #256
; CHECK-BE-NEXT:    st1 { v0.d }[1], [x9]
; CHECK-BE-NEXT:    b.ne .LBB16_1
; CHECK-BE-NEXT:  // %bb.2: // %exit
; CHECK-BE-NEXT:    ret


entry:
  br label %loop

loop:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
  ; i8-typed GEP: %iv is a plain byte offset. Only 8 bytes are loaded although
  ; %iv advances by 16 each iteration.
  %src.gep = getelementptr i8, ptr %src, i64 %iv
  %load = load <8 x i8>, ptr %src.gep
  %ext = zext <8 x i8> %load to <8 x i128>
  ; i128-typed GEP: the destination byte offset is %iv * 16, so the store
  ; address moves 256 bytes per iteration while only 128 bytes are written.
  %dst.gep = getelementptr i128, ptr %dst, i64 %iv
  store <8 x i128> %ext, ptr %dst.gep
  %iv.next = add nuw i64 %iv, 16
  ; Exit after the eighth iteration (%iv.next == 128).
  %ec = icmp eq i64 %iv.next, 128
  br i1 %ec, label %exit, label %loop

exit:
  ret void
}

; Multiple back-to-back 'zext's of vectors of the same type, combined with
; arithmetic operations.
;
; Each loop iteration loads two adjacent <8 x i8> vectors from %src,
; zero-extends both to <8 x i64>, and adds them into two adjacent <8 x i64>
; vectors in %dst. The autogenerated assertions below expect the little-endian
; target to load two shuffle masks (lCPI17_0 / lCPI17_1) once in the entry
; block and to lower the loop body to tbl followed by uaddw/uaddw2. The
; big-endian target is expected to use tbl as well, with extra rev32/ext
; instructions feeding uaddw.
define void @zext_v8i8_to_v8i64_with_add_in_sequence_in_loop(ptr %src, ptr %dst) {
; CHECK-LABEL: zext_v8i8_to_v8i64_with_add_in_sequence_in_loop:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:  Lloh18:
; CHECK-NEXT:    adrp x8, lCPI17_0@PAGE
; CHECK-NEXT:  Lloh19:
; CHECK-NEXT:    adrp x9, lCPI17_1@PAGE
; CHECK-NEXT:    mov w10, #128 ; =0x80
; CHECK-NEXT:  Lloh20:
; CHECK-NEXT:    ldr q0, [x8, lCPI17_0@PAGEOFF]
; CHECK-NEXT:  Lloh21:
; CHECK-NEXT:    ldr q1, [x9, lCPI17_1@PAGEOFF]
; CHECK-NEXT:    add x8, x1, #64
; CHECK-NEXT:    add x9, x0, #8
; CHECK-NEXT:  LBB17_1: ; %loop
; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    ldp d2, d3, [x9, #-8]
; CHECK-NEXT:    subs x10, x10, #16
; CHECK-NEXT:    ldp q6, q5, [x8, #-32]
; CHECK-NEXT:    add x9, x9, #16
; CHECK-NEXT:    ldp q17, q16, [x8, #-64]
; CHECK-NEXT:    tbl.16b v4, { v2 }, v1
; CHECK-NEXT:    tbl.16b v2, { v2 }, v0
; CHECK-NEXT:    tbl.16b v7, { v3 }, v1
; CHECK-NEXT:    tbl.16b v3, { v3 }, v0
; CHECK-NEXT:    uaddw2.2d v5, v5, v4
; CHECK-NEXT:    uaddw.2d v4, v6, v4
; CHECK-NEXT:    uaddw2.2d v6, v16, v2
; CHECK-NEXT:    ldp q18, q16, [x8, #32]
; CHECK-NEXT:    uaddw.2d v2, v17, v2
; CHECK-NEXT:    stp q4, q5, [x8, #-32]
; CHECK-NEXT:    uaddw2.2d v5, v16, v7
; CHECK-NEXT:    ldp q16, q4, [x8]
; CHECK-NEXT:    uaddw.2d v7, v18, v7
; CHECK-NEXT:    stp q2, q6, [x8, #-64]
; CHECK-NEXT:    uaddw2.2d v4, v4, v3
; CHECK-NEXT:    uaddw.2d v2, v16, v3
; CHECK-NEXT:    stp q7, q5, [x8, #32]
; CHECK-NEXT:    stp q2, q4, [x8], #128
; CHECK-NEXT:    b.ne LBB17_1
; CHECK-NEXT:  ; %bb.2: ; %exit
; CHECK-NEXT:    ret
; CHECK-NEXT:    .loh AdrpLdr Lloh19, Lloh21
; CHECK-NEXT:    .loh AdrpLdr Lloh18, Lloh20
;
; CHECK-BE-LABEL: zext_v8i8_to_v8i64_with_add_in_sequence_in_loop:
; CHECK-BE:       // %bb.0: // %entry
; CHECK-BE-NEXT:    adrp x9, .LCPI17_0
; CHECK-BE-NEXT:    add x9, x9, :lo12:.LCPI17_0
; CHECK-BE-NEXT:    mov w8, #128 // =0x80
; CHECK-BE-NEXT:    ld1 { v0.16b }, [x9]
; CHECK-BE-NEXT:    adrp x9, .LCPI17_1
; CHECK-BE-NEXT:    add x9, x9, :lo12:.LCPI17_1
; CHECK-BE-NEXT:    ld1 { v1.16b }, [x9]
; CHECK-BE-NEXT:    add x9, x1, #64
; CHECK-BE-NEXT:    add x10, x0, #8
; CHECK-BE-NEXT:  .LBB17_1: // %loop
; CHECK-BE-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-BE-NEXT:    ld1 { v2.8b }, [x10]
; CHECK-BE-NEXT:    sub x11, x10, #8
; CHECK-BE-NEXT:    add x15, x9, #32
; CHECK-BE-NEXT:    ld1 { v3.8b }, [x11]
; CHECK-BE-NEXT:    ld1 { v16.2d }, [x15]
; CHECK-BE-NEXT:    sub x11, x9, #64
; CHECK-BE-NEXT:    sub x12, x9, #32
; CHECK-BE-NEXT:    ld1 { v6.2d }, [x9]
; CHECK-BE-NEXT:    ld1 { v21.2d }, [x11]
; CHECK-BE-NEXT:    tbl v4.16b, { v2.16b }, v1.16b
; CHECK-BE-NEXT:    tbl v2.16b, { v2.16b }, v0.16b
; CHECK-BE-NEXT:    ld1 { v19.2d }, [x12]
; CHECK-BE-NEXT:    tbl v5.16b, { v3.16b }, v1.16b
; CHECK-BE-NEXT:    tbl v3.16b, { v3.16b }, v0.16b
; CHECK-BE-NEXT:    sub x13, x9, #16
; CHECK-BE-NEXT:    sub x14, x9, #48
; CHECK-BE-NEXT:    add x16, x9, #48
; CHECK-BE-NEXT:    add x17, x9, #16
; CHECK-BE-NEXT:    ld1 { v22.2d }, [x13]
; CHECK-BE-NEXT:    subs x8, x8, #16
; CHECK-BE-NEXT:    add x10, x10, #16
; CHECK-BE-NEXT:    rev32 v7.8b, v4.8b
; CHECK-BE-NEXT:    ext v4.16b, v4.16b, v4.16b, #8
; CHECK-BE-NEXT:    rev32 v17.8b, v2.8b
; CHECK-BE-NEXT:    ext v18.16b, v5.16b, v5.16b, #8
; CHECK-BE-NEXT:    ext v20.16b, v3.16b, v3.16b, #8
; CHECK-BE-NEXT:    ext v2.16b, v2.16b, v2.16b, #8
; CHECK-BE-NEXT:    rev32 v5.8b, v5.8b
; CHECK-BE-NEXT:    rev32 v3.8b, v3.8b
; CHECK-BE-NEXT:    uaddw v7.2d, v16.2d, v7.2s
; CHECK-BE-NEXT:    rev32 v4.8b, v4.8b
; CHECK-BE-NEXT:    uaddw v6.2d, v6.2d, v17.2s
; CHECK-BE-NEXT:    rev32 v17.8b, v18.8b
; CHECK-BE-NEXT:    rev32 v20.8b, v20.8b
; CHECK-BE-NEXT:    rev32 v2.8b, v2.8b
; CHECK-BE-NEXT:    ld1 { v16.2d }, [x16]
; CHECK-BE-NEXT:    ld1 { v18.2d }, [x14]
; CHECK-BE-NEXT:    uaddw v5.2d, v19.2d, v5.2s
; CHECK-BE-NEXT:    uaddw v3.2d, v21.2d, v3.2s
; CHECK-BE-NEXT:    st1 { v7.2d }, [x15]
; CHECK-BE-NEXT:    ld1 { v7.2d }, [x17]
; CHECK-BE-NEXT:    st1 { v6.2d }, [x9]
; CHECK-BE-NEXT:    add x9, x9, #128
; CHECK-BE-NEXT:    uaddw v4.2d, v16.2d, v4.2s
; CHECK-BE-NEXT:    st1 { v5.2d }, [x12]
; CHECK-BE-NEXT:    uaddw v5.2d, v22.2d, v17.2s
; CHECK-BE-NEXT:    st1 { v3.2d }, [x11]
; CHECK-BE-NEXT:    uaddw v3.2d, v18.2d, v20.2s
; CHECK-BE-NEXT:    uaddw v2.2d, v7.2d, v2.2s
; CHECK-BE-NEXT:    st1 { v4.2d }, [x16]
; CHECK-BE-NEXT:    st1 { v5.2d }, [x13]
; CHECK-BE-NEXT:    st1 { v3.2d }, [x14]
; CHECK-BE-NEXT:    st1 { v2.2d }, [x17]
; CHECK-BE-NEXT:    b.ne .LBB17_1
; CHECK-BE-NEXT:  // %bb.2: // %exit
; CHECK-BE-NEXT:    ret


entry:
  br label %loop

loop:
  ; %iv is used both as the byte offset into %src and as the i64 element index
  ; into %dst.
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
  %src.gep = getelementptr i8, ptr %src, i64 %iv
  %load = load <8 x i8>, ptr %src.gep
  ; Second source vector: the 8 bytes immediately after %load.
  %src.gep.2 = getelementptr i8, ptr %src.gep, i64 8
  %load.2 = load <8 x i8>, ptr %src.gep.2
  ; The two zero-extends under test.
  %ext = zext <8 x i8> %load to <8 x i64>
  %ext.2 = zext <8 x i8> %load.2 to <8 x i64>
  %dst.gep = getelementptr i64, ptr %dst, i64 %iv
  %load.dst = load <8 x i64>, ptr %dst.gep
  ; Second destination vector: the 8 i64 elements immediately after %load.dst.
  %dst.gep.2 = getelementptr i64, ptr %dst.gep, i64 8
  %load.dst.2 = load <8 x i64>, ptr %dst.gep.2
  ; Each zero-extended value has a single use as an add operand.
  %sum = add <8 x i64> %load.dst, %ext
  %sum.2 = add <8 x i64> %load.dst.2, %ext.2
  store <8 x i64> %sum, ptr %dst.gep
  store <8 x i64> %sum.2, ptr %dst.gep.2
  ; %iv advances by 16 and the loop exits once %iv.next reaches 128, i.e. after
  ; 8 iterations.
  %iv.next = add nuw i64 %iv, 16
  %ec = icmp eq i64 %iv.next, 128
  br i1 %ec, label %exit, label %loop

exit:
  ret void
}

; Multiple back-to-back 'zext's of vectors of the same type.
;
; Each loop iteration loads two adjacent <16 x i8> vectors from %src,
; zero-extends both to <16 x i64>, and stores the results to two adjacent
; <16 x i64> slots in %dst. The autogenerated assertions below expect both the
; little-endian and the big-endian target to widen in three ushll/ushll2
; stages (bytes -> .8h -> .4s -> .2d); no tbl instructions and no constant-pool
; masks appear in the expected output for this function.
define void @zext_v16i8_to_v16i64_in_sequence_in_loop(ptr %src, ptr %dst) {
; CHECK-LABEL: zext_v16i8_to_v16i64_in_sequence_in_loop:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    mov w8, #128 ; =0x80
; CHECK-NEXT:    add x9, x1, #128
; CHECK-NEXT:    add x10, x0, #16
; CHECK-NEXT:  LBB18_1: ; %loop
; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    ldp q0, q1, [x10, #-16]
; CHECK-NEXT:    subs x8, x8, #16
; CHECK-NEXT:    add x10, x10, #16
; CHECK-NEXT:    ushll2.8h v2, v0, #0
; CHECK-NEXT:    ushll.8h v0, v0, #0
; CHECK-NEXT:    ushll2.8h v6, v1, #0
; CHECK-NEXT:    ushll.8h v1, v1, #0
; CHECK-NEXT:    ushll2.4s v3, v2, #0
; CHECK-NEXT:    ushll.4s v2, v2, #0
; CHECK-NEXT:    ushll2.4s v5, v0, #0
; CHECK-NEXT:    ushll.4s v0, v0, #0
; CHECK-NEXT:    ushll2.2d v4, v3, #0
; CHECK-NEXT:    ushll.2d v3, v3, #0
; CHECK-NEXT:    ushll2.2d v7, v2, #0
; CHECK-NEXT:    ushll.2d v2, v2, #0
; CHECK-NEXT:    stp q3, q4, [x9, #-32]
; CHECK-NEXT:    ushll2.2d v4, v5, #0
; CHECK-NEXT:    ushll2.4s v3, v6, #0
; CHECK-NEXT:    ushll.2d v5, v5, #0
; CHECK-NEXT:    stp q2, q7, [x9, #-64]
; CHECK-NEXT:    ushll2.2d v7, v0, #0
; CHECK-NEXT:    ushll.2d v0, v0, #0
; CHECK-NEXT:    ushll.4s v2, v6, #0
; CHECK-NEXT:    stp q5, q4, [x9, #-96]
; CHECK-NEXT:    ushll2.2d v4, v3, #0
; CHECK-NEXT:    ushll2.4s v5, v1, #0
; CHECK-NEXT:    ushll.2d v3, v3, #0
; CHECK-NEXT:    stp q0, q7, [x9, #-128]
; CHECK-NEXT:    ushll.4s v0, v1, #0
; CHECK-NEXT:    ushll2.2d v6, v2, #0
; CHECK-NEXT:    ushll.2d v1, v2, #0
; CHECK-NEXT:    ushll2.2d v2, v5, #0
; CHECK-NEXT:    stp q3, q4, [x9, #96]
; CHECK-NEXT:    ushll.2d v3, v5, #0
; CHECK-NEXT:    ushll2.2d v4, v0, #0
; CHECK-NEXT:    ushll.2d v0, v0, #0
; CHECK-NEXT:    stp q1, q6, [x9, #64]
; CHECK-NEXT:    stp q3, q2, [x9, #32]
; CHECK-NEXT:    stp q0, q4, [x9], #128
; CHECK-NEXT:    b.ne LBB18_1
; CHECK-NEXT:  ; %bb.2: ; %exit
; CHECK-NEXT:    ret
;
; CHECK-BE-LABEL: zext_v16i8_to_v16i64_in_sequence_in_loop:
; CHECK-BE:       // %bb.0: // %entry
; CHECK-BE-NEXT:    mov w8, #128 // =0x80
; CHECK-BE-NEXT:    add x9, x1, #128
; CHECK-BE-NEXT:    add x10, x0, #16
; CHECK-BE-NEXT:  .LBB18_1: // %loop
; CHECK-BE-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-BE-NEXT:    sub x11, x10, #16
; CHECK-BE-NEXT:    ld1 { v5.16b }, [x10]
; CHECK-BE-NEXT:    sub x12, x9, #32
; CHECK-BE-NEXT:    ld1 { v0.16b }, [x11]
; CHECK-BE-NEXT:    sub x11, x9, #16
; CHECK-BE-NEXT:    subs x8, x8, #16
; CHECK-BE-NEXT:    add x10, x10, #16
; CHECK-BE-NEXT:    ushll2 v1.8h, v0.16b, #0
; CHECK-BE-NEXT:    ushll v0.8h, v0.8b, #0
; CHECK-BE-NEXT:    ushll2 v2.4s, v1.8h, #0
; CHECK-BE-NEXT:    ushll2 v3.4s, v0.8h, #0
; CHECK-BE-NEXT:    ushll v1.4s, v1.4h, #0
; CHECK-BE-NEXT:    ushll v0.4s, v0.4h, #0
; CHECK-BE-NEXT:    ushll2 v4.2d, v2.4s, #0
; CHECK-BE-NEXT:    ushll v2.2d, v2.2s, #0
; CHECK-BE-NEXT:    ushll2 v6.2d, v1.4s, #0
; CHECK-BE-NEXT:    ushll v1.2d, v1.2s, #0
; CHECK-BE-NEXT:    st1 { v4.2d }, [x11]
; CHECK-BE-NEXT:    ushll2 v4.2d, v3.4s, #0
; CHECK-BE-NEXT:    ushll v3.2d, v3.2s, #0
; CHECK-BE-NEXT:    st1 { v2.2d }, [x12]
; CHECK-BE-NEXT:    ushll2 v2.8h, v5.16b, #0
; CHECK-BE-NEXT:    sub x12, x9, #80
; CHECK-BE-NEXT:    sub x11, x9, #48
; CHECK-BE-NEXT:    st1 { v4.2d }, [x12]
; CHECK-BE-NEXT:    ushll v4.8h, v5.8b, #0
; CHECK-BE-NEXT:    sub x12, x9, #64
; CHECK-BE-NEXT:    ushll2 v5.4s, v2.8h, #0
; CHECK-BE-NEXT:    st1 { v1.2d }, [x12]
; CHECK-BE-NEXT:    sub x12, x9, #96
; CHECK-BE-NEXT:    ushll2 v1.2d, v0.4s, #0
; CHECK-BE-NEXT:    ushll v2.4s, v2.4h, #0
; CHECK-BE-NEXT:    ushll v0.2d, v0.2s, #0
; CHECK-BE-NEXT:    st1 { v6.2d }, [x11]
; CHECK-BE-NEXT:    sub x11, x9, #128
; CHECK-BE-NEXT:    st1 { v3.2d }, [x12]
; CHECK-BE-NEXT:    ushll2 v3.4s, v4.8h, #0
; CHECK-BE-NEXT:    ushll2 v6.2d, v5.4s, #0
; CHECK-BE-NEXT:    sub x12, x9, #112
; CHECK-BE-NEXT:    ushll v5.2d, v5.2s, #0
; CHECK-BE-NEXT:    st1 { v0.2d }, [x11]
; CHECK-BE-NEXT:    st1 { v1.2d }, [x12]
; CHECK-BE-NEXT:    ushll2 v1.2d, v2.4s, #0
; CHECK-BE-NEXT:    add x11, x9, #112
; CHECK-BE-NEXT:    ushll v4.4s, v4.4h, #0
; CHECK-BE-NEXT:    ushll2 v0.2d, v3.4s, #0
; CHECK-BE-NEXT:    st1 { v6.2d }, [x11]
; CHECK-BE-NEXT:    add x11, x9, #96
; CHECK-BE-NEXT:    ushll v2.2d, v2.2s, #0
; CHECK-BE-NEXT:    ushll v3.2d, v3.2s, #0
; CHECK-BE-NEXT:    st1 { v5.2d }, [x11]
; CHECK-BE-NEXT:    add x11, x9, #80
; CHECK-BE-NEXT:    st1 { v1.2d }, [x11]
; CHECK-BE-NEXT:    add x11, x9, #48
; CHECK-BE-NEXT:    ushll2 v1.2d, v4.4s, #0
; CHECK-BE-NEXT:    st1 { v0.2d }, [x11]
; CHECK-BE-NEXT:    ushll v0.2d, v4.2s, #0
; CHECK-BE-NEXT:    add x11, x9, #64
; CHECK-BE-NEXT:    st1 { v2.2d }, [x11]
; CHECK-BE-NEXT:    add x11, x9, #32
; CHECK-BE-NEXT:    st1 { v3.2d }, [x11]
; CHECK-BE-NEXT:    add x11, x9, #16
; CHECK-BE-NEXT:    st1 { v0.2d }, [x9]
; CHECK-BE-NEXT:    add x9, x9, #128
; CHECK-BE-NEXT:    st1 { v1.2d }, [x11]
; CHECK-BE-NEXT:    b.ne .LBB18_1
; CHECK-BE-NEXT:  // %bb.2: // %exit
; CHECK-BE-NEXT:    ret


entry:
  br label %loop

loop:
  ; %iv is used both as the byte offset into %src and as the i64 element index
  ; into %dst.
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
  %src.gep = getelementptr i8, ptr %src, i64 %iv
  %load = load <16 x i8>, ptr %src.gep
  ; Second source vector: the 16 bytes immediately after %load.
  %src.gep.2 = getelementptr i8, ptr %src.gep, i64 16
  %load.2 = load <16 x i8>, ptr %src.gep.2
  ; The two zero-extends under test; each result is only stored.
  %ext = zext <16 x i8> %load to <16 x i64>
  %ext.2 = zext <16 x i8> %load.2 to <16 x i64>
  %dst.gep = getelementptr i64, ptr %dst, i64 %iv
  store <16 x i64> %ext, ptr %dst.gep
  ; Second destination slot: the 16 i64 elements immediately after %dst.gep.
  %dst.gep.2 = getelementptr i64, ptr %dst.gep, i64 16
  store <16 x i64> %ext.2, ptr %dst.gep.2
  ; %iv advances by 16 while each iteration touches 32 source bytes and 32
  ; destination elements, so the second vector of one iteration is the first
  ; vector of the next.
  ; NOTE(review): presumably this overlap is intentional (or irrelevant) for a
  ; codegen-only test -- confirm before changing the stride.
  ; The loop exits once %iv.next reaches 128, i.e. after 8 iterations.
  %iv.next = add nuw i64 %iv, 16
  %ec = icmp eq i64 %iv.next, 128
  br i1 %ec, label %exit, label %loop

exit:
  ret void
}

; Scalable-vector variant: a <vscale x 16 x i8> load is zero-extended to
; <vscale x 16 x i32> inside a loop, added to itself, and stored.
;
; The autogenerated assertions below expect the same SVE instruction sequence
; for the little-endian and the big-endian target: four predicated ld1b loads
; into .s-sized elements, four vector adds, and four st1w stores. No tbl
; instructions and no constant-pool masks appear in the expected output for
; this function.
define void @zext_v16i8_to_v16i32_in_loop_scalable_vectors(ptr %src, ptr %dst) {
; CHECK-LABEL: zext_v16i8_to_v16i32_in_loop_scalable_vectors:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    ptrue p0.s
; CHECK-NEXT:    mov x8, xzr
; CHECK-NEXT:  LBB19_1: ; %loop
; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    ld1b { z0.s }, p0/z, [x0, x8]
; CHECK-NEXT:    add x9, x0, x8
; CHECK-NEXT:    ld1b { z1.s }, p0/z, [x9, #2, mul vl]
; CHECK-NEXT:    ld1b { z2.s }, p0/z, [x9, #3, mul vl]
; CHECK-NEXT:    ld1b { z3.s }, p0/z, [x9, #1, mul vl]
; CHECK-NEXT:    add x9, x1, x8, lsl #2
; CHECK-NEXT:    add z0.s, z0.s, z0.s
; CHECK-NEXT:    add z1.s, z1.s, z1.s
; CHECK-NEXT:    add z2.s, z2.s, z2.s
; CHECK-NEXT:    add z3.s, z3.s, z3.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x1, x8, lsl #2]
; CHECK-NEXT:    add x8, x8, #16
; CHECK-NEXT:    cmp x8, #128
; CHECK-NEXT:    st1w { z1.s }, p0, [x9, #2, mul vl]
; CHECK-NEXT:    st1w { z2.s }, p0, [x9, #3, mul vl]
; CHECK-NEXT:    st1w { z3.s }, p0, [x9, #1, mul vl]
; CHECK-NEXT:    b.ne LBB19_1
; CHECK-NEXT:  ; %bb.2: ; %exit
; CHECK-NEXT:    ret
;
; CHECK-BE-LABEL: zext_v16i8_to_v16i32_in_loop_scalable_vectors:
; CHECK-BE:       // %bb.0: // %entry
; CHECK-BE-NEXT:    ptrue p0.s
; CHECK-BE-NEXT:    mov x8, xzr
; CHECK-BE-NEXT:  .LBB19_1: // %loop
; CHECK-BE-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-BE-NEXT:    ld1b { z0.s }, p0/z, [x0, x8]
; CHECK-BE-NEXT:    add x9, x0, x8
; CHECK-BE-NEXT:    ld1b { z1.s }, p0/z, [x9, #2, mul vl]
; CHECK-BE-NEXT:    ld1b { z2.s }, p0/z, [x9, #3, mul vl]
; CHECK-BE-NEXT:    ld1b { z3.s }, p0/z, [x9, #1, mul vl]
; CHECK-BE-NEXT:    add x9, x1, x8, lsl #2
; CHECK-BE-NEXT:    add z0.s, z0.s, z0.s
; CHECK-BE-NEXT:    add z1.s, z1.s, z1.s
; CHECK-BE-NEXT:    add z2.s, z2.s, z2.s
; CHECK-BE-NEXT:    add z3.s, z3.s, z3.s
; CHECK-BE-NEXT:    st1w { z0.s }, p0, [x1, x8, lsl #2]
; CHECK-BE-NEXT:    add x8, x8, #16
; CHECK-BE-NEXT:    cmp x8, #128
; CHECK-BE-NEXT:    st1w { z1.s }, p0, [x9, #2, mul vl]
; CHECK-BE-NEXT:    st1w { z2.s }, p0, [x9, #3, mul vl]
; CHECK-BE-NEXT:    st1w { z3.s }, p0, [x9, #1, mul vl]
; CHECK-BE-NEXT:    b.ne .LBB19_1
; CHECK-BE-NEXT:  // %bb.2: // %exit
; CHECK-BE-NEXT:    ret
entry:
  br label %loop

loop:
  ; %iv is used both as the byte offset into %src and as the i32 element index
  ; into %dst.
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
  %src.gep = getelementptr i8, ptr %src, i64 %iv
  %load = load <vscale x 16 x i8>, ptr %src.gep
  ; The zero-extend under test, on a scalable vector type.
  %ext = zext <vscale x 16 x i8> %load to <vscale x 16 x i32>
  ; The extended value is used as both operands of the add before being stored.
  %add = add <vscale x 16 x i32> %ext, %ext
  %dst.gep = getelementptr i32, ptr %dst, i64 %iv
  store <vscale x 16 x i32> %add, ptr %dst.gep
  ; %iv advances by a fixed 16 regardless of vscale; the loop exits once
  ; %iv.next reaches 128, i.e. after 8 iterations.
  %iv.next = add nuw i64 %iv, 16
  %ec = icmp eq i64 %iv.next, 128
  br i1 %ec, label %exit, label %loop

exit:
  ret void
}

; CHECK-LABEL:  lCPI20_0:
; CHECK-NEXT:	.byte	0                               ; 0x0
; CHECK-NEXT:	.byte	255                             ; 0xff
; CHECK-NEXT:	.byte	255                             ; 0xff
; CHECK-NEXT:	.byte	1                               ; 0x1
; CHECK-NEXT:	.byte	255                             ; 0xff
; CHECK-NEXT:	.byte	255                             ; 0xff
; CHECK-NEXT:	.byte	2                               ; 0x2
; CHECK-NEXT:	.byte	255                             ; 0xff
; CHECK-NEXT:	.byte	255                             ; 0xff
; CHECK-NEXT:	.byte	3                               ; 0x3
; CHECK-NEXT:	.byte	255                             ; 0xff
; CHECK-NEXT:	.byte	255                             ; 0xff
; CHECK-NEXT:	.byte	4                               ; 0x4
; CHECK-NEXT:	.byte	255                             ; 0xff
; CHECK-NEXT:	.byte	255                             ; 0xff
; CHECK-NEXT:	.byte	5                               ; 0x5
; CHECK-NEXT:lCPI20_1:
; CHECK-NEXT:	.byte	255                             ; 0xff
; CHECK-NEXT:	.byte	255                             ; 0xff
; CHECK-NEXT:	.byte	6                               ; 0x6
; CHECK-NEXT:	.byte	255                             ; 0xff
; CHECK-NEXT:	.byte	255                             ; 0xff
; CHECK-NEXT:	.byte	7                               ; 0x7
; CHECK-NEXT:	.byte	255                             ; 0xff
; CHECK-NEXT:	.byte	255                             ; 0xff
; CHECK-NEXT:	.byte	8                               ; 0x8
; CHECK-NEXT:	.byte	255                             ; 0xff
; CHECK-NEXT:	.byte	255                             ; 0xff
; CHECK-NEXT:	.byte	9                               ; 0x9
; CHECK-NEXT:	.byte	255                             ; 0xff
; CHECK-NEXT:	.byte	255                             ; 0xff
; CHECK-NEXT:	.byte	10                              ; 0xa
; CHECK-NEXT:	.byte	255                             ; 0xff
; CHECK-NEXT:lCPI20_2:
; CHECK-NEXT:	.byte	255                             ; 0xff
; CHECK-NEXT:	.byte	11                              ; 0xb
; CHECK-NEXT:	.byte	255                             ; 0xff
; CHECK-NEXT:	.byte	255                             ; 0xff
; CHECK-NEXT:	.byte	12                              ; 0xc
; CHECK-NEXT:	.byte	255                             ; 0xff
; CHECK-NEXT:	.byte	255                             ; 0xff
; CHECK-NEXT:	.byte	13                              ; 0xd
; CHECK-NEXT:	.byte	255                             ; 0xff
; CHECK-NEXT:	.byte	255                             ; 0xff
; CHECK-NEXT:	.byte	14                              ; 0xe
; CHECK-NEXT:	.byte	255                             ; 0xff
; CHECK-NEXT:	.byte	255                             ; 0xff
; CHECK-NEXT:	.byte	15                              ; 0xf
; CHECK-NEXT:	.byte	255                             ; 0xff
; CHECK-NEXT:	.byte	255                             ; 0xff
; CHECK-NEXT:lCPI20_3:
; CHECK-NEXT:	.byte	0                               ; 0x0
; CHECK-NEXT:	.byte	255                             ; 0xff
; CHECK-NEXT:	.byte	255                             ; 0xff
; CHECK-NEXT:	.byte	1                               ; 0x1
; CHECK-NEXT:	.byte	255                             ; 0xff
; CHECK-NEXT:	.byte	255                             ; 0xff
; CHECK-NEXT:	.byte	2                               ; 0x2
; CHECK-NEXT:	.byte	255                             ; 0xff
; CHECK-NEXT:	.byte	255                             ; 0xff
; CHECK-NEXT:	.byte	3                               ; 0x3
; CHECK-NEXT:	.byte	255                             ; 0xff
; CHECK-NEXT:	.byte	255                             ; 0xff
; CHECK-NEXT:	.byte	255                             ; 0xff
; CHECK-NEXT:	.byte	255                             ; 0xff
; CHECK-NEXT:	.byte	255                             ; 0xff
; CHECK-NEXT:	.byte	255                             ; 0xff

; CHECK-BE-LABEL: .LCPI20_0:
; CHECK-BE-NEXT:  	.byte	255                             // 0xff
; CHECK-BE-NEXT:  	.byte	255                             // 0xff
; CHECK-BE-NEXT:  	.byte	0                               // 0x0
; CHECK-BE-NEXT:  	.byte	255                             // 0xff
; CHECK-BE-NEXT:  	.byte	255                             // 0xff
; CHECK-BE-NEXT:  	.byte	1                               // 0x1
; CHECK-BE-NEXT:  	.byte	255                             // 0xff
; CHECK-BE-NEXT:  	.byte	255                             // 0xff
; CHECK-BE-NEXT:  	.byte	2                               // 0x2
; CHECK-BE-NEXT:  	.byte	255                             // 0xff
; CHECK-BE-NEXT:  	.byte	255                             // 0xff
; CHECK-BE-NEXT:  	.byte	3                               // 0x3
; CHECK-BE-NEXT:  	.byte	255                             // 0xff
; CHECK-BE-NEXT:  	.byte	255                             // 0xff
; CHECK-BE-NEXT:  	.byte	255                             // 0xff
; CHECK-BE-NEXT:  	.byte	255                             // 0xff
; CHECK-BE-NEXT:  .LCPI20_1:
; CHECK-BE-NEXT:  	.byte	255                             // 0xff
; CHECK-BE-NEXT:  	.byte	255                             // 0xff
; CHECK-BE-NEXT:  	.byte	0                               // 0x0
; CHECK-BE-NEXT:  	.byte	255                             // 0xff
; CHECK-BE-NEXT:  	.byte	255                             // 0xff
; CHECK-BE-NEXT:  	.byte	1                               // 0x1
; CHECK-BE-NEXT:  	.byte	255                             // 0xff
; CHECK-BE-NEXT:  	.byte	255                             // 0xff
; CHECK-BE-NEXT:  	.byte	2                               // 0x2
; CHECK-BE-NEXT:  	.byte	255                             // 0xff
; CHECK-BE-NEXT:  	.byte	255                             // 0xff
; CHECK-BE-NEXT:  	.byte	3                               // 0x3
; CHECK-BE-NEXT:  	.byte	255                             // 0xff
; CHECK-BE-NEXT:  	.byte	255                             // 0xff
; CHECK-BE-NEXT:  	.byte	4                               // 0x4
; CHECK-BE-NEXT:  	.byte	255                             // 0xff
; CHECK-BE-NEXT:  .LCPI20_2:
; CHECK-BE-NEXT:  	.byte	255                             // 0xff
; CHECK-BE-NEXT:  	.byte	5                               // 0x5
; CHECK-BE-NEXT:  	.byte	255                             // 0xff
; CHECK-BE-NEXT:  	.byte	255                             // 0xff
; CHECK-BE-NEXT:  	.byte	6                               // 0x6
; CHECK-BE-NEXT:  	.byte	255                             // 0xff
; CHECK-BE-NEXT:  	.byte	255                             // 0xff
; CHECK-BE-NEXT:  	.byte	7                               // 0x7
; CHECK-BE-NEXT:  	.byte	255                             // 0xff
; CHECK-BE-NEXT:  	.byte	255                             // 0xff
; CHECK-BE-NEXT:  	.byte	8                               // 0x8
; CHECK-BE-NEXT:  	.byte	255                             // 0xff
; CHECK-BE-NEXT:  	.byte	255                             // 0xff
; CHECK-BE-NEXT:  	.byte	9                               // 0x9
; CHECK-BE-NEXT:  	.byte	255                             // 0xff
; CHECK-BE-NEXT:  	.byte	255                             // 0xff
; CHECK-BE-NEXT:  .LCPI20_3:
; CHECK-BE-NEXT:  	.byte	10                              // 0xa
; CHECK-BE-NEXT:  	.byte	255                             // 0xff
; CHECK-BE-NEXT:  	.byte	255                             // 0xff
; CHECK-BE-NEXT:  	.byte	11                              // 0xb
; CHECK-BE-NEXT:  	.byte	255                             // 0xff
; CHECK-BE-NEXT:  	.byte	255                             // 0xff
; CHECK-BE-NEXT:  	.byte	12                              // 0xc
; CHECK-BE-NEXT:  	.byte	255                             // 0xff
; CHECK-BE-NEXT:  	.byte	255                             // 0xff
; CHECK-BE-NEXT:  	.byte	13                              // 0xd
; CHECK-BE-NEXT:  	.byte	255                             // 0xff
; CHECK-BE-NEXT:  	.byte	255                             // 0xff
; CHECK-BE-NEXT:  	.byte	14                              // 0xe
; CHECK-BE-NEXT:  	.byte	255                             // 0xff
; CHECK-BE-NEXT:  	.byte	255                             // 0xff
; CHECK-BE-NEXT:  	.byte	15                              // 0xf

; Zero-extend of a vector whose element count (20) and destination element
; type (i24) are both not powers of two.
;
; Each loop iteration loads <20 x i8> from %src, zero-extends it to
; <20 x i24>, and stores it to %dst. The autogenerated assertions below expect
; four shuffle masks (constant-pool entries 20_0 through 20_3) to be loaded in
; the entry block and the loop body to use four tbl instructions. The result
; is written as three 16-byte stores, one 8-byte store and one 4-byte lane
; store (60 bytes in total), and the destination pointer advances by 64 bytes
; per iteration. The big-endian target additionally emits rev32/rev64 on the
; last tbl result before the two partial stores.
define void @zext_v20i8_to_v20i24_in_loop(ptr %src, ptr %dst) {
; CHECK-LABEL: zext_v20i8_to_v20i24_in_loop:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:  Lloh22:
; CHECK-NEXT:    adrp x8, lCPI20_0@PAGE
; CHECK-NEXT:  Lloh23:
; CHECK-NEXT:    adrp x9, lCPI20_1@PAGE
; CHECK-NEXT:  Lloh24:
; CHECK-NEXT:    adrp x10, lCPI20_2@PAGE
; CHECK-NEXT:  Lloh25:
; CHECK-NEXT:    ldr q0, [x8, lCPI20_0@PAGEOFF]
; CHECK-NEXT:  Lloh26:
; CHECK-NEXT:    adrp x8, lCPI20_3@PAGE
; CHECK-NEXT:  Lloh27:
; CHECK-NEXT:    ldr q1, [x9, lCPI20_1@PAGEOFF]
; CHECK-NEXT:  Lloh28:
; CHECK-NEXT:    ldr q2, [x10, lCPI20_2@PAGEOFF]
; CHECK-NEXT:  Lloh29:
; CHECK-NEXT:    ldr q3, [x8, lCPI20_3@PAGEOFF]
; CHECK-NEXT:    mov x8, xzr
; CHECK-NEXT:  LBB20_1: ; %loop
; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    add x9, x0, x8
; CHECK-NEXT:    add x8, x8, #16
; CHECK-NEXT:    ldp q5, q4, [x9]
; CHECK-NEXT:    add x9, x1, #56
; CHECK-NEXT:    cmp x8, #128
; CHECK-NEXT:    tbl.16b v4, { v4 }, v3
; CHECK-NEXT:    tbl.16b v6, { v5 }, v2
; CHECK-NEXT:    tbl.16b v7, { v5 }, v1
; CHECK-NEXT:    tbl.16b v5, { v5 }, v0
; CHECK-NEXT:    stp q7, q6, [x1, #16]
; CHECK-NEXT:    str q5, [x1]
; CHECK-NEXT:    str d4, [x1, #48]
; CHECK-NEXT:    add x1, x1, #64
; CHECK-NEXT:    st1.s { v4 }[2], [x9]
; CHECK-NEXT:    b.ne LBB20_1
; CHECK-NEXT:  ; %bb.2: ; %exit
; CHECK-NEXT:    ret
; CHECK-NEXT:    .loh AdrpLdr Lloh26, Lloh29
; CHECK-NEXT:    .loh AdrpLdr Lloh24, Lloh28
; CHECK-NEXT:    .loh AdrpLdr Lloh23, Lloh27
; CHECK-NEXT:    .loh AdrpAdrp Lloh22, Lloh26
; CHECK-NEXT:    .loh AdrpLdr Lloh22, Lloh25
;
; CHECK-BE-LABEL: zext_v20i8_to_v20i24_in_loop:
; CHECK-BE:       // %bb.0: // %entry
; CHECK-BE-NEXT:    adrp x8, .LCPI20_0
; CHECK-BE-NEXT:    add x8, x8, :lo12:.LCPI20_0
; CHECK-BE-NEXT:    ld1 { v0.16b }, [x8]
; CHECK-BE-NEXT:    adrp x8, .LCPI20_1
; CHECK-BE-NEXT:    add x8, x8, :lo12:.LCPI20_1
; CHECK-BE-NEXT:    ld1 { v1.16b }, [x8]
; CHECK-BE-NEXT:    adrp x8, .LCPI20_2
; CHECK-BE-NEXT:    add x8, x8, :lo12:.LCPI20_2
; CHECK-BE-NEXT:    ld1 { v2.16b }, [x8]
; CHECK-BE-NEXT:    adrp x8, .LCPI20_3
; CHECK-BE-NEXT:    add x8, x8, :lo12:.LCPI20_3
; CHECK-BE-NEXT:    ld1 { v3.16b }, [x8]
; CHECK-BE-NEXT:    mov x8, xzr
; CHECK-BE-NEXT:  .LBB20_1: // %loop
; CHECK-BE-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-BE-NEXT:    add x9, x0, x8
; CHECK-BE-NEXT:    add x8, x8, #16
; CHECK-BE-NEXT:    add x10, x9, #16
; CHECK-BE-NEXT:    ld1 { v5.16b }, [x9]
; CHECK-BE-NEXT:    add x9, x1, #32
; CHECK-BE-NEXT:    ld1 { v4.16b }, [x10]
; CHECK-BE-NEXT:    cmp x8, #128
; CHECK-BE-NEXT:    tbl v6.16b, { v5.16b }, v3.16b
; CHECK-BE-NEXT:    tbl v7.16b, { v5.16b }, v2.16b
; CHECK-BE-NEXT:    tbl v5.16b, { v5.16b }, v1.16b
; CHECK-BE-NEXT:    tbl v4.16b, { v4.16b }, v0.16b
; CHECK-BE-NEXT:    st1 { v6.16b }, [x9]
; CHECK-BE-NEXT:    add x9, x1, #16
; CHECK-BE-NEXT:    rev32 v16.16b, v4.16b
; CHECK-BE-NEXT:    rev64 v4.16b, v4.16b
; CHECK-BE-NEXT:    st1 { v7.16b }, [x9]
; CHECK-BE-NEXT:    add x9, x1, #56
; CHECK-BE-NEXT:    st1 { v5.16b }, [x1]
; CHECK-BE-NEXT:    str d4, [x1, #48]
; CHECK-BE-NEXT:    add x1, x1, #64
; CHECK-BE-NEXT:    st1 { v16.s }[2], [x9]
; CHECK-BE-NEXT:    b.ne .LBB20_1
; CHECK-BE-NEXT:  // %bb.2: // %exit
; CHECK-BE-NEXT:    ret

entry:
  br label %loop

loop:
  ; %iv is used both as the byte offset into %src and as the i24 element index
  ; into %dst.
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
  %src.gep = getelementptr i8, ptr %src, i64 %iv
  %load = load <20 x i8>, ptr %src.gep
  ; The zero-extend under test: 20 elements, i8 -> i24.
  %ext = zext <20 x i8> %load to <20 x i24>
  %dst.gep = getelementptr i24, ptr %dst, i64 %iv
  store <20 x i24> %ext, ptr %dst.gep
  ; %iv advances by 16 although 20 source bytes are read per iteration; the
  ; loop exits once %iv.next reaches 128, i.e. after 8 iterations.
  %iv.next = add nuw i64 %iv, 16
  %ec = icmp eq i64 %iv.next, 128
  br i1 %ec, label %exit, label %loop

exit:
  ret void
}

; CHECK-LABEL:  lCPI21_0:
; CHECK-NEXT: 	.byte	0                               ; 0x0
; CHECK-NEXT: 	.byte	255                             ; 0xff
; CHECK-NEXT: 	.byte	255                             ; 0xff
; CHECK-NEXT: 	.byte	255                             ; 0xff
; CHECK-NEXT: 	.byte	255                             ; 0xff
; CHECK-NEXT: 	.byte	255                             ; 0xff
; CHECK-NEXT: 	.byte	1                               ; 0x1
; CHECK-NEXT: 	.byte	255                             ; 0xff
; CHECK-NEXT: 	.byte	255                             ; 0xff
; CHECK-NEXT: 	.byte	255                             ; 0xff
; CHECK-NEXT: 	.byte	255                             ; 0xff
; CHECK-NEXT: 	.byte	255                             ; 0xff
; CHECK-NEXT: 	.byte	2                               ; 0x2
; CHECK-NEXT: 	.byte	255                             ; 0xff
; CHECK-NEXT: 	.byte	255                             ; 0xff
; CHECK-NEXT: 	.byte	255                             ; 0xff
; CHECK-NEXT: lCPI21_1:
; CHECK-NEXT: 	.byte	255                             ; 0xff
; CHECK-NEXT: 	.byte	255                             ; 0xff
; CHECK-NEXT: 	.byte	3                               ; 0x3
; CHECK-NEXT: 	.byte	255                             ; 0xff
; CHECK-NEXT: 	.byte	255                             ; 0xff
; CHECK-NEXT: 	.byte	255                             ; 0xff
; CHECK-NEXT: 	.byte	255                             ; 0xff
; CHECK-NEXT: 	.byte	255                             ; 0xff
; CHECK-NEXT: 	.byte	4                               ; 0x4
; CHECK-NEXT: 	.byte	255                             ; 0xff
; CHECK-NEXT: 	.byte	255                             ; 0xff
; CHECK-NEXT: 	.byte	255                             ; 0xff
; CHECK-NEXT: 	.byte	255                             ; 0xff
; CHECK-NEXT: 	.byte	255                             ; 0xff
; CHECK-NEXT: 	.byte	5                               ; 0x5
; CHECK-NEXT: 	.byte	255                             ; 0xff
; CHECK-NEXT: lCPI21_2:
; CHECK-NEXT: 	.byte	255                             ; 0xff
; CHECK-NEXT: 	.byte	255                             ; 0xff
; CHECK-NEXT: 	.byte	255                             ; 0xff
; CHECK-NEXT: 	.byte	255                             ; 0xff
; CHECK-NEXT: 	.byte	6                               ; 0x6
; CHECK-NEXT: 	.byte	255                             ; 0xff
; CHECK-NEXT: 	.byte	255                             ; 0xff
; CHECK-NEXT: 	.byte	255                             ; 0xff
; CHECK-NEXT: 	.byte	255                             ; 0xff
; CHECK-NEXT: 	.byte	255                             ; 0xff
; CHECK-NEXT: 	.byte	7                               ; 0x7
; CHECK-NEXT: 	.byte	255                             ; 0xff
; CHECK-NEXT: 	.byte	255                             ; 0xff
; CHECK-NEXT: 	.byte	255                             ; 0xff
; CHECK-NEXT: 	.byte	255                             ; 0xff
; CHECK-NEXT: 	.byte	255                             ; 0xff
; CHECK-NEXT: lCPI21_3:
; CHECK-NEXT: 	.byte	8                               ; 0x8
; CHECK-NEXT: 	.byte	255                             ; 0xff
; CHECK-NEXT: 	.byte	255                             ; 0xff
; CHECK-NEXT: 	.byte	255                             ; 0xff
; CHECK-NEXT: 	.byte	255                             ; 0xff
; CHECK-NEXT: 	.byte	255                             ; 0xff
; CHECK-NEXT: 	.byte	9                               ; 0x9
; CHECK-NEXT: 	.byte	255                             ; 0xff
; CHECK-NEXT: 	.byte	255                             ; 0xff
; CHECK-NEXT: 	.byte	255                             ; 0xff
; CHECK-NEXT: 	.byte	255                             ; 0xff
; CHECK-NEXT: 	.byte	255                             ; 0xff
; CHECK-NEXT: 	.byte	10                              ; 0xa
; CHECK-NEXT: 	.byte	255                             ; 0xff
; CHECK-NEXT: 	.byte	255                             ; 0xff
; CHECK-NEXT: 	.byte	255                             ; 0xff
; CHECK-NEXT: lCPI21_4:
; CHECK-NEXT: 	.byte	255                             ; 0xff
; CHECK-NEXT: 	.byte	255                             ; 0xff
; CHECK-NEXT: 	.byte	11                              ; 0xb
; CHECK-NEXT: 	.byte	255                             ; 0xff
; CHECK-NEXT: 	.byte	255                             ; 0xff
; CHECK-NEXT: 	.byte	255                             ; 0xff
; CHECK-NEXT: 	.byte	255                             ; 0xff
; CHECK-NEXT: 	.byte	255                             ; 0xff
; CHECK-NEXT: 	.byte	12                              ; 0xc
; CHECK-NEXT: 	.byte	255                             ; 0xff
; CHECK-NEXT: 	.byte	255                             ; 0xff
; CHECK-NEXT: 	.byte	255                             ; 0xff
; CHECK-NEXT: 	.byte	255                             ; 0xff
; CHECK-NEXT: 	.byte	255                             ; 0xff
; CHECK-NEXT: 	.byte	13                              ; 0xd
; CHECK-NEXT: 	.byte	255                             ; 0xff
; CHECK-NEXT: lCPI21_5:
; CHECK-NEXT: 	.byte	255                             ; 0xff
; CHECK-NEXT: 	.byte	255                             ; 0xff
; CHECK-NEXT: 	.byte	255                             ; 0xff
; CHECK-NEXT: 	.byte	255                             ; 0xff
; CHECK-NEXT: 	.byte	14                              ; 0xe
; CHECK-NEXT: 	.byte	255                             ; 0xff
; CHECK-NEXT: 	.byte	255                             ; 0xff
; CHECK-NEXT: 	.byte	255                             ; 0xff
; CHECK-NEXT: 	.byte	255                             ; 0xff
; CHECK-NEXT: 	.byte	255                             ; 0xff
; CHECK-NEXT: 	.byte	15                              ; 0xf
; CHECK-NEXT: 	.byte	255                             ; 0xff
; CHECK-NEXT: 	.byte	255                             ; 0xff
; CHECK-NEXT: 	.byte	255                             ; 0xff
; CHECK-NEXT: 	.byte	255                             ; 0xff
; CHECK-NEXT: 	.byte	255                             ; 0xff

; CHECK-BE-LABEL:  .LCPI21_0:
; CHECK-BE-NEXT:  	.byte	255                             // 0xff
; CHECK-BE-NEXT:  	.byte	255                             // 0xff
; CHECK-BE-NEXT:  	.byte	255                             // 0xff
; CHECK-BE-NEXT:  	.byte	5                               // 0x5
; CHECK-BE-NEXT:  	.byte	255                             // 0xff
; CHECK-BE-NEXT:  	.byte	255                             // 0xff
; CHECK-BE-NEXT:  	.byte	255                             // 0xff
; CHECK-BE-NEXT:  	.byte	255                             // 0xff
; CHECK-BE-NEXT:  	.byte	255                             // 0xff
; CHECK-BE-NEXT:  	.byte	6                               // 0x6
; CHECK-BE-NEXT:  	.byte	255                             // 0xff
; CHECK-BE-NEXT:  	.byte	255                             // 0xff
; CHECK-BE-NEXT:  	.byte	255                             // 0xff
; CHECK-BE-NEXT:  	.byte	255                             // 0xff
; CHECK-BE-NEXT:  	.byte	255                             // 0xff
; CHECK-BE-NEXT:  	.byte	255                             // 0xff
; CHECK-BE-NEXT:  .LCPI21_1:
; CHECK-BE-NEXT:  	.byte	255                             // 0xff
; CHECK-BE-NEXT:  	.byte	255                             // 0xff
; CHECK-BE-NEXT:  	.byte	255                             // 0xff
; CHECK-BE-NEXT:  	.byte	255                             // 0xff
; CHECK-BE-NEXT:  	.byte	255                             // 0xff
; CHECK-BE-NEXT:  	.byte	0                               // 0x0
; CHECK-BE-NEXT:  	.byte	255                             // 0xff
; CHECK-BE-NEXT:  	.byte	255                             // 0xff
; CHECK-BE-NEXT:  	.byte	255                             // 0xff
; CHECK-BE-NEXT:  	.byte	255                             // 0xff
; CHECK-BE-NEXT:  	.byte	255                             // 0xff
; CHECK-BE-NEXT:  	.byte	1                               // 0x1
; CHECK-BE-NEXT:  	.byte	255                             // 0xff
; CHECK-BE-NEXT:  	.byte	255                             // 0xff
; CHECK-BE-NEXT:  	.byte	255                             // 0xff
; CHECK-BE-NEXT:  	.byte	255                             // 0xff
; CHECK-BE-NEXT:  .LCPI21_2:
; CHECK-BE-NEXT:  	.byte	255                             // 0xff
; CHECK-BE-NEXT:  	.byte	2                               // 0x2
; CHECK-BE-NEXT:  	.byte	255                             // 0xff
; CHECK-BE-NEXT:  	.byte	255                             // 0xff
; CHECK-BE-NEXT:  	.byte	255                             // 0xff
; CHECK-BE-NEXT:  	.byte	255                             // 0xff
; CHECK-BE-NEXT:  	.byte	255                             // 0xff
; CHECK-BE-NEXT:  	.byte	3                               // 0x3
; CHECK-BE-NEXT:  	.byte	255                             // 0xff
; CHECK-BE-NEXT:  	.byte	255                             // 0xff
; CHECK-BE-NEXT:  	.byte	255                             // 0xff
; CHECK-BE-NEXT:  	.byte	255                             // 0xff
; CHECK-BE-NEXT:  	.byte	255                             // 0xff
; CHECK-BE-NEXT:  	.byte	4                               // 0x4
; CHECK-BE-NEXT:  	.byte	255                             // 0xff
; CHECK-BE-NEXT:  	.byte	255                             // 0xff
; CHECK-BE-NEXT:  .LCPI21_3:
; CHECK-BE-NEXT:  	.byte	255                             // 0xff
; CHECK-BE-NEXT:  	.byte	255                             // 0xff
; CHECK-BE-NEXT:  	.byte	255                             // 0xff
; CHECK-BE-NEXT:  	.byte	5                               // 0x5
; CHECK-BE-NEXT:  	.byte	255                             // 0xff
; CHECK-BE-NEXT:  	.byte	255                             // 0xff
; CHECK-BE-NEXT:  	.byte	255                             // 0xff
; CHECK-BE-NEXT:  	.byte	255                             // 0xff
; CHECK-BE-NEXT:  	.byte	255                             // 0xff
; CHECK-BE-NEXT:  	.byte	6                               // 0x6
; CHECK-BE-NEXT:  	.byte	255                             // 0xff
; CHECK-BE-NEXT:  	.byte	255                             // 0xff
; CHECK-BE-NEXT:  	.byte	255                             // 0xff
; CHECK-BE-NEXT:  	.byte	255                             // 0xff
; CHECK-BE-NEXT:  	.byte	255                             // 0xff
; CHECK-BE-NEXT:  	.byte	7                               // 0x7
; CHECK-BE-NEXT:  .LCPI21_4:
; CHECK-BE-NEXT:  	.byte	255                             // 0xff
; CHECK-BE-NEXT:  	.byte	255                             // 0xff
; CHECK-BE-NEXT:  	.byte	255                             // 0xff
; CHECK-BE-NEXT:  	.byte	255                             // 0xff
; CHECK-BE-NEXT:  	.byte	255                             // 0xff
; CHECK-BE-NEXT:  	.byte	8                               // 0x8
; CHECK-BE-NEXT:  	.byte	255                             // 0xff
; CHECK-BE-NEXT:  	.byte	255                             // 0xff
; CHECK-BE-NEXT:  	.byte	255                             // 0xff
; CHECK-BE-NEXT:  	.byte	255                             // 0xff
; CHECK-BE-NEXT:  	.byte	255                             // 0xff
; CHECK-BE-NEXT:  	.byte	9                               // 0x9
; CHECK-BE-NEXT:  	.byte	255                             // 0xff
; CHECK-BE-NEXT:  	.byte	255                             // 0xff
; CHECK-BE-NEXT:  	.byte	255                             // 0xff
; CHECK-BE-NEXT:  	.byte	255                             // 0xff
; CHECK-BE-NEXT:  .LCPI21_5:
; CHECK-BE-NEXT:  	.byte	255                             // 0xff
; CHECK-BE-NEXT:  	.byte	10                              // 0xa
; CHECK-BE-NEXT:  	.byte	255                             // 0xff
; CHECK-BE-NEXT:  	.byte	255                             // 0xff
; CHECK-BE-NEXT:  	.byte	255                             // 0xff
; CHECK-BE-NEXT:  	.byte	255                             // 0xff
; CHECK-BE-NEXT:  	.byte	255                             // 0xff
; CHECK-BE-NEXT:  	.byte	11                              // 0xb
; CHECK-BE-NEXT:  	.byte	255                             // 0xff
; CHECK-BE-NEXT:  	.byte	255                             // 0xff
; CHECK-BE-NEXT:  	.byte	255                             // 0xff
; CHECK-BE-NEXT:  	.byte	255                             // 0xff
; CHECK-BE-NEXT:  	.byte	255                             // 0xff
; CHECK-BE-NEXT:  	.byte	12                              // 0xc
; CHECK-BE-NEXT:  	.byte	255                             // 0xff
; CHECK-BE-NEXT:  	.byte	255                             // 0xff
; CHECK-BE-NEXT:  .LCPI21_6:
; CHECK-BE-NEXT:  	.byte	255                             // 0xff
; CHECK-BE-NEXT:  	.byte	255                             // 0xff
; CHECK-BE-NEXT:  	.byte	255                             // 0xff
; CHECK-BE-NEXT:  	.byte	13                              // 0xd
; CHECK-BE-NEXT:  	.byte	255                             // 0xff
; CHECK-BE-NEXT:  	.byte	255                             // 0xff
; CHECK-BE-NEXT:  	.byte	255                             // 0xff
; CHECK-BE-NEXT:  	.byte	255                             // 0xff
; CHECK-BE-NEXT:  	.byte	255                             // 0xff
; CHECK-BE-NEXT:  	.byte	14                              // 0xe
; CHECK-BE-NEXT:  	.byte	255                             // 0xff
; CHECK-BE-NEXT:  	.byte	255                             // 0xff
; CHECK-BE-NEXT:  	.byte	255                             // 0xff
; CHECK-BE-NEXT:  	.byte	255                             // 0xff
; CHECK-BE-NEXT:  	.byte	255                             // 0xff
; CHECK-BE-NEXT:  	.byte	15                              // 0xf

; Loop test: each iteration loads <23 x i8> from %src + %iv, zero-extends it to
; <23 x i48> and stores the result through an i48-typed GEP on %dst. Neither the
; element count (23) nor the destination element width (48 bits) is a power of
; two. The little-endian assertions expect the constant-pool masks to be loaded
; before the loop and the extension to be done with tbl; the big-endian
; assertions expect tbl plus rev64/rev16.
define void @zext_v23i8_to_v23i48_in_loop(ptr %src, ptr %dst) {
; CHECK-LABEL: zext_v23i8_to_v23i48_in_loop:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:  Lloh30:
; CHECK-NEXT:    adrp x8, lCPI21_0@PAGE
; CHECK-NEXT:  Lloh31:
; CHECK-NEXT:    adrp x9, lCPI21_1@PAGE
; CHECK-NEXT:  Lloh32:
; CHECK-NEXT:    adrp x10, lCPI21_2@PAGE
; CHECK-NEXT:  Lloh33:
; CHECK-NEXT:    ldr q0, [x8, lCPI21_0@PAGEOFF]
; CHECK-NEXT:  Lloh34:
; CHECK-NEXT:    ldr q1, [x9, lCPI21_1@PAGEOFF]
; CHECK-NEXT:  Lloh35:
; CHECK-NEXT:    ldr q2, [x10, lCPI21_2@PAGEOFF]
; CHECK-NEXT:  Lloh36:
; CHECK-NEXT:    adrp x8, lCPI21_3@PAGE
; CHECK-NEXT:  Lloh37:
; CHECK-NEXT:    adrp x9, lCPI21_4@PAGE
; CHECK-NEXT:  Lloh38:
; CHECK-NEXT:    adrp x10, lCPI21_5@PAGE
; CHECK-NEXT:  Lloh39:
; CHECK-NEXT:    ldr q3, [x8, lCPI21_3@PAGEOFF]
; CHECK-NEXT:  Lloh40:
; CHECK-NEXT:    ldr q4, [x9, lCPI21_4@PAGEOFF]
; CHECK-NEXT:  Lloh41:
; CHECK-NEXT:    ldr q5, [x10, lCPI21_5@PAGEOFF]
; CHECK-NEXT:    mov x8, xzr
; CHECK-NEXT:  LBB21_1: ; %loop
; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    add x9, x0, x8
; CHECK-NEXT:    movi.2d v19, #0000000000000000
; CHECK-NEXT:    add x8, x8, #16
; CHECK-NEXT:    ldp q7, q6, [x9]
; CHECK-NEXT:    cmp x8, #128
; CHECK-NEXT:    strh wzr, [x1, #136]
; CHECK-NEXT:    tbl.16b v16, { v6 }, v1
; CHECK-NEXT:    tbl.16b v17, { v6 }, v0
; CHECK-NEXT:    mov.b v19[4], v6[6]
; CHECK-NEXT:    tbl.16b v18, { v7 }, v5
; CHECK-NEXT:    tbl.16b v20, { v7 }, v4
; CHECK-NEXT:    tbl.16b v21, { v7 }, v3
; CHECK-NEXT:    stp q17, q16, [x1, #96]
; CHECK-NEXT:    tbl.16b v16, { v7 }, v2
; CHECK-NEXT:    tbl.16b v17, { v7 }, v1
; CHECK-NEXT:    tbl.16b v7, { v7 }, v0
; CHECK-NEXT:    fmov x9, d19
; CHECK-NEXT:    stp q20, q18, [x1, #64]
; CHECK-NEXT:    stp q16, q21, [x1, #32]
; CHECK-NEXT:    stp q7, q17, [x1]
; CHECK-NEXT:    str x9, [x1, #128]!
; CHECK-NEXT:    b.ne LBB21_1
; CHECK-NEXT:  ; %bb.2: ; %exit
; CHECK-NEXT:    ret
; CHECK-NEXT:    .loh AdrpLdr Lloh38, Lloh41
; CHECK-NEXT:    .loh AdrpLdr Lloh37, Lloh40
; CHECK-NEXT:    .loh AdrpLdr Lloh36, Lloh39
; CHECK-NEXT:    .loh AdrpAdrp Lloh32, Lloh38
; CHECK-NEXT:    .loh AdrpLdr Lloh32, Lloh35
; CHECK-NEXT:    .loh AdrpAdrp Lloh31, Lloh37
; CHECK-NEXT:    .loh AdrpLdr Lloh31, Lloh34
; CHECK-NEXT:    .loh AdrpAdrp Lloh30, Lloh36
; CHECK-NEXT:    .loh AdrpLdr Lloh30, Lloh33
;
; CHECK-BE-LABEL: zext_v23i8_to_v23i48_in_loop:
; CHECK-BE:       // %bb.0: // %entry
; CHECK-BE-NEXT:    adrp x8, .LCPI21_0
; CHECK-BE-NEXT:    add x8, x8, :lo12:.LCPI21_0
; CHECK-BE-NEXT:    ld1 { v0.16b }, [x8]
; CHECK-BE-NEXT:    adrp x8, .LCPI21_1
; CHECK-BE-NEXT:    add x8, x8, :lo12:.LCPI21_1
; CHECK-BE-NEXT:    ld1 { v1.16b }, [x8]
; CHECK-BE-NEXT:    adrp x8, .LCPI21_2
; CHECK-BE-NEXT:    add x8, x8, :lo12:.LCPI21_2
; CHECK-BE-NEXT:    ld1 { v2.16b }, [x8]
; CHECK-BE-NEXT:    adrp x8, .LCPI21_3
; CHECK-BE-NEXT:    add x8, x8, :lo12:.LCPI21_3
; CHECK-BE-NEXT:    ld1 { v3.16b }, [x8]
; CHECK-BE-NEXT:    adrp x8, .LCPI21_4
; CHECK-BE-NEXT:    add x8, x8, :lo12:.LCPI21_4
; CHECK-BE-NEXT:    ld1 { v4.16b }, [x8]
; CHECK-BE-NEXT:    adrp x8, .LCPI21_5
; CHECK-BE-NEXT:    add x8, x8, :lo12:.LCPI21_5
; CHECK-BE-NEXT:    ld1 { v5.16b }, [x8]
; CHECK-BE-NEXT:    adrp x8, .LCPI21_6
; CHECK-BE-NEXT:    add x8, x8, :lo12:.LCPI21_6
; CHECK-BE-NEXT:    ld1 { v6.16b }, [x8]
; CHECK-BE-NEXT:    mov x8, xzr
; CHECK-BE-NEXT:  .LBB21_1: // %loop
; CHECK-BE-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-BE-NEXT:    add x9, x0, x8
; CHECK-BE-NEXT:    add x8, x8, #16
; CHECK-BE-NEXT:    ld1 { v7.16b }, [x9]
; CHECK-BE-NEXT:    add x9, x9, #16
; CHECK-BE-NEXT:    cmp x8, #128
; CHECK-BE-NEXT:    ld1 { v17.16b }, [x9]
; CHECK-BE-NEXT:    add x9, x1, #80
; CHECK-BE-NEXT:    tbl v16.16b, { v7.16b }, v6.16b
; CHECK-BE-NEXT:    tbl v18.16b, { v7.16b }, v5.16b
; CHECK-BE-NEXT:    tbl v19.16b, { v7.16b }, v4.16b
; CHECK-BE-NEXT:    tbl v20.16b, { v7.16b }, v3.16b
; CHECK-BE-NEXT:    tbl v21.16b, { v17.16b }, v0.16b
; CHECK-BE-NEXT:    st1 { v16.16b }, [x9]
; CHECK-BE-NEXT:    add x9, x1, #64
; CHECK-BE-NEXT:    tbl v16.16b, { v7.16b }, v2.16b
; CHECK-BE-NEXT:    st1 { v18.16b }, [x9]
; CHECK-BE-NEXT:    add x9, x1, #48
; CHECK-BE-NEXT:    tbl v18.16b, { v17.16b }, v2.16b
; CHECK-BE-NEXT:    st1 { v19.16b }, [x9]
; CHECK-BE-NEXT:    add x9, x1, #32
; CHECK-BE-NEXT:    tbl v17.16b, { v17.16b }, v1.16b
; CHECK-BE-NEXT:    st1 { v20.16b }, [x9]
; CHECK-BE-NEXT:    add x9, x1, #16
; CHECK-BE-NEXT:    rev64 v19.16b, v21.16b
; CHECK-BE-NEXT:    st1 { v16.16b }, [x9]
; CHECK-BE-NEXT:    rev16 v16.16b, v21.16b
; CHECK-BE-NEXT:    add x9, x1, #112
; CHECK-BE-NEXT:    st1 { v18.16b }, [x9]
; CHECK-BE-NEXT:    add x9, x1, #96
; CHECK-BE-NEXT:    tbl v7.16b, { v7.16b }, v1.16b
; CHECK-BE-NEXT:    st1 { v17.16b }, [x9]
; CHECK-BE-NEXT:    add x9, x1, #136
; CHECK-BE-NEXT:    st1 { v16.h }[4], [x9]
; CHECK-BE-NEXT:    fmov x9, d19
; CHECK-BE-NEXT:    st1 { v7.16b }, [x1]
; CHECK-BE-NEXT:    str x9, [x1, #128]!
; CHECK-BE-NEXT:    b.ne .LBB21_1
; CHECK-BE-NEXT:  // %bb.2: // %exit
; CHECK-BE-NEXT:    ret
; NOTE(review): the line below has spaces between the prefix and the colon, so
; it is presumably not treated as a FileCheck directive; it looks like leftover
; text from a regeneration. Confirm and remove.
; CHECK      :  Lloh30:



entry:
  br label %loop

loop:
  ; %iv starts at 0 and advances by 16 until %iv.next reaches 128.
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
  ; The source is addressed in bytes (i8 GEP) ...
  %src.gep = getelementptr i8, ptr %src, i64 %iv
  %load = load <23 x i8>, ptr %src.gep
  ; The zero-extension under test.
  %ext = zext <23 x i8> %load to <23 x i48>
  ; ... while the destination GEP is indexed with the same %iv in i48 units.
  %dst.gep = getelementptr i48, ptr %dst, i64 %iv
  store <23 x i48> %ext, ptr %dst.gep
  ; The step is 16 although 23 elements are loaded per iteration, so successive
  ; loads cover overlapping source bytes.
  %iv.next = add nuw i64 %iv, 16
  %ec = icmp eq i64 %iv.next, 128
  br i1 %ec, label %exit, label %loop

exit:
  ret void
}

; Loop test: each iteration loads <8 x i8> from %src + %iv, zero-extends it to
; <8 x i33> and stores the result through an i33-typed GEP on %dst. The
; destination element width (33 bits) is not a multiple of 8. The assertions for
; both endiannesses expect ushll/ushll2 widening followed by scalar
; shift-and-orr packing, and contain no tbl.
define void @zext_v8i8_to_v8i33_in_loop(ptr %src, ptr %dst) {
; CHECK-LABEL: zext_v8i8_to_v8i33_in_loop:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    mov x8, xzr
; CHECK-NEXT:  LBB22_1: ; %loop
; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    ldr d0, [x0, x8]
; CHECK-NEXT:    add x8, x8, #16
; CHECK-NEXT:    strb wzr, [x1, #32]
; CHECK-NEXT:    cmp x8, #128
; CHECK-NEXT:    ushll.8h v0, v0, #0
; CHECK-NEXT:    ushll2.4s v1, v0, #0
; CHECK-NEXT:    ushll.4s v0, v0, #0
; CHECK-NEXT:    ushll2.2d v2, v1, #0
; CHECK-NEXT:    ushll.2d v1, v1, #0
; CHECK-NEXT:    ushll2.2d v3, v0, #0
; CHECK-NEXT:    ushll.2d v0, v0, #0
; CHECK-NEXT:    mov.d x9, v2[1]
; CHECK-NEXT:    mov.d x10, v1[1]
; CHECK-NEXT:    fmov x12, d2
; CHECK-NEXT:    mov.d x11, v3[1]
; CHECK-NEXT:    mov.d x13, v0[1]
; CHECK-NEXT:    lsl x9, x9, #39
; CHECK-NEXT:    lsl x10, x10, #37
; CHECK-NEXT:    lsl x11, x11, #35
; CHECK-NEXT:    orr x9, x9, x12, lsl #6
; CHECK-NEXT:    fmov x12, d1
; CHECK-NEXT:    orr x10, x10, x12, lsl #4
; CHECK-NEXT:    fmov x12, d3
; CHECK-NEXT:    stp x10, x9, [x1, #16]
; CHECK-NEXT:    orr x11, x11, x12, lsl #2
; CHECK-NEXT:    fmov x12, d0
; CHECK-NEXT:    orr x9, x12, x13, lsl #33
; CHECK-NEXT:    stp x9, x11, [x1], #128
; CHECK-NEXT:    b.ne LBB22_1
; CHECK-NEXT:  ; %bb.2: ; %exit
; CHECK-NEXT:    ret
;
; CHECK-BE-LABEL: zext_v8i8_to_v8i33_in_loop:
; CHECK-BE:       // %bb.0: // %entry
; CHECK-BE-NEXT:    mov x8, xzr
; CHECK-BE-NEXT:  .LBB22_1: // %loop
; CHECK-BE-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-BE-NEXT:    add x9, x0, x8
; CHECK-BE-NEXT:    add x8, x8, #16
; CHECK-BE-NEXT:    ld1 { v0.8b }, [x9]
; CHECK-BE-NEXT:    cmp x8, #128
; CHECK-BE-NEXT:    ushll v0.8h, v0.8b, #0
; CHECK-BE-NEXT:    ushll2 v1.4s, v0.8h, #0
; CHECK-BE-NEXT:    ushll v0.4s, v0.4h, #0
; CHECK-BE-NEXT:    ushll v2.2d, v1.2s, #0
; CHECK-BE-NEXT:    ushll2 v1.2d, v1.4s, #0
; CHECK-BE-NEXT:    ushll2 v3.2d, v0.4s, #0
; CHECK-BE-NEXT:    ushll v0.2d, v0.2s, #0
; CHECK-BE-NEXT:    mov x9, v2.d[1]
; CHECK-BE-NEXT:    mov x10, v1.d[1]
; CHECK-BE-NEXT:    fmov x13, d1
; CHECK-BE-NEXT:    mov x11, v3.d[1]
; CHECK-BE-NEXT:    mov x12, v0.d[1]
; CHECK-BE-NEXT:    fmov x14, d2
; CHECK-BE-NEXT:    fmov x15, d3
; CHECK-BE-NEXT:    lsl x9, x9, #2
; CHECK-BE-NEXT:    orr x13, x10, x13, lsl #33
; CHECK-BE-NEXT:    strb w10, [x1, #32]
; CHECK-BE-NEXT:    lsl x11, x11, #4
; CHECK-BE-NEXT:    lsl x12, x12, #6
; CHECK-BE-NEXT:    orr x14, x9, x14, lsl #35
; CHECK-BE-NEXT:    extr x9, x9, x13, #8
; CHECK-BE-NEXT:    fmov x13, d0
; CHECK-BE-NEXT:    orr x15, x11, x15, lsl #37
; CHECK-BE-NEXT:    extr x10, x11, x14, #8
; CHECK-BE-NEXT:    orr x11, x12, x13, lsl #39
; CHECK-BE-NEXT:    extr x12, x12, x15, #8
; CHECK-BE-NEXT:    stp x10, x9, [x1, #16]
; CHECK-BE-NEXT:    lsr x9, x11, #8
; CHECK-BE-NEXT:    stp x9, x12, [x1], #128
; CHECK-BE-NEXT:    b.ne .LBB22_1
; CHECK-BE-NEXT:  // %bb.2: // %exit
; CHECK-BE-NEXT:    ret


entry:
  br label %loop

loop:
  ; %iv starts at 0 and advances by 16 until %iv.next reaches 128.
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
  ; The source is addressed in bytes (i8 GEP); 8 bytes are loaded per iteration.
  %src.gep = getelementptr i8, ptr %src, i64 %iv
  %load = load <8 x i8>, ptr %src.gep
  ; The zero-extension under test.
  %ext = zext <8 x i8> %load to <8 x i33>
  ; The destination GEP reuses %iv as an index in i33 units.
  %dst.gep = getelementptr i33, ptr %dst, i64 %iv
  store <8 x i33> %ext, ptr %dst.gep
  %iv.next = add nuw i64 %iv, 16
  %ec = icmp eq i64 %iv.next, 128
  br i1 %ec, label %exit, label %loop

exit:
  ret void
}

; FIXME: Widening instructions should be used instead of tbl.
; NOTE(review): the assertions below already show uabdl/uabal2/uaddlv and no
; tbl, so the FIXME above may be stale; confirm before removing it.
;
; Loop test: both <16 x i8> loads are zero-extended to <16 x i32>, subtracted,
; passed through llvm.abs and llvm.vector.reduce.add, and the scalar result is
; accumulated into %s0 on every iteration. The load addresses depend only on
; the function arguments (%p1, %p2, %lx), not on the loop counter; the
; assertions show the vector work before the loop label and only the scalar
; add/subs/b.ne inside the loop.
define i32 @test_pr62620_widening_instr(ptr %p1, ptr %p2, i64 %lx, i32 %h) {
; CHECK-LABEL: test_pr62620_widening_instr:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    lsl x9, x2, #4
; CHECK-NEXT:    mov x8, x0
; CHECK-NEXT:    mov w0, wzr
; CHECK-NEXT:    ldr q0, [x8, x9]
; CHECK-NEXT:    ldr q1, [x1, x9]
; CHECK-NEXT:    uabdl.8h v2, v0, v1
; CHECK-NEXT:    uabal2.8h v2, v0, v1
; CHECK-NEXT:    uaddlv.8h s0, v2
; CHECK-NEXT:    fmov w8, s0
; CHECK-NEXT:  LBB23_1: ; %loop
; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    subs w3, w3, #1
; CHECK-NEXT:    add w0, w8, w0
; CHECK-NEXT:    b.ne LBB23_1
; CHECK-NEXT:  ; %bb.2: ; %exit
; CHECK-NEXT:    ret
;
; CHECK-BE-LABEL: test_pr62620_widening_instr:
; CHECK-BE:       // %bb.0: // %entry
; CHECK-BE-NEXT:    lsl x9, x2, #4
; CHECK-BE-NEXT:    mov x8, x0
; CHECK-BE-NEXT:    mov w0, wzr
; CHECK-BE-NEXT:    add x8, x8, x9
; CHECK-BE-NEXT:    add x9, x1, x9
; CHECK-BE-NEXT:    ld1 { v0.16b }, [x8]
; CHECK-BE-NEXT:    ld1 { v1.16b }, [x9]
; CHECK-BE-NEXT:    uabdl v2.8h, v0.8b, v1.8b
; CHECK-BE-NEXT:    uabal2 v2.8h, v0.16b, v1.16b
; CHECK-BE-NEXT:    uaddlv s0, v2.8h
; CHECK-BE-NEXT:    fmov w8, s0
; CHECK-BE-NEXT:  .LBB23_1: // %loop
; CHECK-BE-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-BE-NEXT:    subs w3, w3, #1
; CHECK-BE-NEXT:    add w0, w8, w0
; CHECK-BE-NEXT:    b.ne .LBB23_1
; CHECK-BE-NEXT:  // %bb.2: // %exit
; CHECK-BE-NEXT:    ret
entry:
  br label %loop

loop:
  ; %s0 is the running sum; %j.0261 is the iteration counter compared to %h.
  %s0 = phi i32 [ 0, %entry ], [ %op.rdx, %loop ]
  %j.0261 = phi i32 [ 0, %entry ], [ %inc, %loop ]
  ; Both addresses are indexed by the argument %lx in <16 x i8> units, so they
  ; are the same on every iteration.
  %gep.1 = getelementptr inbounds <16 x i8>, ptr %p1, i64 %lx
  %gep.2 = getelementptr inbounds <16 x i8>, ptr %p2, i64 %lx
  %l1 = load <16 x i8>, ptr %gep.1
  %z2 = zext <16 x i8> %l1 to <16 x i32>
  %l4 = load <16 x i8>, ptr %gep.2
  %z5 = zext <16 x i8> %l4 to <16 x i32>
  ; abs(zext(a) - zext(b)) followed by an add reduction.
  %sub = sub nsw <16 x i32> %z2, %z5
  %abs = tail call <16 x i32> @llvm.abs.v16i32(<16 x i32> %sub, i1 true)
  %red = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %abs)
  %op.rdx = add i32 %red, %s0
  %inc = add nuw nsw i32 %j.0261, 1
  %exitcond.not = icmp eq i32 %inc, %h
  br i1 %exitcond.not, label %exit, label %loop

exit:
  ; Single-entry phi carrying the final accumulated sum out of the loop.
  %s1 = phi i32 [ %op.rdx, %loop ]
  ret i32 %s1
}

; Intrinsic declarations used by @test_pr62620_widening_instr above.
; Vector absolute value; the call site passes `i1 true` for the immarg operand.
declare <16 x i32> @llvm.abs.v16i32(<16 x i32>, i1 immarg)

; Add reduction of a <16 x i32> vector to a scalar i32.
declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>)

; Loop test: multiplies zext(<16 x i16>) by zext(<16 x i8>) as <16 x i32> and
; stores the product back through %gep.1. The assertions for both endiannesses
; expect the i8 operand to be widened with ushll/ushll2 and the multiply to be
; done with umull/umull2, with no tbl.
define i32 @test_widening_instr_mull(ptr %p1, ptr %p2, i32 %h) {
; CHECK-LABEL: test_widening_instr_mull:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    mov x8, x0
; CHECK-NEXT:  LBB24_1: ; %loop
; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    ldr q0, [x1], #16
; CHECK-NEXT:    ldr q3, [x0]
; CHECK-NEXT:    ldr q2, [x8, #16]!
; CHECK-NEXT:    subs w2, w2, #1
; CHECK-NEXT:    ushll2.8h v1, v0, #0
; CHECK-NEXT:    ushll.8h v0, v0, #0
; CHECK-NEXT:    umull2.4s v4, v2, v1
; CHECK-NEXT:    umull.4s v1, v2, v1
; CHECK-NEXT:    umull2.4s v2, v3, v0
; CHECK-NEXT:    umull.4s v0, v3, v0
; CHECK-NEXT:    stp q1, q4, [x0, #32]
; CHECK-NEXT:    str q0, [x0]
; CHECK-NEXT:    mov x0, x8
; CHECK-NEXT:    str q2, [x8]
; CHECK-NEXT:    b.ne LBB24_1
; CHECK-NEXT:  ; %bb.2: ; %exit
; CHECK-NEXT:    mov w0, wzr
; CHECK-NEXT:    ret
;
; CHECK-BE-LABEL: test_widening_instr_mull:
; CHECK-BE:       // %bb.0: // %entry
; CHECK-BE-NEXT:  .LBB24_1: // %loop
; CHECK-BE-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-BE-NEXT:    ld1 { v0.16b }, [x1]
; CHECK-BE-NEXT:    ld1 { v1.8h }, [x0]
; CHECK-BE-NEXT:    add x8, x0, #16
; CHECK-BE-NEXT:    ld1 { v3.8h }, [x8]
; CHECK-BE-NEXT:    add x9, x0, #48
; CHECK-BE-NEXT:    add x10, x0, #32
; CHECK-BE-NEXT:    subs w2, w2, #1
; CHECK-BE-NEXT:    add x1, x1, #16
; CHECK-BE-NEXT:    ushll v2.8h, v0.8b, #0
; CHECK-BE-NEXT:    ushll2 v0.8h, v0.16b, #0
; CHECK-BE-NEXT:    umull v4.4s, v1.4h, v2.4h
; CHECK-BE-NEXT:    umull2 v5.4s, v3.8h, v0.8h
; CHECK-BE-NEXT:    umull v0.4s, v3.4h, v0.4h
; CHECK-BE-NEXT:    umull2 v1.4s, v1.8h, v2.8h
; CHECK-BE-NEXT:    st1 { v4.4s }, [x0]
; CHECK-BE-NEXT:    mov x0, x8
; CHECK-BE-NEXT:    st1 { v5.4s }, [x9]
; CHECK-BE-NEXT:    st1 { v0.4s }, [x10]
; CHECK-BE-NEXT:    st1 { v1.4s }, [x8]
; CHECK-BE-NEXT:    b.ne .LBB24_1
; CHECK-BE-NEXT:  // %bb.2: // %exit
; CHECK-BE-NEXT:    mov w0, wzr
; CHECK-BE-NEXT:    ret
entry:
  br label %loop

loop:
  ; %iv counts iterations from 0 until %iv.next equals %h.
  %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
  ; Both GEPs step in <16 x i8> units (16 bytes per iteration), even though
  ; %gep.1 is loaded as <16 x i16> and stored as <16 x i32> below.
  %gep.1 = getelementptr inbounds <16 x i8>, ptr %p1, i32 %iv
  %gep.2 = getelementptr inbounds <16 x i8>, ptr %p2, i32 %iv
  %l1 = load <16 x i16>, ptr %gep.1
  %z2 = zext <16 x i16> %l1 to <16 x i32>
  %l4 = load <16 x i8>, ptr %gep.2
  %z5 = zext <16 x i8> %l4 to <16 x i32>
  ; Multiply of two zero-extended operands: the pattern under test.
  %mul = mul <16 x i32> %z2, %z5
  store <16 x i32> %mul, ptr %gep.1
  %iv.next= add nuw nsw i32 %iv, 1
  %exitcond.not = icmp eq i32 %iv.next, %h
  br i1 %exitcond.not, label %exit, label %loop

exit:
  ret i32 0
}

; Loop test: multiplies zext(<16 x i8>) by zext(<16 x i32>) as <16 x i64> and
; stores the product back through %gep.1. The assertions expect the i8 operand
; to be extended with tbl (masks loaded from the constant pool before the loop)
; and the multiply to be done with umull/umull2 producing 2d results; the
; big-endian assertions additionally expect rev32/ext shuffles.
define i32 @test_widening_instr_mull_64(ptr %p1, ptr %p2, i32 %h) {
; CHECK-LABEL: test_widening_instr_mull_64:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:  Lloh42:
; CHECK-NEXT:    adrp x8, lCPI25_0@PAGE
; CHECK-NEXT:  Lloh43:
; CHECK-NEXT:    adrp x9, lCPI25_1@PAGE
; CHECK-NEXT:  Lloh44:
; CHECK-NEXT:    adrp x10, lCPI25_3@PAGE
; CHECK-NEXT:  Lloh45:
; CHECK-NEXT:    ldr q0, [x8, lCPI25_0@PAGEOFF]
; CHECK-NEXT:  Lloh46:
; CHECK-NEXT:    adrp x8, lCPI25_2@PAGE
; CHECK-NEXT:  Lloh47:
; CHECK-NEXT:    ldr q1, [x9, lCPI25_1@PAGEOFF]
; CHECK-NEXT:  Lloh48:
; CHECK-NEXT:    ldr q2, [x8, lCPI25_2@PAGEOFF]
; CHECK-NEXT:  Lloh49:
; CHECK-NEXT:    ldr q3, [x10, lCPI25_3@PAGEOFF]
; CHECK-NEXT:    mov x8, x1
; CHECK-NEXT:  LBB25_1: ; %loop
; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    ldr q4, [x0]
; CHECK-NEXT:    ldp q16, q7, [x1, #32]
; CHECK-NEXT:    ldr q18, [x8, #16]!
; CHECK-NEXT:    subs w2, w2, #1
; CHECK-NEXT:    tbl.16b v5, { v4 }, v3
; CHECK-NEXT:    tbl.16b v6, { v4 }, v0
; CHECK-NEXT:    tbl.16b v17, { v4 }, v2
; CHECK-NEXT:    tbl.16b v4, { v4 }, v1
; CHECK-NEXT:    umull2.2d v19, v5, v7
; CHECK-NEXT:    umull.2d v5, v5, v7
; CHECK-NEXT:    ldr q7, [x1]
; CHECK-NEXT:    umull2.2d v20, v6, v16
; CHECK-NEXT:    umull2.2d v21, v17, v18
; CHECK-NEXT:    umull.2d v17, v17, v18
; CHECK-NEXT:    umull2.2d v18, v4, v7
; CHECK-NEXT:    umull.2d v4, v4, v7
; CHECK-NEXT:    mov x1, x8
; CHECK-NEXT:    stp q5, q19, [x0, #96]
; CHECK-NEXT:    umull.2d v5, v6, v16
; CHECK-NEXT:    str q20, [x0, #80]
; CHECK-NEXT:    stp q4, q18, [x0]
; CHECK-NEXT:    stp q17, q21, [x0, #32]
; CHECK-NEXT:    str q5, [x0, #64]!
; CHECK-NEXT:    b.ne LBB25_1
; CHECK-NEXT:  ; %bb.2: ; %exit
; CHECK-NEXT:    mov w0, wzr
; CHECK-NEXT:    ret
; CHECK-NEXT:    .loh AdrpLdr Lloh46, Lloh48
; CHECK-NEXT:    .loh AdrpLdr Lloh44, Lloh49
; CHECK-NEXT:    .loh AdrpLdr Lloh43, Lloh47
; CHECK-NEXT:    .loh AdrpAdrp Lloh42, Lloh46
; CHECK-NEXT:    .loh AdrpLdr Lloh42, Lloh45
;
; CHECK-BE-LABEL: test_widening_instr_mull_64:
; CHECK-BE:       // %bb.0: // %entry
; CHECK-BE-NEXT:    adrp x8, .LCPI25_0
; CHECK-BE-NEXT:    add x8, x8, :lo12:.LCPI25_0
; CHECK-BE-NEXT:    ld1 { v0.16b }, [x8]
; CHECK-BE-NEXT:    adrp x8, .LCPI25_1
; CHECK-BE-NEXT:    add x8, x8, :lo12:.LCPI25_1
; CHECK-BE-NEXT:    ld1 { v1.16b }, [x8]
; CHECK-BE-NEXT:    adrp x8, .LCPI25_2
; CHECK-BE-NEXT:    add x8, x8, :lo12:.LCPI25_2
; CHECK-BE-NEXT:    ld1 { v2.16b }, [x8]
; CHECK-BE-NEXT:    adrp x8, .LCPI25_3
; CHECK-BE-NEXT:    add x8, x8, :lo12:.LCPI25_3
; CHECK-BE-NEXT:    ld1 { v3.16b }, [x8]
; CHECK-BE-NEXT:  .LBB25_1: // %loop
; CHECK-BE-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-BE-NEXT:    ld1 { v4.16b }, [x0]
; CHECK-BE-NEXT:    add x9, x1, #48
; CHECK-BE-NEXT:    add x8, x1, #32
; CHECK-BE-NEXT:    ld1 { v18.4s }, [x9]
; CHECK-BE-NEXT:    ld1 { v16.4s }, [x1]
; CHECK-BE-NEXT:    add x1, x1, #16
; CHECK-BE-NEXT:    ld1 { v20.4s }, [x8]
; CHECK-BE-NEXT:    ld1 { v22.4s }, [x1]
; CHECK-BE-NEXT:    add x8, x0, #96
; CHECK-BE-NEXT:    tbl v5.16b, { v4.16b }, v3.16b
; CHECK-BE-NEXT:    tbl v6.16b, { v4.16b }, v2.16b
; CHECK-BE-NEXT:    tbl v7.16b, { v4.16b }, v1.16b
; CHECK-BE-NEXT:    tbl v4.16b, { v4.16b }, v0.16b
; CHECK-BE-NEXT:    ext v24.16b, v18.16b, v18.16b, #8
; CHECK-BE-NEXT:    add x9, x0, #32
; CHECK-BE-NEXT:    ext v25.16b, v20.16b, v20.16b, #8
; CHECK-BE-NEXT:    add x10, x0, #16
; CHECK-BE-NEXT:    subs w2, w2, #1
; CHECK-BE-NEXT:    ext v17.16b, v5.16b, v5.16b, #8
; CHECK-BE-NEXT:    ext v19.16b, v6.16b, v6.16b, #8
; CHECK-BE-NEXT:    rev32 v5.8b, v5.8b
; CHECK-BE-NEXT:    rev32 v21.8b, v7.8b
; CHECK-BE-NEXT:    rev32 v23.8b, v4.8b
; CHECK-BE-NEXT:    ext v7.16b, v7.16b, v7.16b, #8
; CHECK-BE-NEXT:    ext v4.16b, v4.16b, v4.16b, #8
; CHECK-BE-NEXT:    rev32 v6.8b, v6.8b
; CHECK-BE-NEXT:    rev32 v17.8b, v17.8b
; CHECK-BE-NEXT:    rev32 v19.8b, v19.8b
; CHECK-BE-NEXT:    umull v5.2d, v5.2s, v18.2s
; CHECK-BE-NEXT:    umull v18.2d, v21.2s, v22.2s
; CHECK-BE-NEXT:    ext v21.16b, v22.16b, v22.16b, #8
; CHECK-BE-NEXT:    rev32 v7.8b, v7.8b
; CHECK-BE-NEXT:    umull v22.2d, v23.2s, v16.2s
; CHECK-BE-NEXT:    ext v16.16b, v16.16b, v16.16b, #8
; CHECK-BE-NEXT:    rev32 v4.8b, v4.8b
; CHECK-BE-NEXT:    umull v17.2d, v17.2s, v24.2s
; CHECK-BE-NEXT:    umull v19.2d, v19.2s, v25.2s
; CHECK-BE-NEXT:    st1 { v5.2d }, [x8]
; CHECK-BE-NEXT:    umull v5.2d, v6.2s, v20.2s
; CHECK-BE-NEXT:    umull v6.2d, v7.2s, v21.2s
; CHECK-BE-NEXT:    add x8, x0, #112
; CHECK-BE-NEXT:    umull v4.2d, v4.2s, v16.2s
; CHECK-BE-NEXT:    st1 { v18.2d }, [x9]
; CHECK-BE-NEXT:    add x9, x0, #80
; CHECK-BE-NEXT:    st1 { v22.2d }, [x0]
; CHECK-BE-NEXT:    st1 { v17.2d }, [x8]
; CHECK-BE-NEXT:    add x8, x0, #64
; CHECK-BE-NEXT:    st1 { v19.2d }, [x9]
; CHECK-BE-NEXT:    add x9, x0, #48
; CHECK-BE-NEXT:    mov x0, x8
; CHECK-BE-NEXT:    st1 { v5.2d }, [x8]
; CHECK-BE-NEXT:    st1 { v6.2d }, [x9]
; CHECK-BE-NEXT:    st1 { v4.2d }, [x10]
; CHECK-BE-NEXT:    b.ne .LBB25_1
; CHECK-BE-NEXT:  // %bb.2: // %exit
; CHECK-BE-NEXT:    mov w0, wzr
; CHECK-BE-NEXT:    ret
entry:
  br label %loop

loop:
  ; %iv counts iterations from 0 until %iv.next equals %h.
  %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
  ; The GEP stride types do not match the types accessed below: %gep.1 steps
  ; in <16 x i32> units but is loaded as <16 x i8>, and %gep.2 steps in
  ; <16 x i8> units but is loaded as <16 x i32>.
  %gep.1 = getelementptr inbounds <16 x i32>, ptr %p1, i32 %iv
  %gep.2 = getelementptr inbounds <16 x i8>, ptr %p2, i32 %iv
  %l1 = load <16 x i8>, ptr %gep.1
  %z2 = zext <16 x i8> %l1 to <16 x i64>
  %l4 = load <16 x i32>, ptr %gep.2
  %z5 = zext <16 x i32> %l4 to <16 x i64>
  ; Multiply of two zero-extended operands with different source widths.
  %mul = mul <16 x i64> %z2, %z5
  store <16 x i64> %mul, ptr %gep.1
  %iv.next= add nuw nsw i32 %iv, 1
  %exitcond.not = icmp eq i32 %iv.next, %h
  br i1 %exitcond.not, label %exit, label %loop

exit:
  ret i32 0
}

; Loop test: multiplies a plain <16 x i32> load by zext(<16 x i8>) and stores
; the product back through %gep.1. Only one multiply operand is zero-extended.
; The assertions expect the i8 operand to be extended with tbl (masks loaded
; from the constant pool before the loop) and a non-widening mul on 4s vectors;
; the big-endian assertions additionally expect rev32 after each tbl.
define i32 @test_widening_instr_mull_2(ptr %p1, ptr %p2, i32 %h) {
; CHECK-LABEL: test_widening_instr_mull_2:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:  Lloh50:
; CHECK-NEXT:    adrp x8, lCPI26_0@PAGE
; CHECK-NEXT:  Lloh51:
; CHECK-NEXT:    adrp x9, lCPI26_1@PAGE
; CHECK-NEXT:  Lloh52:
; CHECK-NEXT:    adrp x10, lCPI26_3@PAGE
; CHECK-NEXT:  Lloh53:
; CHECK-NEXT:    ldr q0, [x8, lCPI26_0@PAGEOFF]
; CHECK-NEXT:  Lloh54:
; CHECK-NEXT:    adrp x8, lCPI26_2@PAGE
; CHECK-NEXT:  Lloh55:
; CHECK-NEXT:    ldr q1, [x9, lCPI26_1@PAGEOFF]
; CHECK-NEXT:  Lloh56:
; CHECK-NEXT:    ldr q2, [x8, lCPI26_2@PAGEOFF]
; CHECK-NEXT:  Lloh57:
; CHECK-NEXT:    ldr q3, [x10, lCPI26_3@PAGEOFF]
; CHECK-NEXT:    mov x8, x0
; CHECK-NEXT:  LBB26_1: ; %loop
; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    ldr q4, [x1], #16
; CHECK-NEXT:    ldr q18, [x0]
; CHECK-NEXT:    ldp q16, q17, [x0, #32]
; CHECK-NEXT:    subs w2, w2, #1
; CHECK-NEXT:    tbl.16b v5, { v4 }, v0
; CHECK-NEXT:    tbl.16b v6, { v4 }, v1
; CHECK-NEXT:    tbl.16b v7, { v4 }, v2
; CHECK-NEXT:    tbl.16b v4, { v4 }, v3
; CHECK-NEXT:    mul.4s v5, v16, v5
; CHECK-NEXT:    ldr q16, [x8, #16]!
; CHECK-NEXT:    mul.4s v6, v17, v6
; CHECK-NEXT:    mul.4s v7, v18, v7
; CHECK-NEXT:    mul.4s v4, v16, v4
; CHECK-NEXT:    stp q5, q6, [x0, #32]
; CHECK-NEXT:    str q7, [x0]
; CHECK-NEXT:    mov x0, x8
; CHECK-NEXT:    str q4, [x8]
; CHECK-NEXT:    b.ne LBB26_1
; CHECK-NEXT:  ; %bb.2: ; %exit
; CHECK-NEXT:    mov w0, wzr
; CHECK-NEXT:    ret
; CHECK-NEXT:    .loh AdrpLdr Lloh54, Lloh56
; CHECK-NEXT:    .loh AdrpLdr Lloh52, Lloh57
; CHECK-NEXT:    .loh AdrpLdr Lloh51, Lloh55
; CHECK-NEXT:    .loh AdrpAdrp Lloh50, Lloh54
; CHECK-NEXT:    .loh AdrpLdr Lloh50, Lloh53
;
; CHECK-BE-LABEL: test_widening_instr_mull_2:
; CHECK-BE:       // %bb.0: // %entry
; CHECK-BE-NEXT:    adrp x8, .LCPI26_0
; CHECK-BE-NEXT:    add x8, x8, :lo12:.LCPI26_0
; CHECK-BE-NEXT:    ld1 { v0.16b }, [x8]
; CHECK-BE-NEXT:    adrp x8, .LCPI26_1
; CHECK-BE-NEXT:    add x8, x8, :lo12:.LCPI26_1
; CHECK-BE-NEXT:    ld1 { v1.16b }, [x8]
; CHECK-BE-NEXT:    adrp x8, .LCPI26_2
; CHECK-BE-NEXT:    add x8, x8, :lo12:.LCPI26_2
; CHECK-BE-NEXT:    ld1 { v2.16b }, [x8]
; CHECK-BE-NEXT:    adrp x8, .LCPI26_3
; CHECK-BE-NEXT:    add x8, x8, :lo12:.LCPI26_3
; CHECK-BE-NEXT:    ld1 { v3.16b }, [x8]
; CHECK-BE-NEXT:  .LBB26_1: // %loop
; CHECK-BE-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-BE-NEXT:    ld1 { v4.16b }, [x1]
; CHECK-BE-NEXT:    add x8, x0, #32
; CHECK-BE-NEXT:    ld1 { v16.4s }, [x0]
; CHECK-BE-NEXT:    add x9, x0, #48
; CHECK-BE-NEXT:    add x10, x0, #16
; CHECK-BE-NEXT:    ld1 { v17.4s }, [x8]
; CHECK-BE-NEXT:    ld1 { v18.4s }, [x9]
; CHECK-BE-NEXT:    ld1 { v19.4s }, [x10]
; CHECK-BE-NEXT:    subs w2, w2, #1
; CHECK-BE-NEXT:    tbl v5.16b, { v4.16b }, v1.16b
; CHECK-BE-NEXT:    tbl v6.16b, { v4.16b }, v3.16b
; CHECK-BE-NEXT:    tbl v7.16b, { v4.16b }, v2.16b
; CHECK-BE-NEXT:    tbl v4.16b, { v4.16b }, v0.16b
; CHECK-BE-NEXT:    add x1, x1, #16
; CHECK-BE-NEXT:    rev32 v5.16b, v5.16b
; CHECK-BE-NEXT:    rev32 v6.16b, v6.16b
; CHECK-BE-NEXT:    rev32 v7.16b, v7.16b
; CHECK-BE-NEXT:    rev32 v4.16b, v4.16b
; CHECK-BE-NEXT:    mul v5.4s, v16.4s, v5.4s
; CHECK-BE-NEXT:    mul v6.4s, v17.4s, v6.4s
; CHECK-BE-NEXT:    mul v7.4s, v18.4s, v7.4s
; CHECK-BE-NEXT:    mul v4.4s, v19.4s, v4.4s
; CHECK-BE-NEXT:    st1 { v5.4s }, [x0]
; CHECK-BE-NEXT:    mov x0, x10
; CHECK-BE-NEXT:    st1 { v6.4s }, [x8]
; CHECK-BE-NEXT:    st1 { v7.4s }, [x9]
; CHECK-BE-NEXT:    st1 { v4.4s }, [x10]
; CHECK-BE-NEXT:    b.ne .LBB26_1
; CHECK-BE-NEXT:  // %bb.2: // %exit
; CHECK-BE-NEXT:    mov w0, wzr
; CHECK-BE-NEXT:    ret
entry:
  br label %loop

loop:
  ; %iv counts iterations from 0 until %iv.next equals %h.
  %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
  ; Both GEPs step in <16 x i8> units (16 bytes per iteration), even though
  ; %gep.1 is loaded and stored as <16 x i32>.
  %gep.1 = getelementptr inbounds <16 x i8>, ptr %p1, i32 %iv
  %gep.2 = getelementptr inbounds <16 x i8>, ptr %p2, i32 %iv
  ; %l1 is used as-is; only %l4 is zero-extended before the multiply.
  %l1 = load <16 x i32>, ptr %gep.1
  %l4 = load <16 x i8>, ptr %gep.2
  %z5 = zext <16 x i8> %l4 to <16 x i32>
  %mul = mul <16 x i32> %l1, %z5
  store <16 x i32> %mul, ptr %gep.1
  %iv.next= add nuw nsw i32 %iv, 1
  %exitcond.not = icmp eq i32 %iv.next, %h
  br i1 %exitcond.not, label %exit, label %loop

exit:
  ret i32 0
}