; llvm/llvm/test/CodeGen/AArch64/rem.ll

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=aarch64-none-eabi -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD
; RUN: llc -mtriple=aarch64-none-eabi -global-isel -verify-machineinstrs %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI

; Signed remainder of two i8 scalars.
; Both runs sign-extend the operands with sxtb, divide with sdiv and form the
; remainder with msub (dividend - quotient * divisor). The SD expectations feed
; msub the extended registers; the GI expectations feed it the original w1/w0.
define i8 @si8(i8 %a, i8 %b) {
; CHECK-SD-LABEL: si8:
; CHECK-SD:       // %bb.0: // %entry
; CHECK-SD-NEXT:    sxtb w8, w1
; CHECK-SD-NEXT:    sxtb w9, w0
; CHECK-SD-NEXT:    sdiv w10, w9, w8
; CHECK-SD-NEXT:    msub w0, w10, w8, w9
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: si8:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    sxtb w8, w0
; CHECK-GI-NEXT:    sxtb w9, w1
; CHECK-GI-NEXT:    sdiv w8, w8, w9
; CHECK-GI-NEXT:    msub w0, w8, w1, w0
; CHECK-GI-NEXT:    ret
entry:
  %s = srem i8 %a, %b
  ret i8 %s
}

; Unsigned remainder of two i8 scalars.
; Both runs zero-extend the operands by masking with #0xff, divide with udiv
; and form the remainder with msub. The SD expectations feed msub the masked
; registers; the GI expectations feed it the original w1/w0.
define i8 @ui8(i8 %a, i8 %b) {
; CHECK-SD-LABEL: ui8:
; CHECK-SD:       // %bb.0: // %entry
; CHECK-SD-NEXT:    and w8, w1, #0xff
; CHECK-SD-NEXT:    and w9, w0, #0xff
; CHECK-SD-NEXT:    udiv w10, w9, w8
; CHECK-SD-NEXT:    msub w0, w10, w8, w9
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: ui8:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    and w8, w0, #0xff
; CHECK-GI-NEXT:    and w9, w1, #0xff
; CHECK-GI-NEXT:    udiv w8, w8, w9
; CHECK-GI-NEXT:    msub w0, w8, w1, w0
; CHECK-GI-NEXT:    ret
entry:
  %s = urem i8 %a, %b
  ret i8 %s
}

; Signed remainder of two i16 scalars.
; Same shape as the i8 case, but the operands are sign-extended with sxth
; before the sdiv/msub pair.
define i16 @si16(i16 %a, i16 %b) {
; CHECK-SD-LABEL: si16:
; CHECK-SD:       // %bb.0: // %entry
; CHECK-SD-NEXT:    sxth w8, w1
; CHECK-SD-NEXT:    sxth w9, w0
; CHECK-SD-NEXT:    sdiv w10, w9, w8
; CHECK-SD-NEXT:    msub w0, w10, w8, w9
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: si16:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    sxth w8, w0
; CHECK-GI-NEXT:    sxth w9, w1
; CHECK-GI-NEXT:    sdiv w8, w8, w9
; CHECK-GI-NEXT:    msub w0, w8, w1, w0
; CHECK-GI-NEXT:    ret
entry:
  %s = srem i16 %a, %b
  ret i16 %s
}

; Unsigned remainder of two i16 scalars.
; Same shape as the i8 case, but the operands are zero-extended by masking
; with #0xffff before the udiv/msub pair.
define i16 @ui16(i16 %a, i16 %b) {
; CHECK-SD-LABEL: ui16:
; CHECK-SD:       // %bb.0: // %entry
; CHECK-SD-NEXT:    and w8, w1, #0xffff
; CHECK-SD-NEXT:    and w9, w0, #0xffff
; CHECK-SD-NEXT:    udiv w10, w9, w8
; CHECK-SD-NEXT:    msub w0, w10, w8, w9
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: ui16:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    and w8, w0, #0xffff
; CHECK-GI-NEXT:    and w9, w1, #0xffff
; CHECK-GI-NEXT:    udiv w8, w8, w9
; CHECK-GI-NEXT:    msub w0, w8, w1, w0
; CHECK-GI-NEXT:    ret
entry:
  %s = urem i16 %a, %b
  ret i16 %s
}

; Signed remainder of two i32 scalars.
; No extension is needed, so both runs share one expectation: sdiv followed by
; msub, i.e. a - (a / b) * b.
define i32 @si32(i32 %a, i32 %b) {
; CHECK-LABEL: si32:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    sdiv w8, w0, w1
; CHECK-NEXT:    msub w0, w8, w1, w0
; CHECK-NEXT:    ret
entry:
  %s = srem i32 %a, %b
  ret i32 %s
}

; Unsigned remainder of two i32 scalars.
; Both runs share one expectation: udiv followed by msub, i.e. a - (a / b) * b.
define i32 @ui32(i32 %a, i32 %b) {
; CHECK-LABEL: ui32:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    udiv w8, w0, w1
; CHECK-NEXT:    msub w0, w8, w1, w0
; CHECK-NEXT:    ret
entry:
  %s = urem i32 %a, %b
  ret i32 %s
}

; Signed remainder of two i64 scalars.
; Same sdiv + msub sequence as the i32 case, using the 64-bit x registers;
; both runs share the expectation.
define i64 @si64(i64 %a, i64 %b) {
; CHECK-LABEL: si64:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    sdiv x8, x0, x1
; CHECK-NEXT:    msub x0, x8, x1, x0
; CHECK-NEXT:    ret
entry:
  %s = srem i64 %a, %b
  ret i64 %s
}

; Unsigned remainder of two i64 scalars.
; Same udiv + msub sequence as the i32 case, using the 64-bit x registers;
; both runs share the expectation.
define i64 @ui64(i64 %a, i64 %b) {
; CHECK-LABEL: ui64:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    udiv x8, x0, x1
; CHECK-NEXT:    msub x0, x8, x1, x0
; CHECK-NEXT:    ret
entry:
  %s = urem i64 %a, %b
  ret i64 %s
}

; Signed remainder of two i128 scalars.
; There is no inline expansion: both runs expect a call to __modti3, with the
; link register x30 spilled before the call and reloaded after it.
define i128 @si128(i128 %a, i128 %b) {
; CHECK-LABEL: si128:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    .cfi_offset w30, -16
; CHECK-NEXT:    bl __modti3
; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    ret
entry:
  %s = srem i128 %a, %b
  ret i128 %s
}

; Unsigned remainder of two i128 scalars.
; There is no inline expansion: both runs expect a call to __umodti3, with the
; link register x30 spilled before the call and reloaded after it.
define i128 @ui128(i128 %a, i128 %b) {
; CHECK-LABEL: ui128:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    .cfi_offset w30, -16
; CHECK-NEXT:    bl __umodti3
; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    ret
entry:
  %s = urem i128 %a, %b
  ret i128 %s
}

; Signed remainder of <2 x i8> vectors.
; Both runs sign-extend the lanes in place (shl/sshr by #24 on .2s), move the
; lanes to general registers and issue one scalar sdiv per lane.
; SD expectations: each remainder is formed with a scalar msub and inserted
;   back into v0.
; GI expectations: the quotients are gathered into v2 and a single vector mls
;   produces both remainders.
define <2 x i8> @sv2i8(<2 x i8> %d, <2 x i8> %e) {
; CHECK-SD-LABEL: sv2i8:
; CHECK-SD:       // %bb.0: // %entry
; CHECK-SD-NEXT:    shl v0.2s, v0.2s, #24
; CHECK-SD-NEXT:    shl v1.2s, v1.2s, #24
; CHECK-SD-NEXT:    sshr v0.2s, v0.2s, #24
; CHECK-SD-NEXT:    sshr v1.2s, v1.2s, #24
; CHECK-SD-NEXT:    fmov w8, s1
; CHECK-SD-NEXT:    fmov w9, s0
; CHECK-SD-NEXT:    mov w11, v1.s[1]
; CHECK-SD-NEXT:    mov w12, v0.s[1]
; CHECK-SD-NEXT:    sdiv w10, w9, w8
; CHECK-SD-NEXT:    sdiv w13, w12, w11
; CHECK-SD-NEXT:    msub w8, w10, w8, w9
; CHECK-SD-NEXT:    fmov s0, w8
; CHECK-SD-NEXT:    msub w9, w13, w11, w12
; CHECK-SD-NEXT:    mov v0.s[1], w9
; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 killed $q0
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: sv2i8:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    shl v0.2s, v0.2s, #24
; CHECK-GI-NEXT:    shl v1.2s, v1.2s, #24
; CHECK-GI-NEXT:    sshr v0.2s, v0.2s, #24
; CHECK-GI-NEXT:    sshr v1.2s, v1.2s, #24
; CHECK-GI-NEXT:    fmov w8, s0
; CHECK-GI-NEXT:    fmov w9, s1
; CHECK-GI-NEXT:    mov w10, v1.s[1]
; CHECK-GI-NEXT:    sdiv w8, w8, w9
; CHECK-GI-NEXT:    mov w9, v0.s[1]
; CHECK-GI-NEXT:    sdiv w9, w9, w10
; CHECK-GI-NEXT:    mov v2.s[0], w8
; CHECK-GI-NEXT:    mov v2.s[1], w9
; CHECK-GI-NEXT:    mls v0.2s, v2.2s, v1.2s
; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
; CHECK-GI-NEXT:    ret
entry:
  %s = srem <2 x i8> %d, %e
  ret <2 x i8> %s
}

; Signed remainder of <3 x i8> vectors.
; The expected code is fully scalar: the six elements are read from w0-w5,
; sign-extended with sxtb, and three sdiv/msub pairs write the results to
; w0-w2. The two runs differ only in the order the operands are extended.
define <3 x i8> @sv3i8(<3 x i8> %d, <3 x i8> %e) {
; CHECK-SD-LABEL: sv3i8:
; CHECK-SD:       // %bb.0: // %entry
; CHECK-SD-NEXT:    sxtb w8, w3
; CHECK-SD-NEXT:    sxtb w9, w0
; CHECK-SD-NEXT:    sxtb w11, w4
; CHECK-SD-NEXT:    sxtb w12, w1
; CHECK-SD-NEXT:    sxtb w14, w5
; CHECK-SD-NEXT:    sxtb w15, w2
; CHECK-SD-NEXT:    sdiv w10, w9, w8
; CHECK-SD-NEXT:    sdiv w13, w12, w11
; CHECK-SD-NEXT:    msub w0, w10, w8, w9
; CHECK-SD-NEXT:    sdiv w16, w15, w14
; CHECK-SD-NEXT:    msub w1, w13, w11, w12
; CHECK-SD-NEXT:    msub w2, w16, w14, w15
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: sv3i8:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    sxtb w8, w0
; CHECK-GI-NEXT:    sxtb w9, w3
; CHECK-GI-NEXT:    sxtb w11, w1
; CHECK-GI-NEXT:    sxtb w12, w4
; CHECK-GI-NEXT:    sxtb w14, w2
; CHECK-GI-NEXT:    sxtb w15, w5
; CHECK-GI-NEXT:    sdiv w10, w8, w9
; CHECK-GI-NEXT:    sdiv w13, w11, w12
; CHECK-GI-NEXT:    msub w0, w10, w9, w8
; CHECK-GI-NEXT:    sdiv w16, w14, w15
; CHECK-GI-NEXT:    msub w1, w13, w12, w11
; CHECK-GI-NEXT:    msub w2, w16, w15, w14
; CHECK-GI-NEXT:    ret
entry:
  %s = srem <3 x i8> %d, %e
  ret <3 x i8> %s
}

; Signed remainder of <4 x i8> vectors.
; SD expectations: lanes are sign-extended in place (shl/sshr by #8 on .4h),
;   extracted with smov, and four scalar sdiv/msub pairs are inserted back into
;   the .h lanes of v0.
; GI expectations: lanes are widened to .4s (ushll, then shl/sshr by #24), four
;   scalar sdiv results are gathered into v2, one vector mls forms the
;   remainders and xtn narrows the result back to .4h.
define <4 x i8> @sv4i8(<4 x i8> %d, <4 x i8> %e) {
; CHECK-SD-LABEL: sv4i8:
; CHECK-SD:       // %bb.0: // %entry
; CHECK-SD-NEXT:    shl v0.4h, v0.4h, #8
; CHECK-SD-NEXT:    shl v1.4h, v1.4h, #8
; CHECK-SD-NEXT:    sshr v0.4h, v0.4h, #8
; CHECK-SD-NEXT:    sshr v1.4h, v1.4h, #8
; CHECK-SD-NEXT:    smov w11, v1.h[0]
; CHECK-SD-NEXT:    smov w12, v0.h[0]
; CHECK-SD-NEXT:    smov w8, v1.h[1]
; CHECK-SD-NEXT:    smov w9, v0.h[1]
; CHECK-SD-NEXT:    smov w14, v1.h[2]
; CHECK-SD-NEXT:    smov w15, v0.h[2]
; CHECK-SD-NEXT:    smov w17, v1.h[3]
; CHECK-SD-NEXT:    smov w18, v0.h[3]
; CHECK-SD-NEXT:    sdiv w13, w12, w11
; CHECK-SD-NEXT:    sdiv w10, w9, w8
; CHECK-SD-NEXT:    msub w11, w13, w11, w12
; CHECK-SD-NEXT:    fmov s0, w11
; CHECK-SD-NEXT:    sdiv w16, w15, w14
; CHECK-SD-NEXT:    msub w8, w10, w8, w9
; CHECK-SD-NEXT:    mov v0.h[1], w8
; CHECK-SD-NEXT:    sdiv w9, w18, w17
; CHECK-SD-NEXT:    msub w8, w16, w14, w15
; CHECK-SD-NEXT:    mov v0.h[2], w8
; CHECK-SD-NEXT:    msub w8, w9, w17, w18
; CHECK-SD-NEXT:    mov v0.h[3], w8
; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 killed $q0
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: sv4i8:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
; CHECK-GI-NEXT:    ushll v1.4s, v1.4h, #0
; CHECK-GI-NEXT:    shl v0.4s, v0.4s, #24
; CHECK-GI-NEXT:    shl v1.4s, v1.4s, #24
; CHECK-GI-NEXT:    sshr v0.4s, v0.4s, #24
; CHECK-GI-NEXT:    sshr v1.4s, v1.4s, #24
; CHECK-GI-NEXT:    fmov w8, s0
; CHECK-GI-NEXT:    fmov w9, s1
; CHECK-GI-NEXT:    mov w10, v1.s[1]
; CHECK-GI-NEXT:    mov w11, v1.s[2]
; CHECK-GI-NEXT:    mov w12, v1.s[3]
; CHECK-GI-NEXT:    sdiv w8, w8, w9
; CHECK-GI-NEXT:    mov w9, v0.s[1]
; CHECK-GI-NEXT:    sdiv w9, w9, w10
; CHECK-GI-NEXT:    mov w10, v0.s[2]
; CHECK-GI-NEXT:    mov v2.s[0], w8
; CHECK-GI-NEXT:    sdiv w10, w10, w11
; CHECK-GI-NEXT:    mov w11, v0.s[3]
; CHECK-GI-NEXT:    mov v2.s[1], w9
; CHECK-GI-NEXT:    sdiv w8, w11, w12
; CHECK-GI-NEXT:    mov v2.s[2], w10
; CHECK-GI-NEXT:    mov v2.s[3], w8
; CHECK-GI-NEXT:    mls v0.4s, v2.4s, v1.4s
; CHECK-GI-NEXT:    xtn v0.4h, v0.4s
; CHECK-GI-NEXT:    ret
entry:
  %s = srem <4 x i8> %d, %e
  ret <4 x i8> %s
}

; Signed remainder of <8 x i8> vectors.
; SD expectations: every byte lane is extracted with smov, eight scalar
;   sdiv/msub pairs are inserted into the .b lanes of v2, and v2 is copied to
;   the return register with fmov d0, d2.
; GI expectations: the inputs are widened with sshll/sshll2 into two .4s halves
;   per operand, eight scalar sdiv results are gathered into v4 and v5, two
;   vector mls instructions form the remainders, and uzp1 + xtn narrow the
;   result back to .8b.
define <8 x i8> @sv8i8(<8 x i8> %d, <8 x i8> %e) {
; CHECK-SD-LABEL: sv8i8:
; CHECK-SD:       // %bb.0: // %entry
; CHECK-SD-NEXT:    // kill: def $d1 killed $d1 def $q1
; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
; CHECK-SD-NEXT:    smov w11, v1.b[0]
; CHECK-SD-NEXT:    smov w12, v0.b[0]
; CHECK-SD-NEXT:    smov w8, v1.b[1]
; CHECK-SD-NEXT:    smov w9, v0.b[1]
; CHECK-SD-NEXT:    smov w14, v1.b[2]
; CHECK-SD-NEXT:    smov w15, v0.b[2]
; CHECK-SD-NEXT:    smov w17, v1.b[3]
; CHECK-SD-NEXT:    smov w18, v0.b[3]
; CHECK-SD-NEXT:    smov w1, v1.b[4]
; CHECK-SD-NEXT:    smov w2, v0.b[4]
; CHECK-SD-NEXT:    smov w4, v1.b[5]
; CHECK-SD-NEXT:    smov w5, v0.b[5]
; CHECK-SD-NEXT:    sdiv w13, w12, w11
; CHECK-SD-NEXT:    sdiv w10, w9, w8
; CHECK-SD-NEXT:    msub w11, w13, w11, w12
; CHECK-SD-NEXT:    smov w13, v1.b[7]
; CHECK-SD-NEXT:    fmov s2, w11
; CHECK-SD-NEXT:    smov w11, v0.b[6]
; CHECK-SD-NEXT:    sdiv w16, w15, w14
; CHECK-SD-NEXT:    msub w8, w10, w8, w9
; CHECK-SD-NEXT:    smov w10, v1.b[6]
; CHECK-SD-NEXT:    mov v2.b[1], w8
; CHECK-SD-NEXT:    sdiv w0, w18, w17
; CHECK-SD-NEXT:    msub w8, w16, w14, w15
; CHECK-SD-NEXT:    smov w14, v0.b[7]
; CHECK-SD-NEXT:    mov v2.b[2], w8
; CHECK-SD-NEXT:    sdiv w3, w2, w1
; CHECK-SD-NEXT:    msub w8, w0, w17, w18
; CHECK-SD-NEXT:    mov v2.b[3], w8
; CHECK-SD-NEXT:    sdiv w9, w5, w4
; CHECK-SD-NEXT:    msub w8, w3, w1, w2
; CHECK-SD-NEXT:    mov v2.b[4], w8
; CHECK-SD-NEXT:    sdiv w12, w11, w10
; CHECK-SD-NEXT:    msub w8, w9, w4, w5
; CHECK-SD-NEXT:    mov v2.b[5], w8
; CHECK-SD-NEXT:    sdiv w9, w14, w13
; CHECK-SD-NEXT:    msub w8, w12, w10, w11
; CHECK-SD-NEXT:    mov v2.b[6], w8
; CHECK-SD-NEXT:    msub w8, w9, w13, w14
; CHECK-SD-NEXT:    mov v2.b[7], w8
; CHECK-SD-NEXT:    fmov d0, d2
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: sv8i8:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    sshll v0.8h, v0.8b, #0
; CHECK-GI-NEXT:    sshll v1.8h, v1.8b, #0
; CHECK-GI-NEXT:    sshll v2.4s, v0.4h, #0
; CHECK-GI-NEXT:    sshll v3.4s, v1.4h, #0
; CHECK-GI-NEXT:    sshll2 v0.4s, v0.8h, #0
; CHECK-GI-NEXT:    sshll2 v1.4s, v1.8h, #0
; CHECK-GI-NEXT:    fmov w8, s2
; CHECK-GI-NEXT:    fmov w9, s3
; CHECK-GI-NEXT:    mov w10, v3.s[1]
; CHECK-GI-NEXT:    mov w11, v3.s[2]
; CHECK-GI-NEXT:    mov w12, v3.s[3]
; CHECK-GI-NEXT:    fmov w13, s1
; CHECK-GI-NEXT:    mov w14, v1.s[1]
; CHECK-GI-NEXT:    mov w15, v1.s[2]
; CHECK-GI-NEXT:    sdiv w8, w8, w9
; CHECK-GI-NEXT:    mov w9, v2.s[1]
; CHECK-GI-NEXT:    sdiv w9, w9, w10
; CHECK-GI-NEXT:    mov w10, v2.s[2]
; CHECK-GI-NEXT:    mov v4.s[0], w8
; CHECK-GI-NEXT:    mov w8, v0.s[3]
; CHECK-GI-NEXT:    sdiv w10, w10, w11
; CHECK-GI-NEXT:    mov w11, v2.s[3]
; CHECK-GI-NEXT:    mov v4.s[1], w9
; CHECK-GI-NEXT:    sdiv w11, w11, w12
; CHECK-GI-NEXT:    fmov w12, s0
; CHECK-GI-NEXT:    mov v4.s[2], w10
; CHECK-GI-NEXT:    sdiv w12, w12, w13
; CHECK-GI-NEXT:    mov w13, v0.s[1]
; CHECK-GI-NEXT:    mov v4.s[3], w11
; CHECK-GI-NEXT:    mls v2.4s, v4.4s, v3.4s
; CHECK-GI-NEXT:    sdiv w13, w13, w14
; CHECK-GI-NEXT:    mov w14, v0.s[2]
; CHECK-GI-NEXT:    mov v5.s[0], w12
; CHECK-GI-NEXT:    mov w12, v1.s[3]
; CHECK-GI-NEXT:    sdiv w14, w14, w15
; CHECK-GI-NEXT:    mov v5.s[1], w13
; CHECK-GI-NEXT:    sdiv w8, w8, w12
; CHECK-GI-NEXT:    mov v5.s[2], w14
; CHECK-GI-NEXT:    mov v5.s[3], w8
; CHECK-GI-NEXT:    mls v0.4s, v5.4s, v1.4s
; CHECK-GI-NEXT:    uzp1 v0.8h, v2.8h, v0.8h
; CHECK-GI-NEXT:    xtn v0.8b, v0.8h
; CHECK-GI-NEXT:    ret
entry:
  %s = srem <8 x i8> %d, %e
  ret <8 x i8> %s
}

; Signed remainder of <16 x i8> vectors.
; SD expectations: x19-x28 are spilled to an 80-byte frame and reloaded before
;   returning; every byte lane is extracted with smov, sixteen scalar
;   sdiv/msub pairs are inserted into the .b lanes of v2, and v2 is copied to
;   v0.
; GI expectations: no stack frame. The inputs are widened with sshll/sshll2
;   into four .4s vectors per operand, sixteen scalar sdiv results are gathered
;   into v16-v19, four vector mls instructions form the remainders, and three
;   uzp1 instructions narrow the result back to .16b.
define <16 x i8> @sv16i8(<16 x i8> %d, <16 x i8> %e) {
; CHECK-SD-LABEL: sv16i8:
; CHECK-SD:       // %bb.0: // %entry
; CHECK-SD-NEXT:    stp x28, x27, [sp, #-80]! // 16-byte Folded Spill
; CHECK-SD-NEXT:    stp x26, x25, [sp, #16] // 16-byte Folded Spill
; CHECK-SD-NEXT:    stp x24, x23, [sp, #32] // 16-byte Folded Spill
; CHECK-SD-NEXT:    stp x22, x21, [sp, #48] // 16-byte Folded Spill
; CHECK-SD-NEXT:    stp x20, x19, [sp, #64] // 16-byte Folded Spill
; CHECK-SD-NEXT:    .cfi_def_cfa_offset 80
; CHECK-SD-NEXT:    .cfi_offset w19, -8
; CHECK-SD-NEXT:    .cfi_offset w20, -16
; CHECK-SD-NEXT:    .cfi_offset w21, -24
; CHECK-SD-NEXT:    .cfi_offset w22, -32
; CHECK-SD-NEXT:    .cfi_offset w23, -40
; CHECK-SD-NEXT:    .cfi_offset w24, -48
; CHECK-SD-NEXT:    .cfi_offset w25, -56
; CHECK-SD-NEXT:    .cfi_offset w26, -64
; CHECK-SD-NEXT:    .cfi_offset w27, -72
; CHECK-SD-NEXT:    .cfi_offset w28, -80
; CHECK-SD-NEXT:    smov w11, v1.b[0]
; CHECK-SD-NEXT:    smov w12, v0.b[0]
; CHECK-SD-NEXT:    smov w8, v1.b[1]
; CHECK-SD-NEXT:    smov w9, v0.b[1]
; CHECK-SD-NEXT:    smov w14, v1.b[2]
; CHECK-SD-NEXT:    smov w15, v0.b[2]
; CHECK-SD-NEXT:    smov w17, v1.b[3]
; CHECK-SD-NEXT:    smov w18, v0.b[3]
; CHECK-SD-NEXT:    smov w1, v1.b[4]
; CHECK-SD-NEXT:    smov w2, v0.b[4]
; CHECK-SD-NEXT:    smov w4, v1.b[5]
; CHECK-SD-NEXT:    smov w5, v0.b[5]
; CHECK-SD-NEXT:    sdiv w13, w12, w11
; CHECK-SD-NEXT:    smov w7, v1.b[6]
; CHECK-SD-NEXT:    smov w19, v0.b[6]
; CHECK-SD-NEXT:    smov w21, v1.b[7]
; CHECK-SD-NEXT:    smov w22, v0.b[7]
; CHECK-SD-NEXT:    smov w24, v1.b[8]
; CHECK-SD-NEXT:    smov w25, v0.b[8]
; CHECK-SD-NEXT:    smov w27, v1.b[9]
; CHECK-SD-NEXT:    smov w28, v0.b[9]
; CHECK-SD-NEXT:    sdiv w10, w9, w8
; CHECK-SD-NEXT:    msub w11, w13, w11, w12
; CHECK-SD-NEXT:    smov w13, v1.b[11]
; CHECK-SD-NEXT:    fmov s2, w11
; CHECK-SD-NEXT:    smov w11, v0.b[10]
; CHECK-SD-NEXT:    sdiv w16, w15, w14
; CHECK-SD-NEXT:    msub w8, w10, w8, w9
; CHECK-SD-NEXT:    smov w10, v1.b[10]
; CHECK-SD-NEXT:    mov v2.b[1], w8
; CHECK-SD-NEXT:    sdiv w0, w18, w17
; CHECK-SD-NEXT:    msub w8, w16, w14, w15
; CHECK-SD-NEXT:    smov w14, v0.b[11]
; CHECK-SD-NEXT:    smov w16, v1.b[12]
; CHECK-SD-NEXT:    mov v2.b[2], w8
; CHECK-SD-NEXT:    sdiv w3, w2, w1
; CHECK-SD-NEXT:    msub w8, w0, w17, w18
; CHECK-SD-NEXT:    smov w17, v0.b[12]
; CHECK-SD-NEXT:    smov w0, v1.b[13]
; CHECK-SD-NEXT:    mov v2.b[3], w8
; CHECK-SD-NEXT:    sdiv w6, w5, w4
; CHECK-SD-NEXT:    msub w8, w3, w1, w2
; CHECK-SD-NEXT:    smov w1, v0.b[13]
; CHECK-SD-NEXT:    mov v2.b[4], w8
; CHECK-SD-NEXT:    sdiv w20, w19, w7
; CHECK-SD-NEXT:    msub w8, w6, w4, w5
; CHECK-SD-NEXT:    mov v2.b[5], w8
; CHECK-SD-NEXT:    sdiv w23, w22, w21
; CHECK-SD-NEXT:    msub w8, w20, w7, w19
; CHECK-SD-NEXT:    ldp x20, x19, [sp, #64] // 16-byte Folded Reload
; CHECK-SD-NEXT:    mov v2.b[6], w8
; CHECK-SD-NEXT:    sdiv w26, w25, w24
; CHECK-SD-NEXT:    msub w8, w23, w21, w22
; CHECK-SD-NEXT:    ldp x22, x21, [sp, #48] // 16-byte Folded Reload
; CHECK-SD-NEXT:    mov v2.b[7], w8
; CHECK-SD-NEXT:    sdiv w9, w28, w27
; CHECK-SD-NEXT:    msub w8, w26, w24, w25
; CHECK-SD-NEXT:    ldp x24, x23, [sp, #32] // 16-byte Folded Reload
; CHECK-SD-NEXT:    ldp x26, x25, [sp, #16] // 16-byte Folded Reload
; CHECK-SD-NEXT:    mov v2.b[8], w8
; CHECK-SD-NEXT:    sdiv w12, w11, w10
; CHECK-SD-NEXT:    msub w8, w9, w27, w28
; CHECK-SD-NEXT:    mov v2.b[9], w8
; CHECK-SD-NEXT:    sdiv w15, w14, w13
; CHECK-SD-NEXT:    msub w8, w12, w10, w11
; CHECK-SD-NEXT:    smov w10, v1.b[14]
; CHECK-SD-NEXT:    smov w11, v0.b[14]
; CHECK-SD-NEXT:    mov v2.b[10], w8
; CHECK-SD-NEXT:    sdiv w18, w17, w16
; CHECK-SD-NEXT:    msub w8, w15, w13, w14
; CHECK-SD-NEXT:    smov w13, v1.b[15]
; CHECK-SD-NEXT:    smov w14, v0.b[15]
; CHECK-SD-NEXT:    mov v2.b[11], w8
; CHECK-SD-NEXT:    sdiv w9, w1, w0
; CHECK-SD-NEXT:    msub w8, w18, w16, w17
; CHECK-SD-NEXT:    mov v2.b[12], w8
; CHECK-SD-NEXT:    sdiv w12, w11, w10
; CHECK-SD-NEXT:    msub w8, w9, w0, w1
; CHECK-SD-NEXT:    mov v2.b[13], w8
; CHECK-SD-NEXT:    sdiv w9, w14, w13
; CHECK-SD-NEXT:    msub w8, w12, w10, w11
; CHECK-SD-NEXT:    mov v2.b[14], w8
; CHECK-SD-NEXT:    msub w8, w9, w13, w14
; CHECK-SD-NEXT:    mov v2.b[15], w8
; CHECK-SD-NEXT:    mov v0.16b, v2.16b
; CHECK-SD-NEXT:    ldp x28, x27, [sp], #80 // 16-byte Folded Reload
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: sv16i8:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    sshll v4.8h, v0.8b, #0
; CHECK-GI-NEXT:    sshll v5.8h, v1.8b, #0
; CHECK-GI-NEXT:    sshll2 v6.8h, v0.16b, #0
; CHECK-GI-NEXT:    sshll2 v7.8h, v1.16b, #0
; CHECK-GI-NEXT:    sshll v2.4s, v4.4h, #0
; CHECK-GI-NEXT:    sshll v3.4s, v5.4h, #0
; CHECK-GI-NEXT:    sshll2 v4.4s, v4.8h, #0
; CHECK-GI-NEXT:    sshll2 v5.4s, v5.8h, #0
; CHECK-GI-NEXT:    sshll v0.4s, v6.4h, #0
; CHECK-GI-NEXT:    sshll v1.4s, v7.4h, #0
; CHECK-GI-NEXT:    sshll2 v6.4s, v6.8h, #0
; CHECK-GI-NEXT:    sshll2 v7.4s, v7.8h, #0
; CHECK-GI-NEXT:    fmov w8, s2
; CHECK-GI-NEXT:    fmov w9, s3
; CHECK-GI-NEXT:    mov w12, v3.s[3]
; CHECK-GI-NEXT:    fmov w13, s5
; CHECK-GI-NEXT:    mov w16, v5.s[3]
; CHECK-GI-NEXT:    fmov w17, s1
; CHECK-GI-NEXT:    mov w18, v1.s[1]
; CHECK-GI-NEXT:    mov w0, v1.s[2]
; CHECK-GI-NEXT:    mov w1, v1.s[3]
; CHECK-GI-NEXT:    sdiv w11, w8, w9
; CHECK-GI-NEXT:    mov w8, v2.s[1]
; CHECK-GI-NEXT:    mov w9, v3.s[1]
; CHECK-GI-NEXT:    fmov w2, s7
; CHECK-GI-NEXT:    mov w3, v7.s[1]
; CHECK-GI-NEXT:    mov w4, v7.s[2]
; CHECK-GI-NEXT:    sdiv w10, w8, w9
; CHECK-GI-NEXT:    mov w8, v2.s[2]
; CHECK-GI-NEXT:    mov w9, v3.s[2]
; CHECK-GI-NEXT:    mov v16.s[0], w11
; CHECK-GI-NEXT:    mov w11, v6.s[3]
; CHECK-GI-NEXT:    sdiv w9, w8, w9
; CHECK-GI-NEXT:    mov w8, v2.s[3]
; CHECK-GI-NEXT:    mov v16.s[1], w10
; CHECK-GI-NEXT:    sdiv w8, w8, w12
; CHECK-GI-NEXT:    fmov w12, s4
; CHECK-GI-NEXT:    mov v16.s[2], w9
; CHECK-GI-NEXT:    sdiv w14, w12, w13
; CHECK-GI-NEXT:    mov w12, v4.s[1]
; CHECK-GI-NEXT:    mov w13, v5.s[1]
; CHECK-GI-NEXT:    mov v16.s[3], w8
; CHECK-GI-NEXT:    mls v2.4s, v16.4s, v3.4s
; CHECK-GI-NEXT:    sdiv w15, w12, w13
; CHECK-GI-NEXT:    mov w12, v4.s[2]
; CHECK-GI-NEXT:    mov w13, v5.s[2]
; CHECK-GI-NEXT:    mov v17.s[0], w14
; CHECK-GI-NEXT:    mov w14, v7.s[3]
; CHECK-GI-NEXT:    sdiv w13, w12, w13
; CHECK-GI-NEXT:    mov w12, v4.s[3]
; CHECK-GI-NEXT:    mov v17.s[1], w15
; CHECK-GI-NEXT:    sdiv w12, w12, w16
; CHECK-GI-NEXT:    fmov w16, s0
; CHECK-GI-NEXT:    mov v17.s[2], w13
; CHECK-GI-NEXT:    sdiv w16, w16, w17
; CHECK-GI-NEXT:    mov w17, v0.s[1]
; CHECK-GI-NEXT:    mov v17.s[3], w12
; CHECK-GI-NEXT:    mls v4.4s, v17.4s, v5.4s
; CHECK-GI-NEXT:    sdiv w17, w17, w18
; CHECK-GI-NEXT:    mov w18, v0.s[2]
; CHECK-GI-NEXT:    mov v18.s[0], w16
; CHECK-GI-NEXT:    sdiv w18, w18, w0
; CHECK-GI-NEXT:    mov w0, v0.s[3]
; CHECK-GI-NEXT:    mov v18.s[1], w17
; CHECK-GI-NEXT:    sdiv w0, w0, w1
; CHECK-GI-NEXT:    fmov w1, s6
; CHECK-GI-NEXT:    mov v18.s[2], w18
; CHECK-GI-NEXT:    sdiv w1, w1, w2
; CHECK-GI-NEXT:    mov w2, v6.s[1]
; CHECK-GI-NEXT:    mov v18.s[3], w0
; CHECK-GI-NEXT:    mls v0.4s, v18.4s, v1.4s
; CHECK-GI-NEXT:    uzp1 v1.8h, v2.8h, v4.8h
; CHECK-GI-NEXT:    sdiv w2, w2, w3
; CHECK-GI-NEXT:    mov w3, v6.s[2]
; CHECK-GI-NEXT:    mov v19.s[0], w1
; CHECK-GI-NEXT:    sdiv w3, w3, w4
; CHECK-GI-NEXT:    mov v19.s[1], w2
; CHECK-GI-NEXT:    sdiv w10, w11, w14
; CHECK-GI-NEXT:    mov v19.s[2], w3
; CHECK-GI-NEXT:    mov v19.s[3], w10
; CHECK-GI-NEXT:    mls v6.4s, v19.4s, v7.4s
; CHECK-GI-NEXT:    uzp1 v0.8h, v0.8h, v6.8h
; CHECK-GI-NEXT:    uzp1 v0.16b, v1.16b, v0.16b
; CHECK-GI-NEXT:    ret
entry:
  %s = srem <16 x i8> %d, %e
  ret <16 x i8> %s
}

define <32 x i8> @sv32i8(<32 x i8> %d, <32 x i8> %e) {
; CHECK-SD-LABEL: sv32i8:
; CHECK-SD:       // %bb.0: // %entry
; CHECK-SD-NEXT:    sub sp, sp, #304
; CHECK-SD-NEXT:    stp x29, x30, [sp, #208] // 16-byte Folded Spill
; CHECK-SD-NEXT:    stp x28, x27, [sp, #224] // 16-byte Folded Spill
; CHECK-SD-NEXT:    stp x26, x25, [sp, #240] // 16-byte Folded Spill
; CHECK-SD-NEXT:    stp x24, x23, [sp, #256] // 16-byte Folded Spill
; CHECK-SD-NEXT:    stp x22, x21, [sp, #272] // 16-byte Folded Spill
; CHECK-SD-NEXT:    stp x20, x19, [sp, #288] // 16-byte Folded Spill
; CHECK-SD-NEXT:    .cfi_def_cfa_offset 304
; CHECK-SD-NEXT:    .cfi_offset w19, -8
; CHECK-SD-NEXT:    .cfi_offset w20, -16
; CHECK-SD-NEXT:    .cfi_offset w21, -24
; CHECK-SD-NEXT:    .cfi_offset w22, -32
; CHECK-SD-NEXT:    .cfi_offset w23, -40
; CHECK-SD-NEXT:    .cfi_offset w24, -48
; CHECK-SD-NEXT:    .cfi_offset w25, -56
; CHECK-SD-NEXT:    .cfi_offset w26, -64
; CHECK-SD-NEXT:    .cfi_offset w27, -72
; CHECK-SD-NEXT:    .cfi_offset w28, -80
; CHECK-SD-NEXT:    .cfi_offset w30, -88
; CHECK-SD-NEXT:    .cfi_offset w29, -96
; CHECK-SD-NEXT:    smov w8, v2.b[1]
; CHECK-SD-NEXT:    smov w9, v0.b[1]
; CHECK-SD-NEXT:    smov w19, v3.b[7]
; CHECK-SD-NEXT:    smov w7, v1.b[7]
; CHECK-SD-NEXT:    smov w6, v3.b[8]
; CHECK-SD-NEXT:    smov w3, v1.b[8]
; CHECK-SD-NEXT:    smov w13, v3.b[0]
; CHECK-SD-NEXT:    smov w5, v3.b[1]
; CHECK-SD-NEXT:    smov w0, v1.b[1]
; CHECK-SD-NEXT:    smov w12, v3.b[2]
; CHECK-SD-NEXT:    smov w17, v3.b[3]
; CHECK-SD-NEXT:    smov w16, v1.b[3]
; CHECK-SD-NEXT:    str w8, [sp, #80] // 4-byte Folded Spill
; CHECK-SD-NEXT:    sdiv w10, w9, w8
; CHECK-SD-NEXT:    smov w8, v2.b[0]
; CHECK-SD-NEXT:    str w9, [sp, #88] // 4-byte Folded Spill
; CHECK-SD-NEXT:    smov w9, v0.b[0]
; CHECK-SD-NEXT:    ldr w30, [sp, #80] // 4-byte Folded Reload
; CHECK-SD-NEXT:    smov w15, v3.b[4]
; CHECK-SD-NEXT:    smov w14, v1.b[4]
; CHECK-SD-NEXT:    smov w4, v3.b[5]
; CHECK-SD-NEXT:    smov w1, v1.b[5]
; CHECK-SD-NEXT:    smov w2, v3.b[6]
; CHECK-SD-NEXT:    smov w18, v1.b[6]
; CHECK-SD-NEXT:    str w8, [sp, #32] // 4-byte Folded Spill
; CHECK-SD-NEXT:    smov w21, v3.b[9]
; CHECK-SD-NEXT:    smov w20, v1.b[9]
; CHECK-SD-NEXT:    str w9, [sp, #40] // 4-byte Folded Spill
; CHECK-SD-NEXT:    ldr w29, [sp, #32] // 4-byte Folded Reload
; CHECK-SD-NEXT:    sdiv w11, w9, w8
; CHECK-SD-NEXT:    smov w8, v2.b[2]
; CHECK-SD-NEXT:    smov w9, v0.b[2]
; CHECK-SD-NEXT:    str w10, [sp, #96] // 4-byte Folded Spill
; CHECK-SD-NEXT:    stp w8, w9, [sp, #16] // 8-byte Folded Spill
; CHECK-SD-NEXT:    sdiv w10, w9, w8
; CHECK-SD-NEXT:    smov w8, v2.b[3]
; CHECK-SD-NEXT:    smov w9, v0.b[3]
; CHECK-SD-NEXT:    stp w11, w8, [sp, #48] // 8-byte Folded Spill
; CHECK-SD-NEXT:    str w10, [sp, #24] // 4-byte Folded Spill
; CHECK-SD-NEXT:    sdiv w10, w9, w8
; CHECK-SD-NEXT:    smov w8, v2.b[4]
; CHECK-SD-NEXT:    str w8, [sp, #28] // 4-byte Folded Spill
; CHECK-SD-NEXT:    stp w9, w10, [sp, #56] // 8-byte Folded Spill
; CHECK-SD-NEXT:    smov w9, v0.b[4]
; CHECK-SD-NEXT:    sdiv w27, w0, w5
; CHECK-SD-NEXT:    str w9, [sp, #36] // 4-byte Folded Spill
; CHECK-SD-NEXT:    sdiv w10, w9, w8
; CHECK-SD-NEXT:    smov w8, v2.b[5]
; CHECK-SD-NEXT:    smov w9, v0.b[5]
; CHECK-SD-NEXT:    str w8, [sp, #76] // 4-byte Folded Spill
; CHECK-SD-NEXT:    str w9, [sp, #84] // 4-byte Folded Spill
; CHECK-SD-NEXT:    str w10, [sp, #44] // 4-byte Folded Spill
; CHECK-SD-NEXT:    sdiv w10, w9, w8
; CHECK-SD-NEXT:    smov w8, v2.b[6]
; CHECK-SD-NEXT:    smov w9, v0.b[6]
; CHECK-SD-NEXT:    stp w8, w9, [sp, #64] // 8-byte Folded Spill
; CHECK-SD-NEXT:    str w10, [sp, #92] // 4-byte Folded Spill
; CHECK-SD-NEXT:    sdiv w10, w9, w8
; CHECK-SD-NEXT:    smov w8, v2.b[7]
; CHECK-SD-NEXT:    smov w9, v0.b[7]
; CHECK-SD-NEXT:    stp w8, w9, [sp, #112] // 8-byte Folded Spill
; CHECK-SD-NEXT:    sdiv w11, w9, w8
; CHECK-SD-NEXT:    smov w8, v2.b[8]
; CHECK-SD-NEXT:    smov w9, v0.b[8]
; CHECK-SD-NEXT:    str w10, [sp, #72] // 4-byte Folded Spill
; CHECK-SD-NEXT:    stp w8, w9, [sp, #100] // 8-byte Folded Spill
; CHECK-SD-NEXT:    sdiv w10, w9, w8
; CHECK-SD-NEXT:    smov w8, v2.b[9]
; CHECK-SD-NEXT:    smov w9, v0.b[9]
; CHECK-SD-NEXT:    stp w8, w9, [sp, #136] // 8-byte Folded Spill
; CHECK-SD-NEXT:    str w10, [sp, #108] // 4-byte Folded Spill
; CHECK-SD-NEXT:    sdiv w10, w9, w8
; CHECK-SD-NEXT:    smov w8, v2.b[10]
; CHECK-SD-NEXT:    smov w9, v0.b[10]
; CHECK-SD-NEXT:    stp w11, w8, [sp, #120] // 8-byte Folded Spill
; CHECK-SD-NEXT:    str w10, [sp, #144] // 4-byte Folded Spill
; CHECK-SD-NEXT:    sdiv w10, w9, w8
; CHECK-SD-NEXT:    smov w8, v2.b[11]
; CHECK-SD-NEXT:    stp w9, w10, [sp, #128] // 8-byte Folded Spill
; CHECK-SD-NEXT:    smov w9, v0.b[11]
; CHECK-SD-NEXT:    sdiv w25, w16, w17
; CHECK-SD-NEXT:    stp w8, w9, [sp, #172] // 8-byte Folded Spill
; CHECK-SD-NEXT:    sdiv w11, w9, w8
; CHECK-SD-NEXT:    smov w8, v2.b[12]
; CHECK-SD-NEXT:    smov w9, v0.b[12]
; CHECK-SD-NEXT:    str w8, [sp, #152] // 4-byte Folded Spill
; CHECK-SD-NEXT:    str w9, [sp, #160] // 4-byte Folded Spill
; CHECK-SD-NEXT:    sdiv w10, w9, w8
; CHECK-SD-NEXT:    smov w8, v2.b[13]
; CHECK-SD-NEXT:    smov w9, v0.b[13]
; CHECK-SD-NEXT:    stp w8, w9, [sp, #196] // 8-byte Folded Spill
; CHECK-SD-NEXT:    str w10, [sp, #168] // 4-byte Folded Spill
; CHECK-SD-NEXT:    sdiv w10, w9, w8
; CHECK-SD-NEXT:    smov w8, v2.b[14]
; CHECK-SD-NEXT:    smov w9, v0.b[14]
; CHECK-SD-NEXT:    stp w11, w8, [sp, #180] // 8-byte Folded Spill
; CHECK-SD-NEXT:    smov w11, v1.b[2]
; CHECK-SD-NEXT:    str w10, [sp, #204] // 4-byte Folded Spill
; CHECK-SD-NEXT:    sdiv w10, w9, w8
; CHECK-SD-NEXT:    smov w8, v2.b[15]
; CHECK-SD-NEXT:    str w8, [sp, #148] // 4-byte Folded Spill
; CHECK-SD-NEXT:    stp w9, w10, [sp, #188] // 8-byte Folded Spill
; CHECK-SD-NEXT:    smov w9, v0.b[15]
; CHECK-SD-NEXT:    sdiv w22, w11, w12
; CHECK-SD-NEXT:    str w9, [sp, #156] // 4-byte Folded Spill
; CHECK-SD-NEXT:    sdiv w10, w9, w8
; CHECK-SD-NEXT:    str w10, [sp, #164] // 4-byte Folded Spill
; CHECK-SD-NEXT:    smov w10, v1.b[0]
; CHECK-SD-NEXT:    sdiv w9, w7, w19
; CHECK-SD-NEXT:    sdiv w8, w3, w6
; CHECK-SD-NEXT:    sdiv w23, w10, w13
; CHECK-SD-NEXT:    stp w8, w9, [sp, #8] // 8-byte Folded Spill
; CHECK-SD-NEXT:    ldr w8, [sp, #96] // 4-byte Folded Reload
; CHECK-SD-NEXT:    ldr w9, [sp, #88] // 4-byte Folded Reload
; CHECK-SD-NEXT:    msub w9, w8, w30, w9
; CHECK-SD-NEXT:    ldr w8, [sp, #48] // 4-byte Folded Reload
; CHECK-SD-NEXT:    ldr w30, [sp, #40] // 4-byte Folded Reload
; CHECK-SD-NEXT:    msub w8, w8, w29, w30
; CHECK-SD-NEXT:    ldp x29, x30, [sp, #208] // 16-byte Folded Reload
; CHECK-SD-NEXT:    fmov s0, w8
; CHECK-SD-NEXT:    msub w10, w23, w13, w10
; CHECK-SD-NEXT:    sdiv w24, w14, w15
; CHECK-SD-NEXT:    msub w13, w27, w5, w0
; CHECK-SD-NEXT:    ldr w5, [sp, #16] // 4-byte Folded Reload
; CHECK-SD-NEXT:    mov v0.b[1], w9
; CHECK-SD-NEXT:    msub w9, w22, w12, w11
; CHECK-SD-NEXT:    smov w11, v1.b[10]
; CHECK-SD-NEXT:    fmov s2, w10
; CHECK-SD-NEXT:    ldp w10, w8, [sp, #20] // 8-byte Folded Reload
; CHECK-SD-NEXT:    mov v2.b[1], w13
; CHECK-SD-NEXT:    msub w8, w8, w5, w10
; CHECK-SD-NEXT:    ldr w5, [sp, #52] // 4-byte Folded Reload
; CHECK-SD-NEXT:    smov w10, v3.b[10]
; CHECK-SD-NEXT:    sdiv w28, w1, w4
; CHECK-SD-NEXT:    ldp w13, w12, [sp, #56] // 8-byte Folded Reload
; CHECK-SD-NEXT:    mov v2.b[2], w9
; CHECK-SD-NEXT:    mov v0.b[2], w8
; CHECK-SD-NEXT:    msub w8, w25, w17, w16
; CHECK-SD-NEXT:    ldr w17, [sp, #28] // 4-byte Folded Reload
; CHECK-SD-NEXT:    ldr w16, [sp, #36] // 4-byte Folded Reload
; CHECK-SD-NEXT:    msub w12, w12, w5, w13
; CHECK-SD-NEXT:    ldr w13, [sp, #44] // 4-byte Folded Reload
; CHECK-SD-NEXT:    ldr w5, [sp, #136] // 4-byte Folded Reload
; CHECK-SD-NEXT:    mov v2.b[3], w8
; CHECK-SD-NEXT:    msub w8, w24, w15, w14
; CHECK-SD-NEXT:    ldr w15, [sp, #92] // 4-byte Folded Reload
; CHECK-SD-NEXT:    mov v0.b[3], w12
; CHECK-SD-NEXT:    msub w13, w13, w17, w16
; CHECK-SD-NEXT:    ldr w17, [sp, #76] // 4-byte Folded Reload
; CHECK-SD-NEXT:    sdiv w26, w18, w2
; CHECK-SD-NEXT:    ldr w16, [sp, #84] // 4-byte Folded Reload
; CHECK-SD-NEXT:    smov w12, v3.b[11]
; CHECK-SD-NEXT:    msub w15, w15, w17, w16
; CHECK-SD-NEXT:    smov w14, v1.b[11]
; CHECK-SD-NEXT:    mov v2.b[4], w8
; CHECK-SD-NEXT:    msub w8, w28, w4, w1
; CHECK-SD-NEXT:    ldr w1, [sp, #64] // 4-byte Folded Reload
; CHECK-SD-NEXT:    mov v0.b[4], w13
; CHECK-SD-NEXT:    ldr w4, [sp, #100] // 4-byte Folded Reload
; CHECK-SD-NEXT:    ldp w17, w16, [sp, #68] // 8-byte Folded Reload
; CHECK-SD-NEXT:    ldp x24, x23, [sp, #256] // 16-byte Folded Reload
; CHECK-SD-NEXT:    mov v2.b[5], w8
; CHECK-SD-NEXT:    ldp x28, x27, [sp, #224] // 16-byte Folded Reload
; CHECK-SD-NEXT:    mov v0.b[5], w15
; CHECK-SD-NEXT:    msub w16, w16, w1, w17
; CHECK-SD-NEXT:    smov w15, v3.b[12]
; CHECK-SD-NEXT:    msub w8, w26, w2, w18
; CHECK-SD-NEXT:    ldr w2, [sp, #112] // 4-byte Folded Reload
; CHECK-SD-NEXT:    sdiv w0, w20, w21
; CHECK-SD-NEXT:    ldp w1, w18, [sp, #116] // 8-byte Folded Reload
; CHECK-SD-NEXT:    smov w17, v1.b[12]
; CHECK-SD-NEXT:    ldp x26, x25, [sp, #240] // 16-byte Folded Reload
; CHECK-SD-NEXT:    mov v2.b[6], w8
; CHECK-SD-NEXT:    ldr w8, [sp, #12] // 4-byte Folded Reload
; CHECK-SD-NEXT:    mov v0.b[6], w16
; CHECK-SD-NEXT:    msub w18, w18, w2, w1
; CHECK-SD-NEXT:    msub w8, w8, w19, w7
; CHECK-SD-NEXT:    ldp w2, w1, [sp, #104] // 8-byte Folded Reload
; CHECK-SD-NEXT:    mov v0.b[7], w18
; CHECK-SD-NEXT:    smov w18, v3.b[13]
; CHECK-SD-NEXT:    mov v2.b[7], w8
; CHECK-SD-NEXT:    ldr w8, [sp, #8] // 4-byte Folded Reload
; CHECK-SD-NEXT:    sdiv w9, w11, w10
; CHECK-SD-NEXT:    msub w1, w1, w4, w2
; CHECK-SD-NEXT:    smov w2, v1.b[13]
; CHECK-SD-NEXT:    msub w8, w8, w6, w3
; CHECK-SD-NEXT:    ldp w4, w3, [sp, #140] // 8-byte Folded Reload
; CHECK-SD-NEXT:    mov v0.b[8], w1
; CHECK-SD-NEXT:    mov v2.b[8], w8
; CHECK-SD-NEXT:    msub w8, w0, w21, w20
; CHECK-SD-NEXT:    msub w3, w3, w5, w4
; CHECK-SD-NEXT:    ldr w5, [sp, #124] // 4-byte Folded Reload
; CHECK-SD-NEXT:    ldp w4, w1, [sp, #128] // 8-byte Folded Reload
; CHECK-SD-NEXT:    sdiv w13, w14, w12
; CHECK-SD-NEXT:    ldp x20, x19, [sp, #288] // 16-byte Folded Reload
; CHECK-SD-NEXT:    mov v2.b[9], w8
; CHECK-SD-NEXT:    mov v0.b[9], w3
; CHECK-SD-NEXT:    msub w8, w9, w10, w11
; CHECK-SD-NEXT:    msub w1, w1, w5, w4
; CHECK-SD-NEXT:    ldr w4, [sp, #172] // 4-byte Folded Reload
; CHECK-SD-NEXT:    smov w9, v3.b[14]
; CHECK-SD-NEXT:    ldp w3, w11, [sp, #176] // 8-byte Folded Reload
; CHECK-SD-NEXT:    smov w10, v1.b[14]
; CHECK-SD-NEXT:    ldp x22, x21, [sp, #272] // 16-byte Folded Reload
; CHECK-SD-NEXT:    mov v2.b[10], w8
; CHECK-SD-NEXT:    mov v0.b[10], w1
; CHECK-SD-NEXT:    ldr w1, [sp, #152] // 4-byte Folded Reload
; CHECK-SD-NEXT:    msub w11, w11, w4, w3
; CHECK-SD-NEXT:    sdiv w16, w17, w15
; CHECK-SD-NEXT:    msub w8, w13, w12, w14
; CHECK-SD-NEXT:    ldr w13, [sp, #168] // 4-byte Folded Reload
; CHECK-SD-NEXT:    ldr w14, [sp, #160] // 4-byte Folded Reload
; CHECK-SD-NEXT:    mov v0.b[11], w11
; CHECK-SD-NEXT:    smov w11, v3.b[15]
; CHECK-SD-NEXT:    msub w13, w13, w1, w14
; CHECK-SD-NEXT:    smov w14, v1.b[15]
; CHECK-SD-NEXT:    mov v2.b[11], w8
; CHECK-SD-NEXT:    mov v0.b[12], w13
; CHECK-SD-NEXT:    sdiv w0, w2, w18
; CHECK-SD-NEXT:    msub w8, w16, w15, w17
; CHECK-SD-NEXT:    ldr w17, [sp, #196] // 4-byte Folded Reload
; CHECK-SD-NEXT:    ldp w16, w15, [sp, #200] // 8-byte Folded Reload
; CHECK-SD-NEXT:    mov v2.b[12], w8
; CHECK-SD-NEXT:    msub w15, w15, w17, w16
; CHECK-SD-NEXT:    ldp w17, w16, [sp, #188] // 8-byte Folded Reload
; CHECK-SD-NEXT:    mov v0.b[13], w15
; CHECK-SD-NEXT:    sdiv w12, w10, w9
; CHECK-SD-NEXT:    msub w8, w0, w18, w2
; CHECK-SD-NEXT:    ldr w18, [sp, #184] // 4-byte Folded Reload
; CHECK-SD-NEXT:    msub w16, w16, w18, w17
; CHECK-SD-NEXT:    mov v2.b[13], w8
; CHECK-SD-NEXT:    mov v0.b[14], w16
; CHECK-SD-NEXT:    sdiv w13, w14, w11
; CHECK-SD-NEXT:    msub w8, w12, w9, w10
; CHECK-SD-NEXT:    ldr w9, [sp, #164] // 4-byte Folded Reload
; CHECK-SD-NEXT:    ldr w12, [sp, #148] // 4-byte Folded Reload
; CHECK-SD-NEXT:    ldr w10, [sp, #156] // 4-byte Folded Reload
; CHECK-SD-NEXT:    mov v2.b[14], w8
; CHECK-SD-NEXT:    msub w9, w9, w12, w10
; CHECK-SD-NEXT:    mov v0.b[15], w9
; CHECK-SD-NEXT:    msub w8, w13, w11, w14
; CHECK-SD-NEXT:    mov v2.b[15], w8
; CHECK-SD-NEXT:    mov v1.16b, v2.16b
; CHECK-SD-NEXT:    add sp, sp, #304
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: sv32i8:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    stp x29, x30, [sp, #-96]! // 16-byte Folded Spill
; CHECK-GI-NEXT:    stp x28, x27, [sp, #16] // 16-byte Folded Spill
; CHECK-GI-NEXT:    stp x26, x25, [sp, #32] // 16-byte Folded Spill
; CHECK-GI-NEXT:    stp x24, x23, [sp, #48] // 16-byte Folded Spill
; CHECK-GI-NEXT:    stp x22, x21, [sp, #64] // 16-byte Folded Spill
; CHECK-GI-NEXT:    stp x20, x19, [sp, #80] // 16-byte Folded Spill
; CHECK-GI-NEXT:    .cfi_def_cfa_offset 96
; CHECK-GI-NEXT:    .cfi_offset w19, -8
; CHECK-GI-NEXT:    .cfi_offset w20, -16
; CHECK-GI-NEXT:    .cfi_offset w21, -24
; CHECK-GI-NEXT:    .cfi_offset w22, -32
; CHECK-GI-NEXT:    .cfi_offset w23, -40
; CHECK-GI-NEXT:    .cfi_offset w24, -48
; CHECK-GI-NEXT:    .cfi_offset w25, -56
; CHECK-GI-NEXT:    .cfi_offset w26, -64
; CHECK-GI-NEXT:    .cfi_offset w27, -72
; CHECK-GI-NEXT:    .cfi_offset w28, -80
; CHECK-GI-NEXT:    .cfi_offset w30, -88
; CHECK-GI-NEXT:    .cfi_offset w29, -96
; CHECK-GI-NEXT:    sshll v4.8h, v0.8b, #0
; CHECK-GI-NEXT:    sshll v5.8h, v2.8b, #0
; CHECK-GI-NEXT:    sshll v16.8h, v1.8b, #0
; CHECK-GI-NEXT:    sshll v17.8h, v3.8b, #0
; CHECK-GI-NEXT:    sshll v6.4s, v4.4h, #0
; CHECK-GI-NEXT:    sshll v7.4s, v5.4h, #0
; CHECK-GI-NEXT:    sshll2 v4.4s, v4.8h, #0
; CHECK-GI-NEXT:    sshll2 v5.4s, v5.8h, #0
; CHECK-GI-NEXT:    sshll v18.4s, v16.4h, #0
; CHECK-GI-NEXT:    sshll v19.4s, v17.4h, #0
; CHECK-GI-NEXT:    sshll2 v16.4s, v16.8h, #0
; CHECK-GI-NEXT:    sshll2 v17.4s, v17.8h, #0
; CHECK-GI-NEXT:    fmov w8, s6
; CHECK-GI-NEXT:    fmov w9, s7
; CHECK-GI-NEXT:    mov w12, v7.s[3]
; CHECK-GI-NEXT:    fmov w13, s5
; CHECK-GI-NEXT:    mov w16, v5.s[3]
; CHECK-GI-NEXT:    fmov w6, s19
; CHECK-GI-NEXT:    mov w7, v19.s[3]
; CHECK-GI-NEXT:    fmov w21, s17
; CHECK-GI-NEXT:    mov w23, v17.s[3]
; CHECK-GI-NEXT:    sdiv w11, w8, w9
; CHECK-GI-NEXT:    mov w8, v6.s[1]
; CHECK-GI-NEXT:    mov w9, v7.s[1]
; CHECK-GI-NEXT:    sdiv w10, w8, w9
; CHECK-GI-NEXT:    mov w8, v6.s[2]
; CHECK-GI-NEXT:    mov w9, v7.s[2]
; CHECK-GI-NEXT:    mov v20.s[0], w11
; CHECK-GI-NEXT:    sdiv w9, w8, w9
; CHECK-GI-NEXT:    mov w8, v6.s[3]
; CHECK-GI-NEXT:    sshll2 v6.8h, v0.16b, #0
; CHECK-GI-NEXT:    mov v20.s[1], w10
; CHECK-GI-NEXT:    sshll v0.8h, v0.8b, #0
; CHECK-GI-NEXT:    sshll v28.4s, v0.4h, #0
; CHECK-GI-NEXT:    sshll2 v0.4s, v0.8h, #0
; CHECK-GI-NEXT:    sdiv w8, w8, w12
; CHECK-GI-NEXT:    fmov w12, s4
; CHECK-GI-NEXT:    mov v20.s[2], w9
; CHECK-GI-NEXT:    sdiv w15, w12, w13
; CHECK-GI-NEXT:    mov w12, v4.s[1]
; CHECK-GI-NEXT:    mov w13, v5.s[1]
; CHECK-GI-NEXT:    mov v20.s[3], w8
; CHECK-GI-NEXT:    sdiv w14, w12, w13
; CHECK-GI-NEXT:    mov w12, v4.s[2]
; CHECK-GI-NEXT:    mov w13, v5.s[2]
; CHECK-GI-NEXT:    sshll v5.4s, v6.4h, #0
; CHECK-GI-NEXT:    mov v21.s[0], w15
; CHECK-GI-NEXT:    sdiv w13, w12, w13
; CHECK-GI-NEXT:    mov w12, v4.s[3]
; CHECK-GI-NEXT:    sshll2 v4.8h, v2.16b, #0
; CHECK-GI-NEXT:    mov v21.s[1], w14
; CHECK-GI-NEXT:    sshll v2.8h, v2.8b, #0
; CHECK-GI-NEXT:    sshll v7.4s, v4.4h, #0
; CHECK-GI-NEXT:    sshll v30.4s, v2.4h, #0
; CHECK-GI-NEXT:    sshll2 v2.4s, v2.8h, #0
; CHECK-GI-NEXT:    fmov w17, s7
; CHECK-GI-NEXT:    mls v28.4s, v20.4s, v30.4s
; CHECK-GI-NEXT:    sdiv w12, w12, w16
; CHECK-GI-NEXT:    fmov w16, s5
; CHECK-GI-NEXT:    mov v21.s[2], w13
; CHECK-GI-NEXT:    sdiv w1, w16, w17
; CHECK-GI-NEXT:    mov w16, v5.s[1]
; CHECK-GI-NEXT:    mov w17, v7.s[1]
; CHECK-GI-NEXT:    mov v21.s[3], w12
; CHECK-GI-NEXT:    mls v0.4s, v21.4s, v2.4s
; CHECK-GI-NEXT:    sdiv w0, w16, w17
; CHECK-GI-NEXT:    mov w16, v5.s[2]
; CHECK-GI-NEXT:    mov w17, v7.s[2]
; CHECK-GI-NEXT:    mov v22.s[0], w1
; CHECK-GI-NEXT:    uzp1 v0.8h, v28.8h, v0.8h
; CHECK-GI-NEXT:    sdiv w18, w16, w17
; CHECK-GI-NEXT:    mov w16, v5.s[3]
; CHECK-GI-NEXT:    mov w17, v7.s[3]
; CHECK-GI-NEXT:    sshll2 v5.4s, v6.8h, #0
; CHECK-GI-NEXT:    sshll2 v7.4s, v4.8h, #0
; CHECK-GI-NEXT:    mov v22.s[1], w0
; CHECK-GI-NEXT:    sshll v6.4s, v6.4h, #0
; CHECK-GI-NEXT:    sshll v4.4s, v4.4h, #0
; CHECK-GI-NEXT:    fmov w2, s7
; CHECK-GI-NEXT:    mov w4, v7.s[3]
; CHECK-GI-NEXT:    sdiv w16, w16, w17
; CHECK-GI-NEXT:    fmov w17, s5
; CHECK-GI-NEXT:    mov v22.s[2], w18
; CHECK-GI-NEXT:    sdiv w5, w17, w2
; CHECK-GI-NEXT:    mov w17, v5.s[1]
; CHECK-GI-NEXT:    mov w2, v7.s[1]
; CHECK-GI-NEXT:    mov v22.s[3], w16
; CHECK-GI-NEXT:    mls v6.4s, v22.4s, v4.4s
; CHECK-GI-NEXT:    sdiv w3, w17, w2
; CHECK-GI-NEXT:    mov w17, v5.s[2]
; CHECK-GI-NEXT:    mov w2, v7.s[2]
; CHECK-GI-NEXT:    mov v23.s[0], w5
; CHECK-GI-NEXT:    sdiv w2, w17, w2
; CHECK-GI-NEXT:    mov w17, v5.s[3]
; CHECK-GI-NEXT:    mov v23.s[1], w3
; CHECK-GI-NEXT:    sdiv w17, w17, w4
; CHECK-GI-NEXT:    fmov w4, s18
; CHECK-GI-NEXT:    mov v23.s[2], w2
; CHECK-GI-NEXT:    sdiv w20, w4, w6
; CHECK-GI-NEXT:    mov w4, v18.s[1]
; CHECK-GI-NEXT:    mov w6, v19.s[1]
; CHECK-GI-NEXT:    mov v23.s[3], w17
; CHECK-GI-NEXT:    mls v5.4s, v23.4s, v7.4s
; CHECK-GI-NEXT:    sdiv w19, w4, w6
; CHECK-GI-NEXT:    mov w4, v18.s[2]
; CHECK-GI-NEXT:    mov w6, v19.s[2]
; CHECK-GI-NEXT:    mov v24.s[0], w20
; CHECK-GI-NEXT:    uzp1 v2.8h, v6.8h, v5.8h
; CHECK-GI-NEXT:    uzp1 v0.16b, v0.16b, v2.16b
; CHECK-GI-NEXT:    sdiv w6, w4, w6
; CHECK-GI-NEXT:    mov w4, v18.s[3]
; CHECK-GI-NEXT:    mov v24.s[1], w19
; CHECK-GI-NEXT:    ldp x20, x19, [sp, #80] // 16-byte Folded Reload
; CHECK-GI-NEXT:    sdiv w4, w4, w7
; CHECK-GI-NEXT:    fmov w7, s16
; CHECK-GI-NEXT:    mov v24.s[2], w6
; CHECK-GI-NEXT:    sdiv w24, w7, w21
; CHECK-GI-NEXT:    mov w7, v16.s[1]
; CHECK-GI-NEXT:    mov w21, v17.s[1]
; CHECK-GI-NEXT:    mov v24.s[3], w4
; CHECK-GI-NEXT:    sdiv w22, w7, w21
; CHECK-GI-NEXT:    mov w7, v16.s[2]
; CHECK-GI-NEXT:    mov w21, v17.s[2]
; CHECK-GI-NEXT:    sshll2 v17.8h, v1.16b, #0
; CHECK-GI-NEXT:    mov v25.s[0], w24
; CHECK-GI-NEXT:    sshll v1.8h, v1.8b, #0
; CHECK-GI-NEXT:    sshll v18.4s, v17.4h, #0
; CHECK-GI-NEXT:    sshll v29.4s, v1.4h, #0
; CHECK-GI-NEXT:    sshll2 v1.4s, v1.8h, #0
; CHECK-GI-NEXT:    sdiv w21, w7, w21
; CHECK-GI-NEXT:    mov w7, v16.s[3]
; CHECK-GI-NEXT:    sshll2 v16.8h, v3.16b, #0
; CHECK-GI-NEXT:    mov v25.s[1], w22
; CHECK-GI-NEXT:    sshll v3.8h, v3.8b, #0
; CHECK-GI-NEXT:    sshll v19.4s, v16.4h, #0
; CHECK-GI-NEXT:    sshll v31.4s, v3.4h, #0
; CHECK-GI-NEXT:    sshll2 v3.4s, v3.8h, #0
; CHECK-GI-NEXT:    fmov w25, s19
; CHECK-GI-NEXT:    mov w26, v19.s[1]
; CHECK-GI-NEXT:    mov w27, v19.s[2]
; CHECK-GI-NEXT:    mov w28, v19.s[3]
; CHECK-GI-NEXT:    sshll2 v19.4s, v16.8h, #0
; CHECK-GI-NEXT:    sshll v16.4s, v16.4h, #0
; CHECK-GI-NEXT:    sdiv w7, w7, w23
; CHECK-GI-NEXT:    fmov w23, s18
; CHECK-GI-NEXT:    mov v25.s[2], w21
; CHECK-GI-NEXT:    mls v29.4s, v24.4s, v31.4s
; CHECK-GI-NEXT:    ldp x22, x21, [sp, #64] // 16-byte Folded Reload
; CHECK-GI-NEXT:    fmov w29, s19
; CHECK-GI-NEXT:    mov w30, v19.s[1]
; CHECK-GI-NEXT:    mov w15, v19.s[2]
; CHECK-GI-NEXT:    sdiv w25, w23, w25
; CHECK-GI-NEXT:    mov w23, v18.s[1]
; CHECK-GI-NEXT:    mov v25.s[3], w7
; CHECK-GI-NEXT:    mls v1.4s, v25.4s, v3.4s
; CHECK-GI-NEXT:    sdiv w26, w23, w26
; CHECK-GI-NEXT:    mov w23, v18.s[2]
; CHECK-GI-NEXT:    mov v26.s[0], w25
; CHECK-GI-NEXT:    uzp1 v1.8h, v29.8h, v1.8h
; CHECK-GI-NEXT:    sdiv w27, w23, w27
; CHECK-GI-NEXT:    mov w23, v18.s[3]
; CHECK-GI-NEXT:    sshll2 v18.4s, v17.8h, #0
; CHECK-GI-NEXT:    mov v26.s[1], w26
; CHECK-GI-NEXT:    sshll v17.4s, v17.4h, #0
; CHECK-GI-NEXT:    ldp x26, x25, [sp, #32] // 16-byte Folded Reload
; CHECK-GI-NEXT:    mov w11, v18.s[2]
; CHECK-GI-NEXT:    mov w9, v18.s[3]
; CHECK-GI-NEXT:    sdiv w23, w23, w28
; CHECK-GI-NEXT:    fmov w28, s18
; CHECK-GI-NEXT:    mov v26.s[2], w27
; CHECK-GI-NEXT:    sdiv w28, w28, w29
; CHECK-GI-NEXT:    mov w29, v18.s[1]
; CHECK-GI-NEXT:    mov v26.s[3], w23
; CHECK-GI-NEXT:    ldp x24, x23, [sp, #48] // 16-byte Folded Reload
; CHECK-GI-NEXT:    mls v17.4s, v26.4s, v16.4s
; CHECK-GI-NEXT:    sdiv w29, w29, w30
; CHECK-GI-NEXT:    mov v27.s[0], w28
; CHECK-GI-NEXT:    ldp x28, x27, [sp, #16] // 16-byte Folded Reload
; CHECK-GI-NEXT:    sdiv w10, w11, w15
; CHECK-GI-NEXT:    mov w11, v19.s[3]
; CHECK-GI-NEXT:    mov v27.s[1], w29
; CHECK-GI-NEXT:    sdiv w8, w9, w11
; CHECK-GI-NEXT:    mov v27.s[2], w10
; CHECK-GI-NEXT:    mov v27.s[3], w8
; CHECK-GI-NEXT:    mls v18.4s, v27.4s, v19.4s
; CHECK-GI-NEXT:    uzp1 v3.8h, v17.8h, v18.8h
; CHECK-GI-NEXT:    uzp1 v1.16b, v1.16b, v3.16b
; CHECK-GI-NEXT:    ldp x29, x30, [sp], #96 // 16-byte Folded Reload
; CHECK-GI-NEXT:    ret
entry:
  %s = srem <32 x i8> %d, %e
  ret <32 x i8> %s
}

; Unsigned remainder of a <2 x i8> vector.
; Expected output for both selectors first masks every 32-bit lane with 0xff
; and performs one scalar udiv per lane. The SelectionDAG output then forms
; each remainder with a scalar msub and inserts it into v0, while the
; GlobalISel output rebuilds the quotient vector in v2 and finishes with a
; single vector mls.
; The assertions below are autogenerated (see the NOTE at the top of the
; file); regenerate them with the update script instead of editing by hand.
define <2 x i8> @uv2i8(<2 x i8> %d, <2 x i8> %e) {
; CHECK-SD-LABEL: uv2i8:
; CHECK-SD:       // %bb.0: // %entry
; CHECK-SD-NEXT:    movi d2, #0x0000ff000000ff
; CHECK-SD-NEXT:    and v0.8b, v0.8b, v2.8b
; CHECK-SD-NEXT:    and v1.8b, v1.8b, v2.8b
; CHECK-SD-NEXT:    fmov w8, s1
; CHECK-SD-NEXT:    fmov w9, s0
; CHECK-SD-NEXT:    mov w11, v1.s[1]
; CHECK-SD-NEXT:    mov w12, v0.s[1]
; CHECK-SD-NEXT:    udiv w10, w9, w8
; CHECK-SD-NEXT:    udiv w13, w12, w11
; CHECK-SD-NEXT:    msub w8, w10, w8, w9
; CHECK-SD-NEXT:    fmov s0, w8
; CHECK-SD-NEXT:    msub w9, w13, w11, w12
; CHECK-SD-NEXT:    mov v0.s[1], w9
; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 killed $q0
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: uv2i8:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    movi d2, #0x0000ff000000ff
; CHECK-GI-NEXT:    and v0.8b, v0.8b, v2.8b
; CHECK-GI-NEXT:    and v1.8b, v1.8b, v2.8b
; CHECK-GI-NEXT:    fmov w8, s0
; CHECK-GI-NEXT:    fmov w9, s1
; CHECK-GI-NEXT:    mov w10, v1.s[1]
; CHECK-GI-NEXT:    udiv w8, w8, w9
; CHECK-GI-NEXT:    mov w9, v0.s[1]
; CHECK-GI-NEXT:    udiv w9, w9, w10
; CHECK-GI-NEXT:    mov v2.s[0], w8
; CHECK-GI-NEXT:    mov v2.s[1], w9
; CHECK-GI-NEXT:    mls v0.2s, v2.2s, v1.2s
; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
; CHECK-GI-NEXT:    ret
entry:
  %s = urem <2 x i8> %d, %e
  ret <2 x i8> %s
}

; Unsigned remainder of a <3 x i8> vector.
; In the expected output the six elements are read from w0-w5 and the three
; results are written to w0-w2; no vector registers are involved. Both
; selectors mask each input with 0xff and emit three scalar udiv/msub pairs;
; they differ only in which operand is masked first and in the resulting
; register assignment.
; The assertions below are autogenerated (see the NOTE at the top of the
; file); regenerate them with the update script instead of editing by hand.
define <3 x i8> @uv3i8(<3 x i8> %d, <3 x i8> %e) {
; CHECK-SD-LABEL: uv3i8:
; CHECK-SD:       // %bb.0: // %entry
; CHECK-SD-NEXT:    and w8, w3, #0xff
; CHECK-SD-NEXT:    and w9, w0, #0xff
; CHECK-SD-NEXT:    and w11, w4, #0xff
; CHECK-SD-NEXT:    and w12, w1, #0xff
; CHECK-SD-NEXT:    and w14, w5, #0xff
; CHECK-SD-NEXT:    and w15, w2, #0xff
; CHECK-SD-NEXT:    udiv w10, w9, w8
; CHECK-SD-NEXT:    udiv w13, w12, w11
; CHECK-SD-NEXT:    msub w0, w10, w8, w9
; CHECK-SD-NEXT:    udiv w16, w15, w14
; CHECK-SD-NEXT:    msub w1, w13, w11, w12
; CHECK-SD-NEXT:    msub w2, w16, w14, w15
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: uv3i8:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    and w8, w0, #0xff
; CHECK-GI-NEXT:    and w9, w3, #0xff
; CHECK-GI-NEXT:    and w11, w1, #0xff
; CHECK-GI-NEXT:    and w12, w4, #0xff
; CHECK-GI-NEXT:    and w14, w2, #0xff
; CHECK-GI-NEXT:    and w15, w5, #0xff
; CHECK-GI-NEXT:    udiv w10, w8, w9
; CHECK-GI-NEXT:    udiv w13, w11, w12
; CHECK-GI-NEXT:    msub w0, w10, w9, w8
; CHECK-GI-NEXT:    udiv w16, w14, w15
; CHECK-GI-NEXT:    msub w1, w13, w12, w11
; CHECK-GI-NEXT:    msub w2, w16, w15, w14
; CHECK-GI-NEXT:    ret
entry:
  %s = urem <3 x i8> %d, %e
  ret <3 x i8> %s
}

; Unsigned remainder of a <4 x i8> vector.
; SelectionDAG output: clears the upper byte of every 16-bit lane with bic,
; extracts the lanes with umov, performs four scalar udiv/msub pairs and
; inserts each remainder back into a 16-bit lane of v0.
; GlobalISel output: widens both operands to 32-bit lanes with ushll, masks
; them with 0xff, performs four scalar udiv, collects the quotients in v2,
; computes all remainders with one vector mls and narrows with xtn.
; The assertions below are autogenerated (see the NOTE at the top of the
; file); regenerate them with the update script instead of editing by hand.
define <4 x i8> @uv4i8(<4 x i8> %d, <4 x i8> %e) {
; CHECK-SD-LABEL: uv4i8:
; CHECK-SD:       // %bb.0: // %entry
; CHECK-SD-NEXT:    // kill: def $d1 killed $d1 def $q1
; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
; CHECK-SD-NEXT:    bic v0.4h, #255, lsl #8
; CHECK-SD-NEXT:    bic v1.4h, #255, lsl #8
; CHECK-SD-NEXT:    umov w11, v1.h[0]
; CHECK-SD-NEXT:    umov w12, v0.h[0]
; CHECK-SD-NEXT:    umov w8, v1.h[1]
; CHECK-SD-NEXT:    umov w9, v0.h[1]
; CHECK-SD-NEXT:    umov w14, v1.h[2]
; CHECK-SD-NEXT:    umov w15, v0.h[2]
; CHECK-SD-NEXT:    umov w17, v1.h[3]
; CHECK-SD-NEXT:    umov w18, v0.h[3]
; CHECK-SD-NEXT:    udiv w13, w12, w11
; CHECK-SD-NEXT:    udiv w10, w9, w8
; CHECK-SD-NEXT:    msub w11, w13, w11, w12
; CHECK-SD-NEXT:    fmov s0, w11
; CHECK-SD-NEXT:    udiv w16, w15, w14
; CHECK-SD-NEXT:    msub w8, w10, w8, w9
; CHECK-SD-NEXT:    mov v0.h[1], w8
; CHECK-SD-NEXT:    udiv w9, w18, w17
; CHECK-SD-NEXT:    msub w8, w16, w14, w15
; CHECK-SD-NEXT:    mov v0.h[2], w8
; CHECK-SD-NEXT:    msub w8, w9, w17, w18
; CHECK-SD-NEXT:    mov v0.h[3], w8
; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 killed $q0
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: uv4i8:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    movi v2.2d, #0x0000ff000000ff
; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
; CHECK-GI-NEXT:    ushll v1.4s, v1.4h, #0
; CHECK-GI-NEXT:    and v0.16b, v0.16b, v2.16b
; CHECK-GI-NEXT:    and v1.16b, v1.16b, v2.16b
; CHECK-GI-NEXT:    fmov w8, s0
; CHECK-GI-NEXT:    fmov w9, s1
; CHECK-GI-NEXT:    mov w10, v1.s[1]
; CHECK-GI-NEXT:    mov w11, v1.s[2]
; CHECK-GI-NEXT:    mov w12, v1.s[3]
; CHECK-GI-NEXT:    udiv w8, w8, w9
; CHECK-GI-NEXT:    mov w9, v0.s[1]
; CHECK-GI-NEXT:    udiv w9, w9, w10
; CHECK-GI-NEXT:    mov w10, v0.s[2]
; CHECK-GI-NEXT:    mov v2.s[0], w8
; CHECK-GI-NEXT:    udiv w10, w10, w11
; CHECK-GI-NEXT:    mov w11, v0.s[3]
; CHECK-GI-NEXT:    mov v2.s[1], w9
; CHECK-GI-NEXT:    udiv w8, w11, w12
; CHECK-GI-NEXT:    mov v2.s[2], w10
; CHECK-GI-NEXT:    mov v2.s[3], w8
; CHECK-GI-NEXT:    mls v0.4s, v2.4s, v1.4s
; CHECK-GI-NEXT:    xtn v0.4h, v0.4s
; CHECK-GI-NEXT:    ret
entry:
  %s = urem <4 x i8> %d, %e
  ret <4 x i8> %s
}

; Unsigned remainder of a <8 x i8> vector.
; SelectionDAG output: extracts every byte lane with umov, performs eight
; scalar udiv/msub pairs, assembles the remainders byte by byte in v2 and
; copies the result to d0.
; GlobalISel output: zero-extends both operands in two steps (ushll to 16-bit
; lanes, then ushll/ushll2 to 32-bit lanes), performs eight scalar udiv,
; gathers the quotients in v4 and v5, computes the remainders with two vector
; mls instructions and narrows the result with uzp1 followed by xtn.
; The assertions below are autogenerated (see the NOTE at the top of the
; file); regenerate them with the update script instead of editing by hand.
define <8 x i8> @uv8i8(<8 x i8> %d, <8 x i8> %e) {
; CHECK-SD-LABEL: uv8i8:
; CHECK-SD:       // %bb.0: // %entry
; CHECK-SD-NEXT:    // kill: def $d1 killed $d1 def $q1
; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
; CHECK-SD-NEXT:    umov w11, v1.b[0]
; CHECK-SD-NEXT:    umov w12, v0.b[0]
; CHECK-SD-NEXT:    umov w8, v1.b[1]
; CHECK-SD-NEXT:    umov w9, v0.b[1]
; CHECK-SD-NEXT:    umov w14, v1.b[2]
; CHECK-SD-NEXT:    umov w15, v0.b[2]
; CHECK-SD-NEXT:    umov w17, v1.b[3]
; CHECK-SD-NEXT:    umov w18, v0.b[3]
; CHECK-SD-NEXT:    umov w1, v1.b[4]
; CHECK-SD-NEXT:    umov w2, v0.b[4]
; CHECK-SD-NEXT:    umov w4, v1.b[5]
; CHECK-SD-NEXT:    umov w5, v0.b[5]
; CHECK-SD-NEXT:    udiv w13, w12, w11
; CHECK-SD-NEXT:    udiv w10, w9, w8
; CHECK-SD-NEXT:    msub w11, w13, w11, w12
; CHECK-SD-NEXT:    umov w13, v1.b[7]
; CHECK-SD-NEXT:    fmov s2, w11
; CHECK-SD-NEXT:    umov w11, v0.b[6]
; CHECK-SD-NEXT:    udiv w16, w15, w14
; CHECK-SD-NEXT:    msub w8, w10, w8, w9
; CHECK-SD-NEXT:    umov w10, v1.b[6]
; CHECK-SD-NEXT:    mov v2.b[1], w8
; CHECK-SD-NEXT:    udiv w0, w18, w17
; CHECK-SD-NEXT:    msub w8, w16, w14, w15
; CHECK-SD-NEXT:    umov w14, v0.b[7]
; CHECK-SD-NEXT:    mov v2.b[2], w8
; CHECK-SD-NEXT:    udiv w3, w2, w1
; CHECK-SD-NEXT:    msub w8, w0, w17, w18
; CHECK-SD-NEXT:    mov v2.b[3], w8
; CHECK-SD-NEXT:    udiv w9, w5, w4
; CHECK-SD-NEXT:    msub w8, w3, w1, w2
; CHECK-SD-NEXT:    mov v2.b[4], w8
; CHECK-SD-NEXT:    udiv w12, w11, w10
; CHECK-SD-NEXT:    msub w8, w9, w4, w5
; CHECK-SD-NEXT:    mov v2.b[5], w8
; CHECK-SD-NEXT:    udiv w9, w14, w13
; CHECK-SD-NEXT:    msub w8, w12, w10, w11
; CHECK-SD-NEXT:    mov v2.b[6], w8
; CHECK-SD-NEXT:    msub w8, w9, w13, w14
; CHECK-SD-NEXT:    mov v2.b[7], w8
; CHECK-SD-NEXT:    fmov d0, d2
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: uv8i8:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    ushll v0.8h, v0.8b, #0
; CHECK-GI-NEXT:    ushll v1.8h, v1.8b, #0
; CHECK-GI-NEXT:    ushll v2.4s, v0.4h, #0
; CHECK-GI-NEXT:    ushll v3.4s, v1.4h, #0
; CHECK-GI-NEXT:    ushll2 v0.4s, v0.8h, #0
; CHECK-GI-NEXT:    ushll2 v1.4s, v1.8h, #0
; CHECK-GI-NEXT:    fmov w8, s2
; CHECK-GI-NEXT:    fmov w9, s3
; CHECK-GI-NEXT:    mov w10, v3.s[1]
; CHECK-GI-NEXT:    mov w11, v3.s[2]
; CHECK-GI-NEXT:    mov w12, v3.s[3]
; CHECK-GI-NEXT:    fmov w13, s1
; CHECK-GI-NEXT:    mov w14, v1.s[1]
; CHECK-GI-NEXT:    mov w15, v1.s[2]
; CHECK-GI-NEXT:    udiv w8, w8, w9
; CHECK-GI-NEXT:    mov w9, v2.s[1]
; CHECK-GI-NEXT:    udiv w9, w9, w10
; CHECK-GI-NEXT:    mov w10, v2.s[2]
; CHECK-GI-NEXT:    mov v4.s[0], w8
; CHECK-GI-NEXT:    mov w8, v0.s[3]
; CHECK-GI-NEXT:    udiv w10, w10, w11
; CHECK-GI-NEXT:    mov w11, v2.s[3]
; CHECK-GI-NEXT:    mov v4.s[1], w9
; CHECK-GI-NEXT:    udiv w11, w11, w12
; CHECK-GI-NEXT:    fmov w12, s0
; CHECK-GI-NEXT:    mov v4.s[2], w10
; CHECK-GI-NEXT:    udiv w12, w12, w13
; CHECK-GI-NEXT:    mov w13, v0.s[1]
; CHECK-GI-NEXT:    mov v4.s[3], w11
; CHECK-GI-NEXT:    mls v2.4s, v4.4s, v3.4s
; CHECK-GI-NEXT:    udiv w13, w13, w14
; CHECK-GI-NEXT:    mov w14, v0.s[2]
; CHECK-GI-NEXT:    mov v5.s[0], w12
; CHECK-GI-NEXT:    mov w12, v1.s[3]
; CHECK-GI-NEXT:    udiv w14, w14, w15
; CHECK-GI-NEXT:    mov v5.s[1], w13
; CHECK-GI-NEXT:    udiv w8, w8, w12
; CHECK-GI-NEXT:    mov v5.s[2], w14
; CHECK-GI-NEXT:    mov v5.s[3], w8
; CHECK-GI-NEXT:    mls v0.4s, v5.4s, v1.4s
; CHECK-GI-NEXT:    uzp1 v0.8h, v2.8h, v0.8h
; CHECK-GI-NEXT:    xtn v0.8b, v0.8h
; CHECK-GI-NEXT:    ret
entry:
  %s = urem <8 x i8> %d, %e
  ret <8 x i8> %s
}

; Unsigned remainder of a <16 x i8> vector.
; SelectionDAG output: stores x19-x28 to an 80-byte stack frame on entry and
; reloads them before returning, extracts every byte lane with umov, performs
; sixteen scalar udiv/msub pairs, assembles the remainders byte by byte in v2
; and copies the result to v0.
; GlobalISel output: uses no stack frame. It zero-extends both operands into
; four vectors of 32-bit lanes each (ushll/ushll2), performs sixteen scalar
; udiv, gathers the quotients in v16-v19, computes the remainders with four
; vector mls instructions and narrows the result with three uzp1
; instructions.
; The assertions below are autogenerated (see the NOTE at the top of the
; file); regenerate them with the update script instead of editing by hand.
define <16 x i8> @uv16i8(<16 x i8> %d, <16 x i8> %e) {
; CHECK-SD-LABEL: uv16i8:
; CHECK-SD:       // %bb.0: // %entry
; CHECK-SD-NEXT:    stp x28, x27, [sp, #-80]! // 16-byte Folded Spill
; CHECK-SD-NEXT:    stp x26, x25, [sp, #16] // 16-byte Folded Spill
; CHECK-SD-NEXT:    stp x24, x23, [sp, #32] // 16-byte Folded Spill
; CHECK-SD-NEXT:    stp x22, x21, [sp, #48] // 16-byte Folded Spill
; CHECK-SD-NEXT:    stp x20, x19, [sp, #64] // 16-byte Folded Spill
; CHECK-SD-NEXT:    .cfi_def_cfa_offset 80
; CHECK-SD-NEXT:    .cfi_offset w19, -8
; CHECK-SD-NEXT:    .cfi_offset w20, -16
; CHECK-SD-NEXT:    .cfi_offset w21, -24
; CHECK-SD-NEXT:    .cfi_offset w22, -32
; CHECK-SD-NEXT:    .cfi_offset w23, -40
; CHECK-SD-NEXT:    .cfi_offset w24, -48
; CHECK-SD-NEXT:    .cfi_offset w25, -56
; CHECK-SD-NEXT:    .cfi_offset w26, -64
; CHECK-SD-NEXT:    .cfi_offset w27, -72
; CHECK-SD-NEXT:    .cfi_offset w28, -80
; CHECK-SD-NEXT:    umov w11, v1.b[0]
; CHECK-SD-NEXT:    umov w12, v0.b[0]
; CHECK-SD-NEXT:    umov w8, v1.b[1]
; CHECK-SD-NEXT:    umov w9, v0.b[1]
; CHECK-SD-NEXT:    umov w14, v1.b[2]
; CHECK-SD-NEXT:    umov w15, v0.b[2]
; CHECK-SD-NEXT:    umov w17, v1.b[3]
; CHECK-SD-NEXT:    umov w18, v0.b[3]
; CHECK-SD-NEXT:    umov w1, v1.b[4]
; CHECK-SD-NEXT:    umov w2, v0.b[4]
; CHECK-SD-NEXT:    umov w4, v1.b[5]
; CHECK-SD-NEXT:    umov w5, v0.b[5]
; CHECK-SD-NEXT:    udiv w13, w12, w11
; CHECK-SD-NEXT:    umov w7, v1.b[6]
; CHECK-SD-NEXT:    umov w19, v0.b[6]
; CHECK-SD-NEXT:    umov w21, v1.b[7]
; CHECK-SD-NEXT:    umov w22, v0.b[7]
; CHECK-SD-NEXT:    umov w24, v1.b[8]
; CHECK-SD-NEXT:    umov w25, v0.b[8]
; CHECK-SD-NEXT:    umov w27, v1.b[9]
; CHECK-SD-NEXT:    umov w28, v0.b[9]
; CHECK-SD-NEXT:    udiv w10, w9, w8
; CHECK-SD-NEXT:    msub w11, w13, w11, w12
; CHECK-SD-NEXT:    umov w13, v1.b[11]
; CHECK-SD-NEXT:    fmov s2, w11
; CHECK-SD-NEXT:    umov w11, v0.b[10]
; CHECK-SD-NEXT:    udiv w16, w15, w14
; CHECK-SD-NEXT:    msub w8, w10, w8, w9
; CHECK-SD-NEXT:    umov w10, v1.b[10]
; CHECK-SD-NEXT:    mov v2.b[1], w8
; CHECK-SD-NEXT:    udiv w0, w18, w17
; CHECK-SD-NEXT:    msub w8, w16, w14, w15
; CHECK-SD-NEXT:    umov w14, v0.b[11]
; CHECK-SD-NEXT:    umov w16, v1.b[12]
; CHECK-SD-NEXT:    mov v2.b[2], w8
; CHECK-SD-NEXT:    udiv w3, w2, w1
; CHECK-SD-NEXT:    msub w8, w0, w17, w18
; CHECK-SD-NEXT:    umov w17, v0.b[12]
; CHECK-SD-NEXT:    umov w0, v1.b[13]
; CHECK-SD-NEXT:    mov v2.b[3], w8
; CHECK-SD-NEXT:    udiv w6, w5, w4
; CHECK-SD-NEXT:    msub w8, w3, w1, w2
; CHECK-SD-NEXT:    umov w1, v0.b[13]
; CHECK-SD-NEXT:    mov v2.b[4], w8
; CHECK-SD-NEXT:    udiv w20, w19, w7
; CHECK-SD-NEXT:    msub w8, w6, w4, w5
; CHECK-SD-NEXT:    mov v2.b[5], w8
; CHECK-SD-NEXT:    udiv w23, w22, w21
; CHECK-SD-NEXT:    msub w8, w20, w7, w19
; CHECK-SD-NEXT:    ldp x20, x19, [sp, #64] // 16-byte Folded Reload
; CHECK-SD-NEXT:    mov v2.b[6], w8
; CHECK-SD-NEXT:    udiv w26, w25, w24
; CHECK-SD-NEXT:    msub w8, w23, w21, w22
; CHECK-SD-NEXT:    ldp x22, x21, [sp, #48] // 16-byte Folded Reload
; CHECK-SD-NEXT:    mov v2.b[7], w8
; CHECK-SD-NEXT:    udiv w9, w28, w27
; CHECK-SD-NEXT:    msub w8, w26, w24, w25
; CHECK-SD-NEXT:    ldp x24, x23, [sp, #32] // 16-byte Folded Reload
; CHECK-SD-NEXT:    ldp x26, x25, [sp, #16] // 16-byte Folded Reload
; CHECK-SD-NEXT:    mov v2.b[8], w8
; CHECK-SD-NEXT:    udiv w12, w11, w10
; CHECK-SD-NEXT:    msub w8, w9, w27, w28
; CHECK-SD-NEXT:    mov v2.b[9], w8
; CHECK-SD-NEXT:    udiv w15, w14, w13
; CHECK-SD-NEXT:    msub w8, w12, w10, w11
; CHECK-SD-NEXT:    umov w10, v1.b[14]
; CHECK-SD-NEXT:    umov w11, v0.b[14]
; CHECK-SD-NEXT:    mov v2.b[10], w8
; CHECK-SD-NEXT:    udiv w18, w17, w16
; CHECK-SD-NEXT:    msub w8, w15, w13, w14
; CHECK-SD-NEXT:    umov w13, v1.b[15]
; CHECK-SD-NEXT:    umov w14, v0.b[15]
; CHECK-SD-NEXT:    mov v2.b[11], w8
; CHECK-SD-NEXT:    udiv w9, w1, w0
; CHECK-SD-NEXT:    msub w8, w18, w16, w17
; CHECK-SD-NEXT:    mov v2.b[12], w8
; CHECK-SD-NEXT:    udiv w12, w11, w10
; CHECK-SD-NEXT:    msub w8, w9, w0, w1
; CHECK-SD-NEXT:    mov v2.b[13], w8
; CHECK-SD-NEXT:    udiv w9, w14, w13
; CHECK-SD-NEXT:    msub w8, w12, w10, w11
; CHECK-SD-NEXT:    mov v2.b[14], w8
; CHECK-SD-NEXT:    msub w8, w9, w13, w14
; CHECK-SD-NEXT:    mov v2.b[15], w8
; CHECK-SD-NEXT:    mov v0.16b, v2.16b
; CHECK-SD-NEXT:    ldp x28, x27, [sp], #80 // 16-byte Folded Reload
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: uv16i8:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    ushll v4.8h, v0.8b, #0
; CHECK-GI-NEXT:    ushll v5.8h, v1.8b, #0
; CHECK-GI-NEXT:    ushll2 v6.8h, v0.16b, #0
; CHECK-GI-NEXT:    ushll2 v7.8h, v1.16b, #0
; CHECK-GI-NEXT:    ushll v2.4s, v4.4h, #0
; CHECK-GI-NEXT:    ushll v3.4s, v5.4h, #0
; CHECK-GI-NEXT:    ushll2 v4.4s, v4.8h, #0
; CHECK-GI-NEXT:    ushll2 v5.4s, v5.8h, #0
; CHECK-GI-NEXT:    ushll v0.4s, v6.4h, #0
; CHECK-GI-NEXT:    ushll v1.4s, v7.4h, #0
; CHECK-GI-NEXT:    ushll2 v6.4s, v6.8h, #0
; CHECK-GI-NEXT:    ushll2 v7.4s, v7.8h, #0
; CHECK-GI-NEXT:    fmov w8, s2
; CHECK-GI-NEXT:    fmov w9, s3
; CHECK-GI-NEXT:    mov w12, v3.s[3]
; CHECK-GI-NEXT:    fmov w13, s5
; CHECK-GI-NEXT:    mov w16, v5.s[3]
; CHECK-GI-NEXT:    fmov w17, s1
; CHECK-GI-NEXT:    mov w18, v1.s[1]
; CHECK-GI-NEXT:    mov w0, v1.s[2]
; CHECK-GI-NEXT:    mov w1, v1.s[3]
; CHECK-GI-NEXT:    udiv w11, w8, w9
; CHECK-GI-NEXT:    mov w8, v2.s[1]
; CHECK-GI-NEXT:    mov w9, v3.s[1]
; CHECK-GI-NEXT:    fmov w2, s7
; CHECK-GI-NEXT:    mov w3, v7.s[1]
; CHECK-GI-NEXT:    mov w4, v7.s[2]
; CHECK-GI-NEXT:    udiv w10, w8, w9
; CHECK-GI-NEXT:    mov w8, v2.s[2]
; CHECK-GI-NEXT:    mov w9, v3.s[2]
; CHECK-GI-NEXT:    mov v16.s[0], w11
; CHECK-GI-NEXT:    mov w11, v6.s[3]
; CHECK-GI-NEXT:    udiv w9, w8, w9
; CHECK-GI-NEXT:    mov w8, v2.s[3]
; CHECK-GI-NEXT:    mov v16.s[1], w10
; CHECK-GI-NEXT:    udiv w8, w8, w12
; CHECK-GI-NEXT:    fmov w12, s4
; CHECK-GI-NEXT:    mov v16.s[2], w9
; CHECK-GI-NEXT:    udiv w14, w12, w13
; CHECK-GI-NEXT:    mov w12, v4.s[1]
; CHECK-GI-NEXT:    mov w13, v5.s[1]
; CHECK-GI-NEXT:    mov v16.s[3], w8
; CHECK-GI-NEXT:    mls v2.4s, v16.4s, v3.4s
; CHECK-GI-NEXT:    udiv w15, w12, w13
; CHECK-GI-NEXT:    mov w12, v4.s[2]
; CHECK-GI-NEXT:    mov w13, v5.s[2]
; CHECK-GI-NEXT:    mov v17.s[0], w14
; CHECK-GI-NEXT:    mov w14, v7.s[3]
; CHECK-GI-NEXT:    udiv w13, w12, w13
; CHECK-GI-NEXT:    mov w12, v4.s[3]
; CHECK-GI-NEXT:    mov v17.s[1], w15
; CHECK-GI-NEXT:    udiv w12, w12, w16
; CHECK-GI-NEXT:    fmov w16, s0
; CHECK-GI-NEXT:    mov v17.s[2], w13
; CHECK-GI-NEXT:    udiv w16, w16, w17
; CHECK-GI-NEXT:    mov w17, v0.s[1]
; CHECK-GI-NEXT:    mov v17.s[3], w12
; CHECK-GI-NEXT:    mls v4.4s, v17.4s, v5.4s
; CHECK-GI-NEXT:    udiv w17, w17, w18
; CHECK-GI-NEXT:    mov w18, v0.s[2]
; CHECK-GI-NEXT:    mov v18.s[0], w16
; CHECK-GI-NEXT:    udiv w18, w18, w0
; CHECK-GI-NEXT:    mov w0, v0.s[3]
; CHECK-GI-NEXT:    mov v18.s[1], w17
; CHECK-GI-NEXT:    udiv w0, w0, w1
; CHECK-GI-NEXT:    fmov w1, s6
; CHECK-GI-NEXT:    mov v18.s[2], w18
; CHECK-GI-NEXT:    udiv w1, w1, w2
; CHECK-GI-NEXT:    mov w2, v6.s[1]
; CHECK-GI-NEXT:    mov v18.s[3], w0
; CHECK-GI-NEXT:    mls v0.4s, v18.4s, v1.4s
; CHECK-GI-NEXT:    uzp1 v1.8h, v2.8h, v4.8h
; CHECK-GI-NEXT:    udiv w2, w2, w3
; CHECK-GI-NEXT:    mov w3, v6.s[2]
; CHECK-GI-NEXT:    mov v19.s[0], w1
; CHECK-GI-NEXT:    udiv w3, w3, w4
; CHECK-GI-NEXT:    mov v19.s[1], w2
; CHECK-GI-NEXT:    udiv w10, w11, w14
; CHECK-GI-NEXT:    mov v19.s[2], w3
; CHECK-GI-NEXT:    mov v19.s[3], w10
; CHECK-GI-NEXT:    mls v6.4s, v19.4s, v7.4s
; CHECK-GI-NEXT:    uzp1 v0.8h, v0.8h, v6.8h
; CHECK-GI-NEXT:    uzp1 v0.16b, v1.16b, v0.16b
; CHECK-GI-NEXT:    ret
entry:
  %s = urem <16 x i8> %d, %e
  ret <16 x i8> %s
}

define <32 x i8> @uv32i8(<32 x i8> %d, <32 x i8> %e) {
; CHECK-SD-LABEL: uv32i8:
; CHECK-SD:       // %bb.0: // %entry
; CHECK-SD-NEXT:    sub sp, sp, #304
; CHECK-SD-NEXT:    stp x29, x30, [sp, #208] // 16-byte Folded Spill
; CHECK-SD-NEXT:    stp x28, x27, [sp, #224] // 16-byte Folded Spill
; CHECK-SD-NEXT:    stp x26, x25, [sp, #240] // 16-byte Folded Spill
; CHECK-SD-NEXT:    stp x24, x23, [sp, #256] // 16-byte Folded Spill
; CHECK-SD-NEXT:    stp x22, x21, [sp, #272] // 16-byte Folded Spill
; CHECK-SD-NEXT:    stp x20, x19, [sp, #288] // 16-byte Folded Spill
; CHECK-SD-NEXT:    .cfi_def_cfa_offset 304
; CHECK-SD-NEXT:    .cfi_offset w19, -8
; CHECK-SD-NEXT:    .cfi_offset w20, -16
; CHECK-SD-NEXT:    .cfi_offset w21, -24
; CHECK-SD-NEXT:    .cfi_offset w22, -32
; CHECK-SD-NEXT:    .cfi_offset w23, -40
; CHECK-SD-NEXT:    .cfi_offset w24, -48
; CHECK-SD-NEXT:    .cfi_offset w25, -56
; CHECK-SD-NEXT:    .cfi_offset w26, -64
; CHECK-SD-NEXT:    .cfi_offset w27, -72
; CHECK-SD-NEXT:    .cfi_offset w28, -80
; CHECK-SD-NEXT:    .cfi_offset w30, -88
; CHECK-SD-NEXT:    .cfi_offset w29, -96
; CHECK-SD-NEXT:    umov w8, v2.b[1]
; CHECK-SD-NEXT:    umov w9, v0.b[1]
; CHECK-SD-NEXT:    umov w19, v3.b[7]
; CHECK-SD-NEXT:    umov w7, v1.b[7]
; CHECK-SD-NEXT:    umov w6, v3.b[8]
; CHECK-SD-NEXT:    umov w3, v1.b[8]
; CHECK-SD-NEXT:    umov w13, v3.b[0]
; CHECK-SD-NEXT:    umov w5, v3.b[1]
; CHECK-SD-NEXT:    umov w0, v1.b[1]
; CHECK-SD-NEXT:    umov w12, v3.b[2]
; CHECK-SD-NEXT:    umov w17, v3.b[3]
; CHECK-SD-NEXT:    umov w16, v1.b[3]
; CHECK-SD-NEXT:    str w8, [sp, #80] // 4-byte Folded Spill
; CHECK-SD-NEXT:    udiv w10, w9, w8
; CHECK-SD-NEXT:    umov w8, v2.b[0]
; CHECK-SD-NEXT:    str w9, [sp, #88] // 4-byte Folded Spill
; CHECK-SD-NEXT:    umov w9, v0.b[0]
; CHECK-SD-NEXT:    ldr w30, [sp, #80] // 4-byte Folded Reload
; CHECK-SD-NEXT:    umov w15, v3.b[4]
; CHECK-SD-NEXT:    umov w14, v1.b[4]
; CHECK-SD-NEXT:    umov w4, v3.b[5]
; CHECK-SD-NEXT:    umov w1, v1.b[5]
; CHECK-SD-NEXT:    umov w2, v3.b[6]
; CHECK-SD-NEXT:    umov w18, v1.b[6]
; CHECK-SD-NEXT:    str w8, [sp, #32] // 4-byte Folded Spill
; CHECK-SD-NEXT:    umov w21, v3.b[9]
; CHECK-SD-NEXT:    umov w20, v1.b[9]
; CHECK-SD-NEXT:    str w9, [sp, #40] // 4-byte Folded Spill
; CHECK-SD-NEXT:    ldr w29, [sp, #32] // 4-byte Folded Reload
; CHECK-SD-NEXT:    udiv w11, w9, w8
; CHECK-SD-NEXT:    umov w8, v2.b[2]
; CHECK-SD-NEXT:    umov w9, v0.b[2]
; CHECK-SD-NEXT:    str w10, [sp, #96] // 4-byte Folded Spill
; CHECK-SD-NEXT:    stp w8, w9, [sp, #16] // 8-byte Folded Spill
; CHECK-SD-NEXT:    udiv w10, w9, w8
; CHECK-SD-NEXT:    umov w8, v2.b[3]
; CHECK-SD-NEXT:    umov w9, v0.b[3]
; CHECK-SD-NEXT:    stp w11, w8, [sp, #48] // 8-byte Folded Spill
; CHECK-SD-NEXT:    str w10, [sp, #24] // 4-byte Folded Spill
; CHECK-SD-NEXT:    udiv w10, w9, w8
; CHECK-SD-NEXT:    umov w8, v2.b[4]
; CHECK-SD-NEXT:    str w8, [sp, #28] // 4-byte Folded Spill
; CHECK-SD-NEXT:    stp w9, w10, [sp, #56] // 8-byte Folded Spill
; CHECK-SD-NEXT:    umov w9, v0.b[4]
; CHECK-SD-NEXT:    udiv w27, w0, w5
; CHECK-SD-NEXT:    str w9, [sp, #36] // 4-byte Folded Spill
; CHECK-SD-NEXT:    udiv w10, w9, w8
; CHECK-SD-NEXT:    umov w8, v2.b[5]
; CHECK-SD-NEXT:    umov w9, v0.b[5]
; CHECK-SD-NEXT:    str w8, [sp, #76] // 4-byte Folded Spill
; CHECK-SD-NEXT:    str w9, [sp, #84] // 4-byte Folded Spill
; CHECK-SD-NEXT:    str w10, [sp, #44] // 4-byte Folded Spill
; CHECK-SD-NEXT:    udiv w10, w9, w8
; CHECK-SD-NEXT:    umov w8, v2.b[6]
; CHECK-SD-NEXT:    umov w9, v0.b[6]
; CHECK-SD-NEXT:    stp w8, w9, [sp, #64] // 8-byte Folded Spill
; CHECK-SD-NEXT:    str w10, [sp, #92] // 4-byte Folded Spill
; CHECK-SD-NEXT:    udiv w10, w9, w8
; CHECK-SD-NEXT:    umov w8, v2.b[7]
; CHECK-SD-NEXT:    umov w9, v0.b[7]
; CHECK-SD-NEXT:    stp w8, w9, [sp, #112] // 8-byte Folded Spill
; CHECK-SD-NEXT:    udiv w11, w9, w8
; CHECK-SD-NEXT:    umov w8, v2.b[8]
; CHECK-SD-NEXT:    umov w9, v0.b[8]
; CHECK-SD-NEXT:    str w10, [sp, #72] // 4-byte Folded Spill
; CHECK-SD-NEXT:    stp w8, w9, [sp, #100] // 8-byte Folded Spill
; CHECK-SD-NEXT:    udiv w10, w9, w8
; CHECK-SD-NEXT:    umov w8, v2.b[9]
; CHECK-SD-NEXT:    umov w9, v0.b[9]
; CHECK-SD-NEXT:    stp w8, w9, [sp, #136] // 8-byte Folded Spill
; CHECK-SD-NEXT:    str w10, [sp, #108] // 4-byte Folded Spill
; CHECK-SD-NEXT:    udiv w10, w9, w8
; CHECK-SD-NEXT:    umov w8, v2.b[10]
; CHECK-SD-NEXT:    umov w9, v0.b[10]
; CHECK-SD-NEXT:    stp w11, w8, [sp, #120] // 8-byte Folded Spill
; CHECK-SD-NEXT:    str w10, [sp, #144] // 4-byte Folded Spill
; CHECK-SD-NEXT:    udiv w10, w9, w8
; CHECK-SD-NEXT:    umov w8, v2.b[11]
; CHECK-SD-NEXT:    stp w9, w10, [sp, #128] // 8-byte Folded Spill
; CHECK-SD-NEXT:    umov w9, v0.b[11]
; CHECK-SD-NEXT:    udiv w25, w16, w17
; CHECK-SD-NEXT:    stp w8, w9, [sp, #172] // 8-byte Folded Spill
; CHECK-SD-NEXT:    udiv w11, w9, w8
; CHECK-SD-NEXT:    umov w8, v2.b[12]
; CHECK-SD-NEXT:    umov w9, v0.b[12]
; CHECK-SD-NEXT:    str w8, [sp, #152] // 4-byte Folded Spill
; CHECK-SD-NEXT:    str w9, [sp, #160] // 4-byte Folded Spill
; CHECK-SD-NEXT:    udiv w10, w9, w8
; CHECK-SD-NEXT:    umov w8, v2.b[13]
; CHECK-SD-NEXT:    umov w9, v0.b[13]
; CHECK-SD-NEXT:    stp w8, w9, [sp, #196] // 8-byte Folded Spill
; CHECK-SD-NEXT:    str w10, [sp, #168] // 4-byte Folded Spill
; CHECK-SD-NEXT:    udiv w10, w9, w8
; CHECK-SD-NEXT:    umov w8, v2.b[14]
; CHECK-SD-NEXT:    umov w9, v0.b[14]
; CHECK-SD-NEXT:    stp w11, w8, [sp, #180] // 8-byte Folded Spill
; CHECK-SD-NEXT:    umov w11, v1.b[2]
; CHECK-SD-NEXT:    str w10, [sp, #204] // 4-byte Folded Spill
; CHECK-SD-NEXT:    udiv w10, w9, w8
; CHECK-SD-NEXT:    umov w8, v2.b[15]
; CHECK-SD-NEXT:    str w8, [sp, #148] // 4-byte Folded Spill
; CHECK-SD-NEXT:    stp w9, w10, [sp, #188] // 8-byte Folded Spill
; CHECK-SD-NEXT:    umov w9, v0.b[15]
; CHECK-SD-NEXT:    udiv w22, w11, w12
; CHECK-SD-NEXT:    str w9, [sp, #156] // 4-byte Folded Spill
; CHECK-SD-NEXT:    udiv w10, w9, w8
; CHECK-SD-NEXT:    str w10, [sp, #164] // 4-byte Folded Spill
; CHECK-SD-NEXT:    umov w10, v1.b[0]
; CHECK-SD-NEXT:    udiv w9, w7, w19
; CHECK-SD-NEXT:    udiv w8, w3, w6
; CHECK-SD-NEXT:    udiv w23, w10, w13
; CHECK-SD-NEXT:    stp w8, w9, [sp, #8] // 8-byte Folded Spill
; CHECK-SD-NEXT:    ldr w8, [sp, #96] // 4-byte Folded Reload
; CHECK-SD-NEXT:    ldr w9, [sp, #88] // 4-byte Folded Reload
; CHECK-SD-NEXT:    msub w9, w8, w30, w9
; CHECK-SD-NEXT:    ldr w8, [sp, #48] // 4-byte Folded Reload
; CHECK-SD-NEXT:    ldr w30, [sp, #40] // 4-byte Folded Reload
; CHECK-SD-NEXT:    msub w8, w8, w29, w30
; CHECK-SD-NEXT:    ldp x29, x30, [sp, #208] // 16-byte Folded Reload
; CHECK-SD-NEXT:    fmov s0, w8
; CHECK-SD-NEXT:    msub w10, w23, w13, w10
; CHECK-SD-NEXT:    udiv w24, w14, w15
; CHECK-SD-NEXT:    msub w13, w27, w5, w0
; CHECK-SD-NEXT:    ldr w5, [sp, #16] // 4-byte Folded Reload
; CHECK-SD-NEXT:    mov v0.b[1], w9
; CHECK-SD-NEXT:    msub w9, w22, w12, w11
; CHECK-SD-NEXT:    umov w11, v1.b[10]
; CHECK-SD-NEXT:    fmov s2, w10
; CHECK-SD-NEXT:    ldp w10, w8, [sp, #20] // 8-byte Folded Reload
; CHECK-SD-NEXT:    mov v2.b[1], w13
; CHECK-SD-NEXT:    msub w8, w8, w5, w10
; CHECK-SD-NEXT:    ldr w5, [sp, #52] // 4-byte Folded Reload
; CHECK-SD-NEXT:    umov w10, v3.b[10]
; CHECK-SD-NEXT:    udiv w28, w1, w4
; CHECK-SD-NEXT:    ldp w13, w12, [sp, #56] // 8-byte Folded Reload
; CHECK-SD-NEXT:    mov v2.b[2], w9
; CHECK-SD-NEXT:    mov v0.b[2], w8
; CHECK-SD-NEXT:    msub w8, w25, w17, w16
; CHECK-SD-NEXT:    ldr w17, [sp, #28] // 4-byte Folded Reload
; CHECK-SD-NEXT:    ldr w16, [sp, #36] // 4-byte Folded Reload
; CHECK-SD-NEXT:    msub w12, w12, w5, w13
; CHECK-SD-NEXT:    ldr w13, [sp, #44] // 4-byte Folded Reload
; CHECK-SD-NEXT:    ldr w5, [sp, #136] // 4-byte Folded Reload
; CHECK-SD-NEXT:    mov v2.b[3], w8
; CHECK-SD-NEXT:    msub w8, w24, w15, w14
; CHECK-SD-NEXT:    ldr w15, [sp, #92] // 4-byte Folded Reload
; CHECK-SD-NEXT:    mov v0.b[3], w12
; CHECK-SD-NEXT:    msub w13, w13, w17, w16
; CHECK-SD-NEXT:    ldr w17, [sp, #76] // 4-byte Folded Reload
; CHECK-SD-NEXT:    udiv w26, w18, w2
; CHECK-SD-NEXT:    ldr w16, [sp, #84] // 4-byte Folded Reload
; CHECK-SD-NEXT:    umov w12, v3.b[11]
; CHECK-SD-NEXT:    msub w15, w15, w17, w16
; CHECK-SD-NEXT:    umov w14, v1.b[11]
; CHECK-SD-NEXT:    mov v2.b[4], w8
; CHECK-SD-NEXT:    msub w8, w28, w4, w1
; CHECK-SD-NEXT:    ldr w1, [sp, #64] // 4-byte Folded Reload
; CHECK-SD-NEXT:    mov v0.b[4], w13
; CHECK-SD-NEXT:    ldr w4, [sp, #100] // 4-byte Folded Reload
; CHECK-SD-NEXT:    ldp w17, w16, [sp, #68] // 8-byte Folded Reload
; CHECK-SD-NEXT:    ldp x24, x23, [sp, #256] // 16-byte Folded Reload
; CHECK-SD-NEXT:    mov v2.b[5], w8
; CHECK-SD-NEXT:    ldp x28, x27, [sp, #224] // 16-byte Folded Reload
; CHECK-SD-NEXT:    mov v0.b[5], w15
; CHECK-SD-NEXT:    msub w16, w16, w1, w17
; CHECK-SD-NEXT:    umov w15, v3.b[12]
; CHECK-SD-NEXT:    msub w8, w26, w2, w18
; CHECK-SD-NEXT:    ldr w2, [sp, #112] // 4-byte Folded Reload
; CHECK-SD-NEXT:    udiv w0, w20, w21
; CHECK-SD-NEXT:    ldp w1, w18, [sp, #116] // 8-byte Folded Reload
; CHECK-SD-NEXT:    umov w17, v1.b[12]
; CHECK-SD-NEXT:    ldp x26, x25, [sp, #240] // 16-byte Folded Reload
; CHECK-SD-NEXT:    mov v2.b[6], w8
; CHECK-SD-NEXT:    ldr w8, [sp, #12] // 4-byte Folded Reload
; CHECK-SD-NEXT:    mov v0.b[6], w16
; CHECK-SD-NEXT:    msub w18, w18, w2, w1
; CHECK-SD-NEXT:    msub w8, w8, w19, w7
; CHECK-SD-NEXT:    ldp w2, w1, [sp, #104] // 8-byte Folded Reload
; CHECK-SD-NEXT:    mov v0.b[7], w18
; CHECK-SD-NEXT:    umov w18, v3.b[13]
; CHECK-SD-NEXT:    mov v2.b[7], w8
; CHECK-SD-NEXT:    ldr w8, [sp, #8] // 4-byte Folded Reload
; CHECK-SD-NEXT:    udiv w9, w11, w10
; CHECK-SD-NEXT:    msub w1, w1, w4, w2
; CHECK-SD-NEXT:    umov w2, v1.b[13]
; CHECK-SD-NEXT:    msub w8, w8, w6, w3
; CHECK-SD-NEXT:    ldp w4, w3, [sp, #140] // 8-byte Folded Reload
; CHECK-SD-NEXT:    mov v0.b[8], w1
; CHECK-SD-NEXT:    mov v2.b[8], w8
; CHECK-SD-NEXT:    msub w8, w0, w21, w20
; CHECK-SD-NEXT:    msub w3, w3, w5, w4
; CHECK-SD-NEXT:    ldr w5, [sp, #124] // 4-byte Folded Reload
; CHECK-SD-NEXT:    ldp w4, w1, [sp, #128] // 8-byte Folded Reload
; CHECK-SD-NEXT:    udiv w13, w14, w12
; CHECK-SD-NEXT:    ldp x20, x19, [sp, #288] // 16-byte Folded Reload
; CHECK-SD-NEXT:    mov v2.b[9], w8
; CHECK-SD-NEXT:    mov v0.b[9], w3
; CHECK-SD-NEXT:    msub w8, w9, w10, w11
; CHECK-SD-NEXT:    msub w1, w1, w5, w4
; CHECK-SD-NEXT:    ldr w4, [sp, #172] // 4-byte Folded Reload
; CHECK-SD-NEXT:    umov w9, v3.b[14]
; CHECK-SD-NEXT:    ldp w3, w11, [sp, #176] // 8-byte Folded Reload
; CHECK-SD-NEXT:    umov w10, v1.b[14]
; CHECK-SD-NEXT:    ldp x22, x21, [sp, #272] // 16-byte Folded Reload
; CHECK-SD-NEXT:    mov v2.b[10], w8
; CHECK-SD-NEXT:    mov v0.b[10], w1
; CHECK-SD-NEXT:    ldr w1, [sp, #152] // 4-byte Folded Reload
; CHECK-SD-NEXT:    msub w11, w11, w4, w3
; CHECK-SD-NEXT:    udiv w16, w17, w15
; CHECK-SD-NEXT:    msub w8, w13, w12, w14
; CHECK-SD-NEXT:    ldr w13, [sp, #168] // 4-byte Folded Reload
; CHECK-SD-NEXT:    ldr w14, [sp, #160] // 4-byte Folded Reload
; CHECK-SD-NEXT:    mov v0.b[11], w11
; CHECK-SD-NEXT:    umov w11, v3.b[15]
; CHECK-SD-NEXT:    msub w13, w13, w1, w14
; CHECK-SD-NEXT:    umov w14, v1.b[15]
; CHECK-SD-NEXT:    mov v2.b[11], w8
; CHECK-SD-NEXT:    mov v0.b[12], w13
; CHECK-SD-NEXT:    udiv w0, w2, w18
; CHECK-SD-NEXT:    msub w8, w16, w15, w17
; CHECK-SD-NEXT:    ldr w17, [sp, #196] // 4-byte Folded Reload
; CHECK-SD-NEXT:    ldp w16, w15, [sp, #200] // 8-byte Folded Reload
; CHECK-SD-NEXT:    mov v2.b[12], w8
; CHECK-SD-NEXT:    msub w15, w15, w17, w16
; CHECK-SD-NEXT:    ldp w17, w16, [sp, #188] // 8-byte Folded Reload
; CHECK-SD-NEXT:    mov v0.b[13], w15
; CHECK-SD-NEXT:    udiv w12, w10, w9
; CHECK-SD-NEXT:    msub w8, w0, w18, w2
; CHECK-SD-NEXT:    ldr w18, [sp, #184] // 4-byte Folded Reload
; CHECK-SD-NEXT:    msub w16, w16, w18, w17
; CHECK-SD-NEXT:    mov v2.b[13], w8
; CHECK-SD-NEXT:    mov v0.b[14], w16
; CHECK-SD-NEXT:    udiv w13, w14, w11
; CHECK-SD-NEXT:    msub w8, w12, w9, w10
; CHECK-SD-NEXT:    ldr w9, [sp, #164] // 4-byte Folded Reload
; CHECK-SD-NEXT:    ldr w12, [sp, #148] // 4-byte Folded Reload
; CHECK-SD-NEXT:    ldr w10, [sp, #156] // 4-byte Folded Reload
; CHECK-SD-NEXT:    mov v2.b[14], w8
; CHECK-SD-NEXT:    msub w9, w9, w12, w10
; CHECK-SD-NEXT:    mov v0.b[15], w9
; CHECK-SD-NEXT:    msub w8, w13, w11, w14
; CHECK-SD-NEXT:    mov v2.b[15], w8
; CHECK-SD-NEXT:    mov v1.16b, v2.16b
; CHECK-SD-NEXT:    add sp, sp, #304
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: uv32i8:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    stp x29, x30, [sp, #-96]! // 16-byte Folded Spill
; CHECK-GI-NEXT:    stp x28, x27, [sp, #16] // 16-byte Folded Spill
; CHECK-GI-NEXT:    stp x26, x25, [sp, #32] // 16-byte Folded Spill
; CHECK-GI-NEXT:    stp x24, x23, [sp, #48] // 16-byte Folded Spill
; CHECK-GI-NEXT:    stp x22, x21, [sp, #64] // 16-byte Folded Spill
; CHECK-GI-NEXT:    stp x20, x19, [sp, #80] // 16-byte Folded Spill
; CHECK-GI-NEXT:    .cfi_def_cfa_offset 96
; CHECK-GI-NEXT:    .cfi_offset w19, -8
; CHECK-GI-NEXT:    .cfi_offset w20, -16
; CHECK-GI-NEXT:    .cfi_offset w21, -24
; CHECK-GI-NEXT:    .cfi_offset w22, -32
; CHECK-GI-NEXT:    .cfi_offset w23, -40
; CHECK-GI-NEXT:    .cfi_offset w24, -48
; CHECK-GI-NEXT:    .cfi_offset w25, -56
; CHECK-GI-NEXT:    .cfi_offset w26, -64
; CHECK-GI-NEXT:    .cfi_offset w27, -72
; CHECK-GI-NEXT:    .cfi_offset w28, -80
; CHECK-GI-NEXT:    .cfi_offset w30, -88
; CHECK-GI-NEXT:    .cfi_offset w29, -96
; CHECK-GI-NEXT:    ushll v4.8h, v0.8b, #0
; CHECK-GI-NEXT:    ushll v5.8h, v2.8b, #0
; CHECK-GI-NEXT:    ushll v16.8h, v1.8b, #0
; CHECK-GI-NEXT:    ushll v17.8h, v3.8b, #0
; CHECK-GI-NEXT:    ushll v6.4s, v4.4h, #0
; CHECK-GI-NEXT:    ushll v7.4s, v5.4h, #0
; CHECK-GI-NEXT:    ushll2 v4.4s, v4.8h, #0
; CHECK-GI-NEXT:    ushll2 v5.4s, v5.8h, #0
; CHECK-GI-NEXT:    ushll v18.4s, v16.4h, #0
; CHECK-GI-NEXT:    ushll v19.4s, v17.4h, #0
; CHECK-GI-NEXT:    ushll2 v16.4s, v16.8h, #0
; CHECK-GI-NEXT:    ushll2 v17.4s, v17.8h, #0
; CHECK-GI-NEXT:    fmov w8, s6
; CHECK-GI-NEXT:    fmov w9, s7
; CHECK-GI-NEXT:    mov w12, v7.s[3]
; CHECK-GI-NEXT:    fmov w13, s5
; CHECK-GI-NEXT:    mov w16, v5.s[3]
; CHECK-GI-NEXT:    fmov w6, s19
; CHECK-GI-NEXT:    mov w7, v19.s[3]
; CHECK-GI-NEXT:    fmov w21, s17
; CHECK-GI-NEXT:    mov w23, v17.s[3]
; CHECK-GI-NEXT:    udiv w11, w8, w9
; CHECK-GI-NEXT:    mov w8, v6.s[1]
; CHECK-GI-NEXT:    mov w9, v7.s[1]
; CHECK-GI-NEXT:    udiv w10, w8, w9
; CHECK-GI-NEXT:    mov w8, v6.s[2]
; CHECK-GI-NEXT:    mov w9, v7.s[2]
; CHECK-GI-NEXT:    mov v20.s[0], w11
; CHECK-GI-NEXT:    udiv w9, w8, w9
; CHECK-GI-NEXT:    mov w8, v6.s[3]
; CHECK-GI-NEXT:    ushll2 v6.8h, v0.16b, #0
; CHECK-GI-NEXT:    mov v20.s[1], w10
; CHECK-GI-NEXT:    ushll v0.8h, v0.8b, #0
; CHECK-GI-NEXT:    ushll v28.4s, v0.4h, #0
; CHECK-GI-NEXT:    ushll2 v0.4s, v0.8h, #0
; CHECK-GI-NEXT:    udiv w8, w8, w12
; CHECK-GI-NEXT:    fmov w12, s4
; CHECK-GI-NEXT:    mov v20.s[2], w9
; CHECK-GI-NEXT:    udiv w15, w12, w13
; CHECK-GI-NEXT:    mov w12, v4.s[1]
; CHECK-GI-NEXT:    mov w13, v5.s[1]
; CHECK-GI-NEXT:    mov v20.s[3], w8
; CHECK-GI-NEXT:    udiv w14, w12, w13
; CHECK-GI-NEXT:    mov w12, v4.s[2]
; CHECK-GI-NEXT:    mov w13, v5.s[2]
; CHECK-GI-NEXT:    ushll v5.4s, v6.4h, #0
; CHECK-GI-NEXT:    mov v21.s[0], w15
; CHECK-GI-NEXT:    udiv w13, w12, w13
; CHECK-GI-NEXT:    mov w12, v4.s[3]
; CHECK-GI-NEXT:    ushll2 v4.8h, v2.16b, #0
; CHECK-GI-NEXT:    mov v21.s[1], w14
; CHECK-GI-NEXT:    ushll v2.8h, v2.8b, #0
; CHECK-GI-NEXT:    ushll v7.4s, v4.4h, #0
; CHECK-GI-NEXT:    ushll v30.4s, v2.4h, #0
; CHECK-GI-NEXT:    ushll2 v2.4s, v2.8h, #0
; CHECK-GI-NEXT:    fmov w17, s7
; CHECK-GI-NEXT:    mls v28.4s, v20.4s, v30.4s
; CHECK-GI-NEXT:    udiv w12, w12, w16
; CHECK-GI-NEXT:    fmov w16, s5
; CHECK-GI-NEXT:    mov v21.s[2], w13
; CHECK-GI-NEXT:    udiv w1, w16, w17
; CHECK-GI-NEXT:    mov w16, v5.s[1]
; CHECK-GI-NEXT:    mov w17, v7.s[1]
; CHECK-GI-NEXT:    mov v21.s[3], w12
; CHECK-GI-NEXT:    mls v0.4s, v21.4s, v2.4s
; CHECK-GI-NEXT:    udiv w0, w16, w17
; CHECK-GI-NEXT:    mov w16, v5.s[2]
; CHECK-GI-NEXT:    mov w17, v7.s[2]
; CHECK-GI-NEXT:    mov v22.s[0], w1
; CHECK-GI-NEXT:    uzp1 v0.8h, v28.8h, v0.8h
; CHECK-GI-NEXT:    udiv w18, w16, w17
; CHECK-GI-NEXT:    mov w16, v5.s[3]
; CHECK-GI-NEXT:    mov w17, v7.s[3]
; CHECK-GI-NEXT:    ushll2 v5.4s, v6.8h, #0
; CHECK-GI-NEXT:    ushll2 v7.4s, v4.8h, #0
; CHECK-GI-NEXT:    mov v22.s[1], w0
; CHECK-GI-NEXT:    ushll v6.4s, v6.4h, #0
; CHECK-GI-NEXT:    ushll v4.4s, v4.4h, #0
; CHECK-GI-NEXT:    fmov w2, s7
; CHECK-GI-NEXT:    mov w4, v7.s[3]
; CHECK-GI-NEXT:    udiv w16, w16, w17
; CHECK-GI-NEXT:    fmov w17, s5
; CHECK-GI-NEXT:    mov v22.s[2], w18
; CHECK-GI-NEXT:    udiv w5, w17, w2
; CHECK-GI-NEXT:    mov w17, v5.s[1]
; CHECK-GI-NEXT:    mov w2, v7.s[1]
; CHECK-GI-NEXT:    mov v22.s[3], w16
; CHECK-GI-NEXT:    mls v6.4s, v22.4s, v4.4s
; CHECK-GI-NEXT:    udiv w3, w17, w2
; CHECK-GI-NEXT:    mov w17, v5.s[2]
; CHECK-GI-NEXT:    mov w2, v7.s[2]
; CHECK-GI-NEXT:    mov v23.s[0], w5
; CHECK-GI-NEXT:    udiv w2, w17, w2
; CHECK-GI-NEXT:    mov w17, v5.s[3]
; CHECK-GI-NEXT:    mov v23.s[1], w3
; CHECK-GI-NEXT:    udiv w17, w17, w4
; CHECK-GI-NEXT:    fmov w4, s18
; CHECK-GI-NEXT:    mov v23.s[2], w2
; CHECK-GI-NEXT:    udiv w20, w4, w6
; CHECK-GI-NEXT:    mov w4, v18.s[1]
; CHECK-GI-NEXT:    mov w6, v19.s[1]
; CHECK-GI-NEXT:    mov v23.s[3], w17
; CHECK-GI-NEXT:    mls v5.4s, v23.4s, v7.4s
; CHECK-GI-NEXT:    udiv w19, w4, w6
; CHECK-GI-NEXT:    mov w4, v18.s[2]
; CHECK-GI-NEXT:    mov w6, v19.s[2]
; CHECK-GI-NEXT:    mov v24.s[0], w20
; CHECK-GI-NEXT:    uzp1 v2.8h, v6.8h, v5.8h
; CHECK-GI-NEXT:    uzp1 v0.16b, v0.16b, v2.16b
; CHECK-GI-NEXT:    udiv w6, w4, w6
; CHECK-GI-NEXT:    mov w4, v18.s[3]
; CHECK-GI-NEXT:    mov v24.s[1], w19
; CHECK-GI-NEXT:    ldp x20, x19, [sp, #80] // 16-byte Folded Reload
; CHECK-GI-NEXT:    udiv w4, w4, w7
; CHECK-GI-NEXT:    fmov w7, s16
; CHECK-GI-NEXT:    mov v24.s[2], w6
; CHECK-GI-NEXT:    udiv w24, w7, w21
; CHECK-GI-NEXT:    mov w7, v16.s[1]
; CHECK-GI-NEXT:    mov w21, v17.s[1]
; CHECK-GI-NEXT:    mov v24.s[3], w4
; CHECK-GI-NEXT:    udiv w22, w7, w21
; CHECK-GI-NEXT:    mov w7, v16.s[2]
; CHECK-GI-NEXT:    mov w21, v17.s[2]
; CHECK-GI-NEXT:    ushll2 v17.8h, v1.16b, #0
; CHECK-GI-NEXT:    mov v25.s[0], w24
; CHECK-GI-NEXT:    ushll v1.8h, v1.8b, #0
; CHECK-GI-NEXT:    ushll v18.4s, v17.4h, #0
; CHECK-GI-NEXT:    ushll v29.4s, v1.4h, #0
; CHECK-GI-NEXT:    ushll2 v1.4s, v1.8h, #0
; CHECK-GI-NEXT:    udiv w21, w7, w21
; CHECK-GI-NEXT:    mov w7, v16.s[3]
; CHECK-GI-NEXT:    ushll2 v16.8h, v3.16b, #0
; CHECK-GI-NEXT:    mov v25.s[1], w22
; CHECK-GI-NEXT:    ushll v3.8h, v3.8b, #0
; CHECK-GI-NEXT:    ushll v19.4s, v16.4h, #0
; CHECK-GI-NEXT:    ushll v31.4s, v3.4h, #0
; CHECK-GI-NEXT:    ushll2 v3.4s, v3.8h, #0
; CHECK-GI-NEXT:    fmov w25, s19
; CHECK-GI-NEXT:    mov w26, v19.s[1]
; CHECK-GI-NEXT:    mov w27, v19.s[2]
; CHECK-GI-NEXT:    mov w28, v19.s[3]
; CHECK-GI-NEXT:    ushll2 v19.4s, v16.8h, #0
; CHECK-GI-NEXT:    ushll v16.4s, v16.4h, #0
; CHECK-GI-NEXT:    udiv w7, w7, w23
; CHECK-GI-NEXT:    fmov w23, s18
; CHECK-GI-NEXT:    mov v25.s[2], w21
; CHECK-GI-NEXT:    mls v29.4s, v24.4s, v31.4s
; CHECK-GI-NEXT:    ldp x22, x21, [sp, #64] // 16-byte Folded Reload
; CHECK-GI-NEXT:    fmov w29, s19
; CHECK-GI-NEXT:    mov w30, v19.s[1]
; CHECK-GI-NEXT:    mov w15, v19.s[2]
; CHECK-GI-NEXT:    udiv w25, w23, w25
; CHECK-GI-NEXT:    mov w23, v18.s[1]
; CHECK-GI-NEXT:    mov v25.s[3], w7
; CHECK-GI-NEXT:    mls v1.4s, v25.4s, v3.4s
; CHECK-GI-NEXT:    udiv w26, w23, w26
; CHECK-GI-NEXT:    mov w23, v18.s[2]
; CHECK-GI-NEXT:    mov v26.s[0], w25
; CHECK-GI-NEXT:    uzp1 v1.8h, v29.8h, v1.8h
; CHECK-GI-NEXT:    udiv w27, w23, w27
; CHECK-GI-NEXT:    mov w23, v18.s[3]
; CHECK-GI-NEXT:    ushll2 v18.4s, v17.8h, #0
; CHECK-GI-NEXT:    mov v26.s[1], w26
; CHECK-GI-NEXT:    ushll v17.4s, v17.4h, #0
; CHECK-GI-NEXT:    ldp x26, x25, [sp, #32] // 16-byte Folded Reload
; CHECK-GI-NEXT:    mov w11, v18.s[2]
; CHECK-GI-NEXT:    mov w9, v18.s[3]
; CHECK-GI-NEXT:    udiv w23, w23, w28
; CHECK-GI-NEXT:    fmov w28, s18
; CHECK-GI-NEXT:    mov v26.s[2], w27
; CHECK-GI-NEXT:    udiv w28, w28, w29
; CHECK-GI-NEXT:    mov w29, v18.s[1]
; CHECK-GI-NEXT:    mov v26.s[3], w23
; CHECK-GI-NEXT:    ldp x24, x23, [sp, #48] // 16-byte Folded Reload
; CHECK-GI-NEXT:    mls v17.4s, v26.4s, v16.4s
; CHECK-GI-NEXT:    udiv w29, w29, w30
; CHECK-GI-NEXT:    mov v27.s[0], w28
; CHECK-GI-NEXT:    ldp x28, x27, [sp, #16] // 16-byte Folded Reload
; CHECK-GI-NEXT:    udiv w10, w11, w15
; CHECK-GI-NEXT:    mov w11, v19.s[3]
; CHECK-GI-NEXT:    mov v27.s[1], w29
; CHECK-GI-NEXT:    udiv w8, w9, w11
; CHECK-GI-NEXT:    mov v27.s[2], w10
; CHECK-GI-NEXT:    mov v27.s[3], w8
; CHECK-GI-NEXT:    mls v18.4s, v27.4s, v19.4s
; CHECK-GI-NEXT:    uzp1 v3.8h, v17.8h, v18.8h
; CHECK-GI-NEXT:    uzp1 v1.16b, v1.16b, v3.16b
; CHECK-GI-NEXT:    ldp x29, x30, [sp], #96 // 16-byte Folded Reload
; CHECK-GI-NEXT:    ret
entry:
  %s = urem <32 x i8> %d, %e
  ret <32 x i8> %s
}

; Signed remainder (srem) of two <2 x i16> vectors.
; Both expected outputs sign-extend the lanes inside 32-bit elements with a
; shl/sshr #16 pair and issue one scalar sdiv per lane. The run without
; -global-isel finishes each lane with a scalar msub, while the -global-isel
; run rebuilds a quotient vector and applies a single vector mls.
define <2 x i16> @sv2i16(<2 x i16> %d, <2 x i16> %e) {
; CHECK-SD-LABEL: sv2i16:
; CHECK-SD:       // %bb.0: // %entry
; CHECK-SD-NEXT:    shl v0.2s, v0.2s, #16
; CHECK-SD-NEXT:    shl v1.2s, v1.2s, #16
; CHECK-SD-NEXT:    sshr v0.2s, v0.2s, #16
; CHECK-SD-NEXT:    sshr v1.2s, v1.2s, #16
; CHECK-SD-NEXT:    fmov w8, s1
; CHECK-SD-NEXT:    fmov w9, s0
; CHECK-SD-NEXT:    mov w11, v1.s[1]
; CHECK-SD-NEXT:    mov w12, v0.s[1]
; CHECK-SD-NEXT:    sdiv w10, w9, w8
; CHECK-SD-NEXT:    sdiv w13, w12, w11
; CHECK-SD-NEXT:    msub w8, w10, w8, w9
; CHECK-SD-NEXT:    fmov s0, w8
; CHECK-SD-NEXT:    msub w9, w13, w11, w12
; CHECK-SD-NEXT:    mov v0.s[1], w9
; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 killed $q0
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: sv2i16:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    shl v0.2s, v0.2s, #16
; CHECK-GI-NEXT:    shl v1.2s, v1.2s, #16
; CHECK-GI-NEXT:    sshr v0.2s, v0.2s, #16
; CHECK-GI-NEXT:    sshr v1.2s, v1.2s, #16
; CHECK-GI-NEXT:    fmov w8, s0
; CHECK-GI-NEXT:    fmov w9, s1
; CHECK-GI-NEXT:    mov w10, v1.s[1]
; CHECK-GI-NEXT:    sdiv w8, w8, w9
; CHECK-GI-NEXT:    mov w9, v0.s[1]
; CHECK-GI-NEXT:    sdiv w9, w9, w10
; CHECK-GI-NEXT:    mov v2.s[0], w8
; CHECK-GI-NEXT:    mov v2.s[1], w9
; CHECK-GI-NEXT:    mls v0.2s, v2.2s, v1.2s
; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
; CHECK-GI-NEXT:    ret
entry:
  %s = srem <2 x i16> %d, %e
  ret <2 x i16> %s
}

; Signed remainder (srem) of two <3 x i16> vectors (a non-power-of-two lane
; count). Both expected outputs are fully scalarised. Each of the three lanes
; is extracted with smov, divided with sdiv, turned into a remainder with
; msub, and inserted back into h lanes 0 to 2 of v0.
define <3 x i16> @sv3i16(<3 x i16> %d, <3 x i16> %e) {
; CHECK-SD-LABEL: sv3i16:
; CHECK-SD:       // %bb.0: // %entry
; CHECK-SD-NEXT:    // kill: def $d1 killed $d1 def $q1
; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
; CHECK-SD-NEXT:    smov w11, v1.h[0]
; CHECK-SD-NEXT:    smov w12, v0.h[0]
; CHECK-SD-NEXT:    smov w8, v1.h[1]
; CHECK-SD-NEXT:    smov w9, v0.h[1]
; CHECK-SD-NEXT:    smov w14, v1.h[2]
; CHECK-SD-NEXT:    smov w15, v0.h[2]
; CHECK-SD-NEXT:    sdiv w13, w12, w11
; CHECK-SD-NEXT:    sdiv w10, w9, w8
; CHECK-SD-NEXT:    msub w11, w13, w11, w12
; CHECK-SD-NEXT:    fmov s0, w11
; CHECK-SD-NEXT:    sdiv w16, w15, w14
; CHECK-SD-NEXT:    msub w8, w10, w8, w9
; CHECK-SD-NEXT:    mov v0.h[1], w8
; CHECK-SD-NEXT:    msub w8, w16, w14, w15
; CHECK-SD-NEXT:    mov v0.h[2], w8
; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 killed $q0
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: sv3i16:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 def $q1
; CHECK-GI-NEXT:    smov w8, v0.h[0]
; CHECK-GI-NEXT:    smov w9, v1.h[0]
; CHECK-GI-NEXT:    smov w11, v0.h[1]
; CHECK-GI-NEXT:    smov w12, v1.h[1]
; CHECK-GI-NEXT:    smov w14, v0.h[2]
; CHECK-GI-NEXT:    smov w15, v1.h[2]
; CHECK-GI-NEXT:    sdiv w10, w8, w9
; CHECK-GI-NEXT:    sdiv w13, w11, w12
; CHECK-GI-NEXT:    msub w8, w10, w9, w8
; CHECK-GI-NEXT:    fmov s0, w8
; CHECK-GI-NEXT:    sdiv w16, w14, w15
; CHECK-GI-NEXT:    msub w9, w13, w12, w11
; CHECK-GI-NEXT:    mov v0.h[1], w9
; CHECK-GI-NEXT:    msub w8, w16, w15, w14
; CHECK-GI-NEXT:    mov v0.h[2], w8
; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
; CHECK-GI-NEXT:    ret
entry:
  %s = srem <3 x i16> %d, %e
  ret <3 x i16> %s
}

; Signed remainder (srem) of two <4 x i16> vectors.
; The run without -global-isel is expected to scalarise completely, using
; smov per lane followed by sdiv and msub, then lane inserts into v0.
; The -global-isel run is expected to widen both operands to 4s with sshll,
; do four scalar sdiv, form the remainder with one vector mls, and narrow the
; result back with xtn.
define <4 x i16> @sv4i16(<4 x i16> %d, <4 x i16> %e) {
; CHECK-SD-LABEL: sv4i16:
; CHECK-SD:       // %bb.0: // %entry
; CHECK-SD-NEXT:    // kill: def $d1 killed $d1 def $q1
; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
; CHECK-SD-NEXT:    smov w11, v1.h[0]
; CHECK-SD-NEXT:    smov w12, v0.h[0]
; CHECK-SD-NEXT:    smov w8, v1.h[1]
; CHECK-SD-NEXT:    smov w9, v0.h[1]
; CHECK-SD-NEXT:    smov w14, v1.h[2]
; CHECK-SD-NEXT:    smov w15, v0.h[2]
; CHECK-SD-NEXT:    smov w17, v1.h[3]
; CHECK-SD-NEXT:    smov w18, v0.h[3]
; CHECK-SD-NEXT:    sdiv w13, w12, w11
; CHECK-SD-NEXT:    sdiv w10, w9, w8
; CHECK-SD-NEXT:    msub w11, w13, w11, w12
; CHECK-SD-NEXT:    fmov s0, w11
; CHECK-SD-NEXT:    sdiv w16, w15, w14
; CHECK-SD-NEXT:    msub w8, w10, w8, w9
; CHECK-SD-NEXT:    mov v0.h[1], w8
; CHECK-SD-NEXT:    sdiv w9, w18, w17
; CHECK-SD-NEXT:    msub w8, w16, w14, w15
; CHECK-SD-NEXT:    mov v0.h[2], w8
; CHECK-SD-NEXT:    msub w8, w9, w17, w18
; CHECK-SD-NEXT:    mov v0.h[3], w8
; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 killed $q0
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: sv4i16:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    sshll v0.4s, v0.4h, #0
; CHECK-GI-NEXT:    sshll v1.4s, v1.4h, #0
; CHECK-GI-NEXT:    fmov w8, s0
; CHECK-GI-NEXT:    fmov w9, s1
; CHECK-GI-NEXT:    mov w10, v1.s[1]
; CHECK-GI-NEXT:    mov w11, v1.s[2]
; CHECK-GI-NEXT:    mov w12, v1.s[3]
; CHECK-GI-NEXT:    sdiv w8, w8, w9
; CHECK-GI-NEXT:    mov w9, v0.s[1]
; CHECK-GI-NEXT:    sdiv w9, w9, w10
; CHECK-GI-NEXT:    mov w10, v0.s[2]
; CHECK-GI-NEXT:    mov v2.s[0], w8
; CHECK-GI-NEXT:    sdiv w10, w10, w11
; CHECK-GI-NEXT:    mov w11, v0.s[3]
; CHECK-GI-NEXT:    mov v2.s[1], w9
; CHECK-GI-NEXT:    sdiv w8, w11, w12
; CHECK-GI-NEXT:    mov v2.s[2], w10
; CHECK-GI-NEXT:    mov v2.s[3], w8
; CHECK-GI-NEXT:    mls v0.4s, v2.4s, v1.4s
; CHECK-GI-NEXT:    xtn v0.4h, v0.4s
; CHECK-GI-NEXT:    ret
entry:
  %s = srem <4 x i16> %d, %e
  ret <4 x i16> %s
}

; Signed remainder (srem) of two <8 x i16> vectors (one full 128-bit register
; per operand).
; The run without -global-isel is expected to scalarise all eight lanes with
; smov, sdiv and msub, assemble the result in v2 and copy it to v0.
; The -global-isel run is expected to split each operand into two 4s halves
; with sshll/sshll2, do eight scalar sdiv, apply one vector mls per half, and
; repack the halves to 8h with uzp1.
define <8 x i16> @sv8i16(<8 x i16> %d, <8 x i16> %e) {
; CHECK-SD-LABEL: sv8i16:
; CHECK-SD:       // %bb.0: // %entry
; CHECK-SD-NEXT:    smov w11, v1.h[0]
; CHECK-SD-NEXT:    smov w12, v0.h[0]
; CHECK-SD-NEXT:    smov w8, v1.h[1]
; CHECK-SD-NEXT:    smov w9, v0.h[1]
; CHECK-SD-NEXT:    smov w14, v1.h[2]
; CHECK-SD-NEXT:    smov w15, v0.h[2]
; CHECK-SD-NEXT:    smov w17, v1.h[3]
; CHECK-SD-NEXT:    smov w18, v0.h[3]
; CHECK-SD-NEXT:    smov w1, v1.h[4]
; CHECK-SD-NEXT:    smov w2, v0.h[4]
; CHECK-SD-NEXT:    smov w4, v1.h[5]
; CHECK-SD-NEXT:    smov w5, v0.h[5]
; CHECK-SD-NEXT:    sdiv w13, w12, w11
; CHECK-SD-NEXT:    sdiv w10, w9, w8
; CHECK-SD-NEXT:    msub w11, w13, w11, w12
; CHECK-SD-NEXT:    smov w13, v1.h[7]
; CHECK-SD-NEXT:    fmov s2, w11
; CHECK-SD-NEXT:    smov w11, v0.h[6]
; CHECK-SD-NEXT:    sdiv w16, w15, w14
; CHECK-SD-NEXT:    msub w8, w10, w8, w9
; CHECK-SD-NEXT:    smov w10, v1.h[6]
; CHECK-SD-NEXT:    mov v2.h[1], w8
; CHECK-SD-NEXT:    sdiv w0, w18, w17
; CHECK-SD-NEXT:    msub w8, w16, w14, w15
; CHECK-SD-NEXT:    smov w14, v0.h[7]
; CHECK-SD-NEXT:    mov v2.h[2], w8
; CHECK-SD-NEXT:    sdiv w3, w2, w1
; CHECK-SD-NEXT:    msub w8, w0, w17, w18
; CHECK-SD-NEXT:    mov v2.h[3], w8
; CHECK-SD-NEXT:    sdiv w9, w5, w4
; CHECK-SD-NEXT:    msub w8, w3, w1, w2
; CHECK-SD-NEXT:    mov v2.h[4], w8
; CHECK-SD-NEXT:    sdiv w12, w11, w10
; CHECK-SD-NEXT:    msub w8, w9, w4, w5
; CHECK-SD-NEXT:    mov v2.h[5], w8
; CHECK-SD-NEXT:    sdiv w9, w14, w13
; CHECK-SD-NEXT:    msub w8, w12, w10, w11
; CHECK-SD-NEXT:    mov v2.h[6], w8
; CHECK-SD-NEXT:    msub w8, w9, w13, w14
; CHECK-SD-NEXT:    mov v2.h[7], w8
; CHECK-SD-NEXT:    mov v0.16b, v2.16b
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: sv8i16:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    sshll v2.4s, v0.4h, #0
; CHECK-GI-NEXT:    sshll v3.4s, v1.4h, #0
; CHECK-GI-NEXT:    sshll2 v0.4s, v0.8h, #0
; CHECK-GI-NEXT:    sshll2 v1.4s, v1.8h, #0
; CHECK-GI-NEXT:    fmov w8, s2
; CHECK-GI-NEXT:    fmov w9, s3
; CHECK-GI-NEXT:    mov w10, v3.s[1]
; CHECK-GI-NEXT:    mov w11, v3.s[2]
; CHECK-GI-NEXT:    mov w12, v3.s[3]
; CHECK-GI-NEXT:    fmov w13, s1
; CHECK-GI-NEXT:    mov w14, v1.s[1]
; CHECK-GI-NEXT:    mov w15, v1.s[2]
; CHECK-GI-NEXT:    sdiv w8, w8, w9
; CHECK-GI-NEXT:    mov w9, v2.s[1]
; CHECK-GI-NEXT:    sdiv w9, w9, w10
; CHECK-GI-NEXT:    mov w10, v2.s[2]
; CHECK-GI-NEXT:    mov v4.s[0], w8
; CHECK-GI-NEXT:    mov w8, v0.s[3]
; CHECK-GI-NEXT:    sdiv w10, w10, w11
; CHECK-GI-NEXT:    mov w11, v2.s[3]
; CHECK-GI-NEXT:    mov v4.s[1], w9
; CHECK-GI-NEXT:    sdiv w11, w11, w12
; CHECK-GI-NEXT:    fmov w12, s0
; CHECK-GI-NEXT:    mov v4.s[2], w10
; CHECK-GI-NEXT:    sdiv w12, w12, w13
; CHECK-GI-NEXT:    mov w13, v0.s[1]
; CHECK-GI-NEXT:    mov v4.s[3], w11
; CHECK-GI-NEXT:    mls v2.4s, v4.4s, v3.4s
; CHECK-GI-NEXT:    sdiv w13, w13, w14
; CHECK-GI-NEXT:    mov w14, v0.s[2]
; CHECK-GI-NEXT:    mov v5.s[0], w12
; CHECK-GI-NEXT:    mov w12, v1.s[3]
; CHECK-GI-NEXT:    sdiv w14, w14, w15
; CHECK-GI-NEXT:    mov v5.s[1], w13
; CHECK-GI-NEXT:    sdiv w8, w8, w12
; CHECK-GI-NEXT:    mov v5.s[2], w14
; CHECK-GI-NEXT:    mov v5.s[3], w8
; CHECK-GI-NEXT:    mls v0.4s, v5.4s, v1.4s
; CHECK-GI-NEXT:    uzp1 v0.8h, v2.8h, v0.8h
; CHECK-GI-NEXT:    ret
entry:
  %s = srem <8 x i16> %d, %e
  ret <8 x i16> %s
}

; Signed remainder (srem) of two <16 x i16> vectors (two 128-bit registers per
; operand, so the dividend arrives in v0/v1 and the divisor in v2/v3).
; The run without -global-isel is expected to scalarise all sixteen lanes with
; smov, sdiv and msub. That output sets up a 160-byte stack frame, saves
; x19 to x30, and spills several intermediate w registers to the stack.
; The -global-isel run is expected to need no stack frame. It widens each
; half with sshll/sshll2, does sixteen scalar sdiv, applies four vector mls,
; and repacks each result register with uzp1.
define <16 x i16> @sv16i16(<16 x i16> %d, <16 x i16> %e) {
; CHECK-SD-LABEL: sv16i16:
; CHECK-SD:       // %bb.0: // %entry
; CHECK-SD-NEXT:    sub sp, sp, #160
; CHECK-SD-NEXT:    stp x29, x30, [sp, #64] // 16-byte Folded Spill
; CHECK-SD-NEXT:    stp x28, x27, [sp, #80] // 16-byte Folded Spill
; CHECK-SD-NEXT:    stp x26, x25, [sp, #96] // 16-byte Folded Spill
; CHECK-SD-NEXT:    stp x24, x23, [sp, #112] // 16-byte Folded Spill
; CHECK-SD-NEXT:    stp x22, x21, [sp, #128] // 16-byte Folded Spill
; CHECK-SD-NEXT:    stp x20, x19, [sp, #144] // 16-byte Folded Spill
; CHECK-SD-NEXT:    .cfi_def_cfa_offset 160
; CHECK-SD-NEXT:    .cfi_offset w19, -8
; CHECK-SD-NEXT:    .cfi_offset w20, -16
; CHECK-SD-NEXT:    .cfi_offset w21, -24
; CHECK-SD-NEXT:    .cfi_offset w22, -32
; CHECK-SD-NEXT:    .cfi_offset w23, -40
; CHECK-SD-NEXT:    .cfi_offset w24, -48
; CHECK-SD-NEXT:    .cfi_offset w25, -56
; CHECK-SD-NEXT:    .cfi_offset w26, -64
; CHECK-SD-NEXT:    .cfi_offset w27, -72
; CHECK-SD-NEXT:    .cfi_offset w28, -80
; CHECK-SD-NEXT:    .cfi_offset w30, -88
; CHECK-SD-NEXT:    .cfi_offset w29, -96
; CHECK-SD-NEXT:    smov w8, v2.h[1]
; CHECK-SD-NEXT:    smov w9, v0.h[1]
; CHECK-SD-NEXT:    smov w19, v2.h[2]
; CHECK-SD-NEXT:    smov w22, v0.h[2]
; CHECK-SD-NEXT:    smov w1, v2.h[0]
; CHECK-SD-NEXT:    smov w3, v0.h[0]
; CHECK-SD-NEXT:    smov w7, v2.h[3]
; CHECK-SD-NEXT:    smov w18, v0.h[3]
; CHECK-SD-NEXT:    smov w4, v0.h[6]
; CHECK-SD-NEXT:    smov w0, v2.h[4]
; CHECK-SD-NEXT:    smov w5, v0.h[4]
; CHECK-SD-NEXT:    smov w2, v2.h[7]
; CHECK-SD-NEXT:    str w8, [sp, #52] // 4-byte Folded Spill
; CHECK-SD-NEXT:    smov w6, v0.h[7]
; CHECK-SD-NEXT:    smov w27, v3.h[0]
; CHECK-SD-NEXT:    str w9, [sp, #44] // 4-byte Folded Spill
; CHECK-SD-NEXT:    sdiv w9, w9, w8
; CHECK-SD-NEXT:    smov w28, v1.h[0]
; CHECK-SD-NEXT:    smov w24, v3.h[1]
; CHECK-SD-NEXT:    smov w25, v1.h[1]
; CHECK-SD-NEXT:    ldr w21, [sp, #52] // 4-byte Folded Reload
; CHECK-SD-NEXT:    ldr w23, [sp, #44] // 4-byte Folded Reload
; CHECK-SD-NEXT:    smov w30, v3.h[2]
; CHECK-SD-NEXT:    smov w12, v3.h[3]
; CHECK-SD-NEXT:    smov w11, v1.h[3]
; CHECK-SD-NEXT:    smov w14, v3.h[5]
; CHECK-SD-NEXT:    smov w13, v1.h[5]
; CHECK-SD-NEXT:    sdiv w8, w22, w19
; CHECK-SD-NEXT:    str w9, [sp, #60] // 4-byte Folded Spill
; CHECK-SD-NEXT:    ldr w20, [sp, #60] // 4-byte Folded Reload
; CHECK-SD-NEXT:    msub w21, w20, w21, w23
; CHECK-SD-NEXT:    sdiv w9, w3, w1
; CHECK-SD-NEXT:    str w8, [sp, #12] // 4-byte Folded Spill
; CHECK-SD-NEXT:    sdiv w8, w18, w7
; CHECK-SD-NEXT:    stp w9, w8, [sp, #24] // 8-byte Folded Spill
; CHECK-SD-NEXT:    smov w8, v2.h[5]
; CHECK-SD-NEXT:    smov w9, v0.h[5]
; CHECK-SD-NEXT:    sdiv w10, w5, w0
; CHECK-SD-NEXT:    ldr w20, [sp, #24] // 4-byte Folded Reload
; CHECK-SD-NEXT:    msub w1, w20, w1, w3
; CHECK-SD-NEXT:    str w9, [sp, #40] // 4-byte Folded Spill
; CHECK-SD-NEXT:    str w8, [sp, #48] // 4-byte Folded Spill
; CHECK-SD-NEXT:    fmov s0, w1
; CHECK-SD-NEXT:    ldr w1, [sp, #12] // 4-byte Folded Reload
; CHECK-SD-NEXT:    msub w1, w1, w19, w22
; CHECK-SD-NEXT:    ldr w19, [sp, #28] // 4-byte Folded Reload
; CHECK-SD-NEXT:    sdiv w9, w9, w8
; CHECK-SD-NEXT:    smov w8, v2.h[6]
; CHECK-SD-NEXT:    mov v0.h[1], w21
; CHECK-SD-NEXT:    msub w18, w19, w7, w18
; CHECK-SD-NEXT:    ldp x20, x19, [sp, #144] // 16-byte Folded Reload
; CHECK-SD-NEXT:    ldp x22, x21, [sp, #128] // 16-byte Folded Reload
; CHECK-SD-NEXT:    mov v0.h[2], w1
; CHECK-SD-NEXT:    str w9, [sp, #56] // 4-byte Folded Spill
; CHECK-SD-NEXT:    sdiv w9, w4, w8
; CHECK-SD-NEXT:    mov v0.h[3], w18
; CHECK-SD-NEXT:    ldr w18, [sp, #40] // 4-byte Folded Reload
; CHECK-SD-NEXT:    stp w8, w9, [sp, #32] // 8-byte Folded Spill
; CHECK-SD-NEXT:    sdiv w8, w6, w2
; CHECK-SD-NEXT:    smov w9, v1.h[4]
; CHECK-SD-NEXT:    sdiv w29, w28, w27
; CHECK-SD-NEXT:    stp w8, w10, [sp, #16] // 8-byte Folded Spill
; CHECK-SD-NEXT:    smov w8, v1.h[2]
; CHECK-SD-NEXT:    smov w10, v3.h[4]
; CHECK-SD-NEXT:    sdiv w26, w25, w24
; CHECK-SD-NEXT:    msub w3, w29, w27, w28
; CHECK-SD-NEXT:    ldp x28, x27, [sp, #80] // 16-byte Folded Reload
; CHECK-SD-NEXT:    fmov s2, w3
; CHECK-SD-NEXT:    smov w3, v1.h[6]
; CHECK-SD-NEXT:    sdiv w15, w8, w30
; CHECK-SD-NEXT:    msub w24, w26, w24, w25
; CHECK-SD-NEXT:    mov v2.h[1], w24
; CHECK-SD-NEXT:    ldp x24, x23, [sp, #112] // 16-byte Folded Reload
; CHECK-SD-NEXT:    sdiv w17, w11, w12
; CHECK-SD-NEXT:    msub w8, w15, w30, w8
; CHECK-SD-NEXT:    smov w15, v3.h[6]
; CHECK-SD-NEXT:    ldp x29, x30, [sp, #64] // 16-byte Folded Reload
; CHECK-SD-NEXT:    mov v2.h[2], w8
; CHECK-SD-NEXT:    sdiv w16, w9, w10
; CHECK-SD-NEXT:    msub w8, w17, w12, w11
; CHECK-SD-NEXT:    ldr w12, [sp, #20] // 4-byte Folded Reload
; CHECK-SD-NEXT:    ldr w17, [sp, #48] // 4-byte Folded Reload
; CHECK-SD-NEXT:    msub w12, w12, w0, w5
; CHECK-SD-NEXT:    mov v2.h[3], w8
; CHECK-SD-NEXT:    mov v0.h[4], w12
; CHECK-SD-NEXT:    sdiv w25, w13, w14
; CHECK-SD-NEXT:    msub w8, w16, w10, w9
; CHECK-SD-NEXT:    smov w9, v3.h[7]
; CHECK-SD-NEXT:    smov w10, v1.h[7]
; CHECK-SD-NEXT:    ldr w16, [sp, #56] // 4-byte Folded Reload
; CHECK-SD-NEXT:    mov v2.h[4], w8
; CHECK-SD-NEXT:    msub w16, w16, w17, w18
; CHECK-SD-NEXT:    mov v0.h[5], w16
; CHECK-SD-NEXT:    sdiv w11, w3, w15
; CHECK-SD-NEXT:    msub w8, w25, w14, w13
; CHECK-SD-NEXT:    ldp w14, w13, [sp, #32] // 8-byte Folded Reload
; CHECK-SD-NEXT:    ldp x26, x25, [sp, #96] // 16-byte Folded Reload
; CHECK-SD-NEXT:    mov v2.h[5], w8
; CHECK-SD-NEXT:    msub w13, w13, w14, w4
; CHECK-SD-NEXT:    mov v0.h[6], w13
; CHECK-SD-NEXT:    sdiv w12, w10, w9
; CHECK-SD-NEXT:    msub w8, w11, w15, w3
; CHECK-SD-NEXT:    ldr w11, [sp, #16] // 4-byte Folded Reload
; CHECK-SD-NEXT:    msub w11, w11, w2, w6
; CHECK-SD-NEXT:    mov v2.h[6], w8
; CHECK-SD-NEXT:    mov v0.h[7], w11
; CHECK-SD-NEXT:    msub w8, w12, w9, w10
; CHECK-SD-NEXT:    mov v2.h[7], w8
; CHECK-SD-NEXT:    mov v1.16b, v2.16b
; CHECK-SD-NEXT:    add sp, sp, #160
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: sv16i16:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    sshll v4.4s, v0.4h, #0
; CHECK-GI-NEXT:    sshll v5.4s, v2.4h, #0
; CHECK-GI-NEXT:    sshll v6.4s, v1.4h, #0
; CHECK-GI-NEXT:    sshll v7.4s, v3.4h, #0
; CHECK-GI-NEXT:    fmov w8, s4
; CHECK-GI-NEXT:    fmov w9, s5
; CHECK-GI-NEXT:    mov w12, v5.s[3]
; CHECK-GI-NEXT:    fmov w17, s7
; CHECK-GI-NEXT:    mov w18, v7.s[1]
; CHECK-GI-NEXT:    mov w0, v7.s[2]
; CHECK-GI-NEXT:    mov w1, v7.s[3]
; CHECK-GI-NEXT:    sshll2 v7.4s, v3.8h, #0
; CHECK-GI-NEXT:    sshll v3.4s, v3.4h, #0
; CHECK-GI-NEXT:    sdiv w11, w8, w9
; CHECK-GI-NEXT:    mov w8, v4.s[1]
; CHECK-GI-NEXT:    mov w9, v5.s[1]
; CHECK-GI-NEXT:    fmov w2, s7
; CHECK-GI-NEXT:    mov w3, v7.s[1]
; CHECK-GI-NEXT:    mov w4, v7.s[2]
; CHECK-GI-NEXT:    sdiv w10, w8, w9
; CHECK-GI-NEXT:    mov w8, v4.s[2]
; CHECK-GI-NEXT:    mov w9, v5.s[2]
; CHECK-GI-NEXT:    sshll2 v5.4s, v2.8h, #0
; CHECK-GI-NEXT:    mov v16.s[0], w11
; CHECK-GI-NEXT:    sshll v2.4s, v2.4h, #0
; CHECK-GI-NEXT:    fmov w13, s5
; CHECK-GI-NEXT:    mov w14, v5.s[1]
; CHECK-GI-NEXT:    mov w15, v5.s[2]
; CHECK-GI-NEXT:    mov w16, v5.s[3]
; CHECK-GI-NEXT:    sdiv w9, w8, w9
; CHECK-GI-NEXT:    mov w8, v4.s[3]
; CHECK-GI-NEXT:    sshll2 v4.4s, v0.8h, #0
; CHECK-GI-NEXT:    mov v16.s[1], w10
; CHECK-GI-NEXT:    sshll v0.4s, v0.4h, #0
; CHECK-GI-NEXT:    sdiv w8, w8, w12
; CHECK-GI-NEXT:    fmov w12, s4
; CHECK-GI-NEXT:    mov v16.s[2], w9
; CHECK-GI-NEXT:    sdiv w13, w12, w13
; CHECK-GI-NEXT:    mov w12, v4.s[1]
; CHECK-GI-NEXT:    mov v16.s[3], w8
; CHECK-GI-NEXT:    mls v0.4s, v16.4s, v2.4s
; CHECK-GI-NEXT:    sdiv w14, w12, w14
; CHECK-GI-NEXT:    mov w12, v4.s[2]
; CHECK-GI-NEXT:    mov v17.s[0], w13
; CHECK-GI-NEXT:    mov w13, v7.s[3]
; CHECK-GI-NEXT:    sdiv w15, w12, w15
; CHECK-GI-NEXT:    mov w12, v4.s[3]
; CHECK-GI-NEXT:    mov v17.s[1], w14
; CHECK-GI-NEXT:    sdiv w12, w12, w16
; CHECK-GI-NEXT:    fmov w16, s6
; CHECK-GI-NEXT:    mov v17.s[2], w15
; CHECK-GI-NEXT:    sdiv w16, w16, w17
; CHECK-GI-NEXT:    mov w17, v6.s[1]
; CHECK-GI-NEXT:    mov v17.s[3], w12
; CHECK-GI-NEXT:    mls v4.4s, v17.4s, v5.4s
; CHECK-GI-NEXT:    sdiv w17, w17, w18
; CHECK-GI-NEXT:    mov w18, v6.s[2]
; CHECK-GI-NEXT:    mov v18.s[0], w16
; CHECK-GI-NEXT:    uzp1 v0.8h, v0.8h, v4.8h
; CHECK-GI-NEXT:    sdiv w18, w18, w0
; CHECK-GI-NEXT:    mov w0, v6.s[3]
; CHECK-GI-NEXT:    sshll2 v6.4s, v1.8h, #0
; CHECK-GI-NEXT:    mov v18.s[1], w17
; CHECK-GI-NEXT:    sshll v1.4s, v1.4h, #0
; CHECK-GI-NEXT:    mov w11, v6.s[3]
; CHECK-GI-NEXT:    sdiv w0, w0, w1
; CHECK-GI-NEXT:    fmov w1, s6
; CHECK-GI-NEXT:    mov v18.s[2], w18
; CHECK-GI-NEXT:    sdiv w1, w1, w2
; CHECK-GI-NEXT:    mov w2, v6.s[1]
; CHECK-GI-NEXT:    mov v18.s[3], w0
; CHECK-GI-NEXT:    mls v1.4s, v18.4s, v3.4s
; CHECK-GI-NEXT:    sdiv w2, w2, w3
; CHECK-GI-NEXT:    mov w3, v6.s[2]
; CHECK-GI-NEXT:    mov v19.s[0], w1
; CHECK-GI-NEXT:    sdiv w3, w3, w4
; CHECK-GI-NEXT:    mov v19.s[1], w2
; CHECK-GI-NEXT:    sdiv w10, w11, w13
; CHECK-GI-NEXT:    mov v19.s[2], w3
; CHECK-GI-NEXT:    mov v19.s[3], w10
; CHECK-GI-NEXT:    mls v6.4s, v19.4s, v7.4s
; CHECK-GI-NEXT:    uzp1 v1.8h, v1.8h, v6.8h
; CHECK-GI-NEXT:    ret
entry:
  %s = srem <16 x i16> %d, %e
  ret <16 x i16> %s
}

; Unsigned remainder of <2 x i16>: returns %d urem %e element-wise.
; Both expectations first mask each 32-bit lane to its low 16 bits (movi/and).
; The default llc run then uses a scalar udiv + msub per lane; the -global-isel
; run uses a scalar udiv per lane followed by one vector mls.
define <2 x i16> @uv2i16(<2 x i16> %d, <2 x i16> %e) {
; CHECK-SD-LABEL: uv2i16:
; CHECK-SD:       // %bb.0: // %entry
; CHECK-SD-NEXT:    movi d2, #0x00ffff0000ffff
; CHECK-SD-NEXT:    and v0.8b, v0.8b, v2.8b
; CHECK-SD-NEXT:    and v1.8b, v1.8b, v2.8b
; CHECK-SD-NEXT:    fmov w8, s1
; CHECK-SD-NEXT:    fmov w9, s0
; CHECK-SD-NEXT:    mov w11, v1.s[1]
; CHECK-SD-NEXT:    mov w12, v0.s[1]
; CHECK-SD-NEXT:    udiv w10, w9, w8
; CHECK-SD-NEXT:    udiv w13, w12, w11
; CHECK-SD-NEXT:    msub w8, w10, w8, w9
; CHECK-SD-NEXT:    fmov s0, w8
; CHECK-SD-NEXT:    msub w9, w13, w11, w12
; CHECK-SD-NEXT:    mov v0.s[1], w9
; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 killed $q0
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: uv2i16:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    movi d2, #0x00ffff0000ffff
; CHECK-GI-NEXT:    and v0.8b, v0.8b, v2.8b
; CHECK-GI-NEXT:    and v1.8b, v1.8b, v2.8b
; CHECK-GI-NEXT:    fmov w8, s0
; CHECK-GI-NEXT:    fmov w9, s1
; CHECK-GI-NEXT:    mov w10, v1.s[1]
; CHECK-GI-NEXT:    udiv w8, w8, w9
; CHECK-GI-NEXT:    mov w9, v0.s[1]
; CHECK-GI-NEXT:    udiv w9, w9, w10
; CHECK-GI-NEXT:    mov v2.s[0], w8
; CHECK-GI-NEXT:    mov v2.s[1], w9
; CHECK-GI-NEXT:    mls v0.2s, v2.2s, v1.2s
; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
; CHECK-GI-NEXT:    ret
entry:
  %s = urem <2 x i16> %d, %e
  ret <2 x i16> %s
}

; Unsigned remainder of <3 x i16>: returns %d urem %e element-wise.
; Both expectations extract each h lane with umov and compute the remainder
; with a scalar udiv + msub, inserting the results back into v0.h[0..2].
; NOTE(review): the default-run expectation re-extracts lane 0 (w14/w16) and
; applies sxth to each remainder before the lane insert; this looks redundant
; for an unsigned 16-bit result - confirm it is the intended codegen before
; relying on it.
define <3 x i16> @uv3i16(<3 x i16> %d, <3 x i16> %e) {
; CHECK-SD-LABEL: uv3i16:
; CHECK-SD:       // %bb.0: // %entry
; CHECK-SD-NEXT:    // kill: def $d1 killed $d1 def $q1
; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
; CHECK-SD-NEXT:    umov w11, v1.h[0]
; CHECK-SD-NEXT:    umov w12, v0.h[0]
; CHECK-SD-NEXT:    umov w8, v1.h[1]
; CHECK-SD-NEXT:    umov w9, v0.h[1]
; CHECK-SD-NEXT:    umov w13, v0.h[2]
; CHECK-SD-NEXT:    umov w14, v1.h[0]
; CHECK-SD-NEXT:    umov w16, v0.h[0]
; CHECK-SD-NEXT:    udiv w11, w12, w11
; CHECK-SD-NEXT:    umov w12, v1.h[2]
; CHECK-SD-NEXT:    udiv w10, w9, w8
; CHECK-SD-NEXT:    msub w11, w11, w14, w16
; CHECK-SD-NEXT:    udiv w15, w13, w12
; CHECK-SD-NEXT:    msub w8, w10, w8, w9
; CHECK-SD-NEXT:    sxth w9, w11
; CHECK-SD-NEXT:    fmov s0, w9
; CHECK-SD-NEXT:    sxth w8, w8
; CHECK-SD-NEXT:    mov v0.h[1], w8
; CHECK-SD-NEXT:    msub w10, w15, w12, w13
; CHECK-SD-NEXT:    sxth w8, w10
; CHECK-SD-NEXT:    mov v0.h[2], w8
; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 killed $q0
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: uv3i16:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 def $q1
; CHECK-GI-NEXT:    umov w8, v0.h[0]
; CHECK-GI-NEXT:    umov w9, v1.h[0]
; CHECK-GI-NEXT:    umov w11, v0.h[1]
; CHECK-GI-NEXT:    umov w12, v1.h[1]
; CHECK-GI-NEXT:    umov w14, v0.h[2]
; CHECK-GI-NEXT:    umov w15, v1.h[2]
; CHECK-GI-NEXT:    udiv w10, w8, w9
; CHECK-GI-NEXT:    udiv w13, w11, w12
; CHECK-GI-NEXT:    msub w8, w10, w9, w8
; CHECK-GI-NEXT:    fmov s0, w8
; CHECK-GI-NEXT:    udiv w16, w14, w15
; CHECK-GI-NEXT:    msub w9, w13, w12, w11
; CHECK-GI-NEXT:    mov v0.h[1], w9
; CHECK-GI-NEXT:    msub w8, w16, w15, w14
; CHECK-GI-NEXT:    mov v0.h[2], w8
; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
; CHECK-GI-NEXT:    ret
entry:
  %s = urem <3 x i16> %d, %e
  ret <3 x i16> %s
}

; Unsigned remainder of <4 x i16>: returns %d urem %e element-wise.
; The default llc run extracts each h lane with umov and uses a scalar
; udiv + msub per lane. The -global-isel run widens both operands to 4 x i32
; (ushll), does a scalar udiv per lane, applies one vector mls and narrows the
; result back with xtn.
define <4 x i16> @uv4i16(<4 x i16> %d, <4 x i16> %e) {
; CHECK-SD-LABEL: uv4i16:
; CHECK-SD:       // %bb.0: // %entry
; CHECK-SD-NEXT:    // kill: def $d1 killed $d1 def $q1
; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
; CHECK-SD-NEXT:    umov w11, v1.h[0]
; CHECK-SD-NEXT:    umov w12, v0.h[0]
; CHECK-SD-NEXT:    umov w8, v1.h[1]
; CHECK-SD-NEXT:    umov w9, v0.h[1]
; CHECK-SD-NEXT:    umov w14, v1.h[2]
; CHECK-SD-NEXT:    umov w15, v0.h[2]
; CHECK-SD-NEXT:    umov w17, v1.h[3]
; CHECK-SD-NEXT:    umov w18, v0.h[3]
; CHECK-SD-NEXT:    udiv w13, w12, w11
; CHECK-SD-NEXT:    udiv w10, w9, w8
; CHECK-SD-NEXT:    msub w11, w13, w11, w12
; CHECK-SD-NEXT:    fmov s0, w11
; CHECK-SD-NEXT:    udiv w16, w15, w14
; CHECK-SD-NEXT:    msub w8, w10, w8, w9
; CHECK-SD-NEXT:    mov v0.h[1], w8
; CHECK-SD-NEXT:    udiv w9, w18, w17
; CHECK-SD-NEXT:    msub w8, w16, w14, w15
; CHECK-SD-NEXT:    mov v0.h[2], w8
; CHECK-SD-NEXT:    msub w8, w9, w17, w18
; CHECK-SD-NEXT:    mov v0.h[3], w8
; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 killed $q0
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: uv4i16:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
; CHECK-GI-NEXT:    ushll v1.4s, v1.4h, #0
; CHECK-GI-NEXT:    fmov w8, s0
; CHECK-GI-NEXT:    fmov w9, s1
; CHECK-GI-NEXT:    mov w10, v1.s[1]
; CHECK-GI-NEXT:    mov w11, v1.s[2]
; CHECK-GI-NEXT:    mov w12, v1.s[3]
; CHECK-GI-NEXT:    udiv w8, w8, w9
; CHECK-GI-NEXT:    mov w9, v0.s[1]
; CHECK-GI-NEXT:    udiv w9, w9, w10
; CHECK-GI-NEXT:    mov w10, v0.s[2]
; CHECK-GI-NEXT:    mov v2.s[0], w8
; CHECK-GI-NEXT:    udiv w10, w10, w11
; CHECK-GI-NEXT:    mov w11, v0.s[3]
; CHECK-GI-NEXT:    mov v2.s[1], w9
; CHECK-GI-NEXT:    udiv w8, w11, w12
; CHECK-GI-NEXT:    mov v2.s[2], w10
; CHECK-GI-NEXT:    mov v2.s[3], w8
; CHECK-GI-NEXT:    mls v0.4s, v2.4s, v1.4s
; CHECK-GI-NEXT:    xtn v0.4h, v0.4s
; CHECK-GI-NEXT:    ret
entry:
  %s = urem <4 x i16> %d, %e
  ret <4 x i16> %s
}

; Unsigned remainder of <8 x i16>: returns %d urem %e element-wise.
; The default llc run computes all eight lanes with scalar udiv + msub,
; assembling the result in v2 and copying it to v0. The -global-isel run
; widens each operand into two 4 x i32 halves (ushll/ushll2), does a scalar
; udiv per lane, one vector mls per half, and repacks the halves with uzp1.
define <8 x i16> @uv8i16(<8 x i16> %d, <8 x i16> %e) {
; CHECK-SD-LABEL: uv8i16:
; CHECK-SD:       // %bb.0: // %entry
; CHECK-SD-NEXT:    umov w11, v1.h[0]
; CHECK-SD-NEXT:    umov w12, v0.h[0]
; CHECK-SD-NEXT:    umov w8, v1.h[1]
; CHECK-SD-NEXT:    umov w9, v0.h[1]
; CHECK-SD-NEXT:    umov w14, v1.h[2]
; CHECK-SD-NEXT:    umov w15, v0.h[2]
; CHECK-SD-NEXT:    umov w17, v1.h[3]
; CHECK-SD-NEXT:    umov w18, v0.h[3]
; CHECK-SD-NEXT:    umov w1, v1.h[4]
; CHECK-SD-NEXT:    umov w2, v0.h[4]
; CHECK-SD-NEXT:    umov w4, v1.h[5]
; CHECK-SD-NEXT:    umov w5, v0.h[5]
; CHECK-SD-NEXT:    udiv w13, w12, w11
; CHECK-SD-NEXT:    udiv w10, w9, w8
; CHECK-SD-NEXT:    msub w11, w13, w11, w12
; CHECK-SD-NEXT:    umov w13, v1.h[7]
; CHECK-SD-NEXT:    fmov s2, w11
; CHECK-SD-NEXT:    umov w11, v0.h[6]
; CHECK-SD-NEXT:    udiv w16, w15, w14
; CHECK-SD-NEXT:    msub w8, w10, w8, w9
; CHECK-SD-NEXT:    umov w10, v1.h[6]
; CHECK-SD-NEXT:    mov v2.h[1], w8
; CHECK-SD-NEXT:    udiv w0, w18, w17
; CHECK-SD-NEXT:    msub w8, w16, w14, w15
; CHECK-SD-NEXT:    umov w14, v0.h[7]
; CHECK-SD-NEXT:    mov v2.h[2], w8
; CHECK-SD-NEXT:    udiv w3, w2, w1
; CHECK-SD-NEXT:    msub w8, w0, w17, w18
; CHECK-SD-NEXT:    mov v2.h[3], w8
; CHECK-SD-NEXT:    udiv w9, w5, w4
; CHECK-SD-NEXT:    msub w8, w3, w1, w2
; CHECK-SD-NEXT:    mov v2.h[4], w8
; CHECK-SD-NEXT:    udiv w12, w11, w10
; CHECK-SD-NEXT:    msub w8, w9, w4, w5
; CHECK-SD-NEXT:    mov v2.h[5], w8
; CHECK-SD-NEXT:    udiv w9, w14, w13
; CHECK-SD-NEXT:    msub w8, w12, w10, w11
; CHECK-SD-NEXT:    mov v2.h[6], w8
; CHECK-SD-NEXT:    msub w8, w9, w13, w14
; CHECK-SD-NEXT:    mov v2.h[7], w8
; CHECK-SD-NEXT:    mov v0.16b, v2.16b
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: uv8i16:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    ushll v2.4s, v0.4h, #0
; CHECK-GI-NEXT:    ushll v3.4s, v1.4h, #0
; CHECK-GI-NEXT:    ushll2 v0.4s, v0.8h, #0
; CHECK-GI-NEXT:    ushll2 v1.4s, v1.8h, #0
; CHECK-GI-NEXT:    fmov w8, s2
; CHECK-GI-NEXT:    fmov w9, s3
; CHECK-GI-NEXT:    mov w10, v3.s[1]
; CHECK-GI-NEXT:    mov w11, v3.s[2]
; CHECK-GI-NEXT:    mov w12, v3.s[3]
; CHECK-GI-NEXT:    fmov w13, s1
; CHECK-GI-NEXT:    mov w14, v1.s[1]
; CHECK-GI-NEXT:    mov w15, v1.s[2]
; CHECK-GI-NEXT:    udiv w8, w8, w9
; CHECK-GI-NEXT:    mov w9, v2.s[1]
; CHECK-GI-NEXT:    udiv w9, w9, w10
; CHECK-GI-NEXT:    mov w10, v2.s[2]
; CHECK-GI-NEXT:    mov v4.s[0], w8
; CHECK-GI-NEXT:    mov w8, v0.s[3]
; CHECK-GI-NEXT:    udiv w10, w10, w11
; CHECK-GI-NEXT:    mov w11, v2.s[3]
; CHECK-GI-NEXT:    mov v4.s[1], w9
; CHECK-GI-NEXT:    udiv w11, w11, w12
; CHECK-GI-NEXT:    fmov w12, s0
; CHECK-GI-NEXT:    mov v4.s[2], w10
; CHECK-GI-NEXT:    udiv w12, w12, w13
; CHECK-GI-NEXT:    mov w13, v0.s[1]
; CHECK-GI-NEXT:    mov v4.s[3], w11
; CHECK-GI-NEXT:    mls v2.4s, v4.4s, v3.4s
; CHECK-GI-NEXT:    udiv w13, w13, w14
; CHECK-GI-NEXT:    mov w14, v0.s[2]
; CHECK-GI-NEXT:    mov v5.s[0], w12
; CHECK-GI-NEXT:    mov w12, v1.s[3]
; CHECK-GI-NEXT:    udiv w14, w14, w15
; CHECK-GI-NEXT:    mov v5.s[1], w13
; CHECK-GI-NEXT:    udiv w8, w8, w12
; CHECK-GI-NEXT:    mov v5.s[2], w14
; CHECK-GI-NEXT:    mov v5.s[3], w8
; CHECK-GI-NEXT:    mls v0.4s, v5.4s, v1.4s
; CHECK-GI-NEXT:    uzp1 v0.8h, v2.8h, v0.8h
; CHECK-GI-NEXT:    ret
entry:
  %s = urem <8 x i16> %d, %e
  ret <8 x i16> %s
}

; Unsigned remainder of <16 x i16>: returns %d urem %e element-wise, with the
; operands and result each spanning two q registers.
; The default llc run computes all sixteen lanes with scalar udiv + msub; it
; sets up a 160-byte frame, saves x19-x30 and spills/reloads several w values
; through stack slots. The -global-isel run widens the data into four 4 x i32
; quarters (ushll/ushll2), does a scalar udiv per lane, one vector mls per
; quarter, and repacks with uzp1 without creating a stack frame.
define <16 x i16> @uv16i16(<16 x i16> %d, <16 x i16> %e) {
; CHECK-SD-LABEL: uv16i16:
; CHECK-SD:       // %bb.0: // %entry
; CHECK-SD-NEXT:    sub sp, sp, #160
; CHECK-SD-NEXT:    stp x29, x30, [sp, #64] // 16-byte Folded Spill
; CHECK-SD-NEXT:    stp x28, x27, [sp, #80] // 16-byte Folded Spill
; CHECK-SD-NEXT:    stp x26, x25, [sp, #96] // 16-byte Folded Spill
; CHECK-SD-NEXT:    stp x24, x23, [sp, #112] // 16-byte Folded Spill
; CHECK-SD-NEXT:    stp x22, x21, [sp, #128] // 16-byte Folded Spill
; CHECK-SD-NEXT:    stp x20, x19, [sp, #144] // 16-byte Folded Spill
; CHECK-SD-NEXT:    .cfi_def_cfa_offset 160
; CHECK-SD-NEXT:    .cfi_offset w19, -8
; CHECK-SD-NEXT:    .cfi_offset w20, -16
; CHECK-SD-NEXT:    .cfi_offset w21, -24
; CHECK-SD-NEXT:    .cfi_offset w22, -32
; CHECK-SD-NEXT:    .cfi_offset w23, -40
; CHECK-SD-NEXT:    .cfi_offset w24, -48
; CHECK-SD-NEXT:    .cfi_offset w25, -56
; CHECK-SD-NEXT:    .cfi_offset w26, -64
; CHECK-SD-NEXT:    .cfi_offset w27, -72
; CHECK-SD-NEXT:    .cfi_offset w28, -80
; CHECK-SD-NEXT:    .cfi_offset w30, -88
; CHECK-SD-NEXT:    .cfi_offset w29, -96
; CHECK-SD-NEXT:    umov w8, v2.h[1]
; CHECK-SD-NEXT:    umov w9, v0.h[1]
; CHECK-SD-NEXT:    umov w19, v2.h[2]
; CHECK-SD-NEXT:    umov w22, v0.h[2]
; CHECK-SD-NEXT:    umov w1, v2.h[0]
; CHECK-SD-NEXT:    umov w3, v0.h[0]
; CHECK-SD-NEXT:    umov w7, v2.h[3]
; CHECK-SD-NEXT:    umov w18, v0.h[3]
; CHECK-SD-NEXT:    umov w4, v0.h[6]
; CHECK-SD-NEXT:    umov w0, v2.h[4]
; CHECK-SD-NEXT:    umov w5, v0.h[4]
; CHECK-SD-NEXT:    umov w2, v2.h[7]
; CHECK-SD-NEXT:    str w8, [sp, #52] // 4-byte Folded Spill
; CHECK-SD-NEXT:    umov w6, v0.h[7]
; CHECK-SD-NEXT:    umov w27, v3.h[0]
; CHECK-SD-NEXT:    str w9, [sp, #44] // 4-byte Folded Spill
; CHECK-SD-NEXT:    udiv w9, w9, w8
; CHECK-SD-NEXT:    umov w28, v1.h[0]
; CHECK-SD-NEXT:    umov w24, v3.h[1]
; CHECK-SD-NEXT:    umov w25, v1.h[1]
; CHECK-SD-NEXT:    ldr w21, [sp, #52] // 4-byte Folded Reload
; CHECK-SD-NEXT:    ldr w23, [sp, #44] // 4-byte Folded Reload
; CHECK-SD-NEXT:    umov w30, v3.h[2]
; CHECK-SD-NEXT:    umov w12, v3.h[3]
; CHECK-SD-NEXT:    umov w11, v1.h[3]
; CHECK-SD-NEXT:    umov w14, v3.h[5]
; CHECK-SD-NEXT:    umov w13, v1.h[5]
; CHECK-SD-NEXT:    udiv w8, w22, w19
; CHECK-SD-NEXT:    str w9, [sp, #60] // 4-byte Folded Spill
; CHECK-SD-NEXT:    ldr w20, [sp, #60] // 4-byte Folded Reload
; CHECK-SD-NEXT:    msub w21, w20, w21, w23
; CHECK-SD-NEXT:    udiv w9, w3, w1
; CHECK-SD-NEXT:    str w8, [sp, #12] // 4-byte Folded Spill
; CHECK-SD-NEXT:    udiv w8, w18, w7
; CHECK-SD-NEXT:    stp w9, w8, [sp, #24] // 8-byte Folded Spill
; CHECK-SD-NEXT:    umov w8, v2.h[5]
; CHECK-SD-NEXT:    umov w9, v0.h[5]
; CHECK-SD-NEXT:    udiv w10, w5, w0
; CHECK-SD-NEXT:    ldr w20, [sp, #24] // 4-byte Folded Reload
; CHECK-SD-NEXT:    msub w1, w20, w1, w3
; CHECK-SD-NEXT:    str w9, [sp, #40] // 4-byte Folded Spill
; CHECK-SD-NEXT:    str w8, [sp, #48] // 4-byte Folded Spill
; CHECK-SD-NEXT:    fmov s0, w1
; CHECK-SD-NEXT:    ldr w1, [sp, #12] // 4-byte Folded Reload
; CHECK-SD-NEXT:    msub w1, w1, w19, w22
; CHECK-SD-NEXT:    ldr w19, [sp, #28] // 4-byte Folded Reload
; CHECK-SD-NEXT:    udiv w9, w9, w8
; CHECK-SD-NEXT:    umov w8, v2.h[6]
; CHECK-SD-NEXT:    mov v0.h[1], w21
; CHECK-SD-NEXT:    msub w18, w19, w7, w18
; CHECK-SD-NEXT:    ldp x20, x19, [sp, #144] // 16-byte Folded Reload
; CHECK-SD-NEXT:    ldp x22, x21, [sp, #128] // 16-byte Folded Reload
; CHECK-SD-NEXT:    mov v0.h[2], w1
; CHECK-SD-NEXT:    str w9, [sp, #56] // 4-byte Folded Spill
; CHECK-SD-NEXT:    udiv w9, w4, w8
; CHECK-SD-NEXT:    mov v0.h[3], w18
; CHECK-SD-NEXT:    ldr w18, [sp, #40] // 4-byte Folded Reload
; CHECK-SD-NEXT:    stp w8, w9, [sp, #32] // 8-byte Folded Spill
; CHECK-SD-NEXT:    udiv w8, w6, w2
; CHECK-SD-NEXT:    umov w9, v1.h[4]
; CHECK-SD-NEXT:    udiv w29, w28, w27
; CHECK-SD-NEXT:    stp w8, w10, [sp, #16] // 8-byte Folded Spill
; CHECK-SD-NEXT:    umov w8, v1.h[2]
; CHECK-SD-NEXT:    umov w10, v3.h[4]
; CHECK-SD-NEXT:    udiv w26, w25, w24
; CHECK-SD-NEXT:    msub w3, w29, w27, w28
; CHECK-SD-NEXT:    ldp x28, x27, [sp, #80] // 16-byte Folded Reload
; CHECK-SD-NEXT:    fmov s2, w3
; CHECK-SD-NEXT:    umov w3, v1.h[6]
; CHECK-SD-NEXT:    udiv w15, w8, w30
; CHECK-SD-NEXT:    msub w24, w26, w24, w25
; CHECK-SD-NEXT:    mov v2.h[1], w24
; CHECK-SD-NEXT:    ldp x24, x23, [sp, #112] // 16-byte Folded Reload
; CHECK-SD-NEXT:    udiv w17, w11, w12
; CHECK-SD-NEXT:    msub w8, w15, w30, w8
; CHECK-SD-NEXT:    umov w15, v3.h[6]
; CHECK-SD-NEXT:    ldp x29, x30, [sp, #64] // 16-byte Folded Reload
; CHECK-SD-NEXT:    mov v2.h[2], w8
; CHECK-SD-NEXT:    udiv w16, w9, w10
; CHECK-SD-NEXT:    msub w8, w17, w12, w11
; CHECK-SD-NEXT:    ldr w12, [sp, #20] // 4-byte Folded Reload
; CHECK-SD-NEXT:    ldr w17, [sp, #48] // 4-byte Folded Reload
; CHECK-SD-NEXT:    msub w12, w12, w0, w5
; CHECK-SD-NEXT:    mov v2.h[3], w8
; CHECK-SD-NEXT:    mov v0.h[4], w12
; CHECK-SD-NEXT:    udiv w25, w13, w14
; CHECK-SD-NEXT:    msub w8, w16, w10, w9
; CHECK-SD-NEXT:    umov w9, v3.h[7]
; CHECK-SD-NEXT:    umov w10, v1.h[7]
; CHECK-SD-NEXT:    ldr w16, [sp, #56] // 4-byte Folded Reload
; CHECK-SD-NEXT:    mov v2.h[4], w8
; CHECK-SD-NEXT:    msub w16, w16, w17, w18
; CHECK-SD-NEXT:    mov v0.h[5], w16
; CHECK-SD-NEXT:    udiv w11, w3, w15
; CHECK-SD-NEXT:    msub w8, w25, w14, w13
; CHECK-SD-NEXT:    ldp w14, w13, [sp, #32] // 8-byte Folded Reload
; CHECK-SD-NEXT:    ldp x26, x25, [sp, #96] // 16-byte Folded Reload
; CHECK-SD-NEXT:    mov v2.h[5], w8
; CHECK-SD-NEXT:    msub w13, w13, w14, w4
; CHECK-SD-NEXT:    mov v0.h[6], w13
; CHECK-SD-NEXT:    udiv w12, w10, w9
; CHECK-SD-NEXT:    msub w8, w11, w15, w3
; CHECK-SD-NEXT:    ldr w11, [sp, #16] // 4-byte Folded Reload
; CHECK-SD-NEXT:    msub w11, w11, w2, w6
; CHECK-SD-NEXT:    mov v2.h[6], w8
; CHECK-SD-NEXT:    mov v0.h[7], w11
; CHECK-SD-NEXT:    msub w8, w12, w9, w10
; CHECK-SD-NEXT:    mov v2.h[7], w8
; CHECK-SD-NEXT:    mov v1.16b, v2.16b
; CHECK-SD-NEXT:    add sp, sp, #160
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: uv16i16:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    ushll v4.4s, v0.4h, #0
; CHECK-GI-NEXT:    ushll v5.4s, v2.4h, #0
; CHECK-GI-NEXT:    ushll v6.4s, v1.4h, #0
; CHECK-GI-NEXT:    ushll v7.4s, v3.4h, #0
; CHECK-GI-NEXT:    fmov w8, s4
; CHECK-GI-NEXT:    fmov w9, s5
; CHECK-GI-NEXT:    mov w12, v5.s[3]
; CHECK-GI-NEXT:    fmov w17, s7
; CHECK-GI-NEXT:    mov w18, v7.s[1]
; CHECK-GI-NEXT:    mov w0, v7.s[2]
; CHECK-GI-NEXT:    mov w1, v7.s[3]
; CHECK-GI-NEXT:    ushll2 v7.4s, v3.8h, #0
; CHECK-GI-NEXT:    ushll v3.4s, v3.4h, #0
; CHECK-GI-NEXT:    udiv w11, w8, w9
; CHECK-GI-NEXT:    mov w8, v4.s[1]
; CHECK-GI-NEXT:    mov w9, v5.s[1]
; CHECK-GI-NEXT:    fmov w2, s7
; CHECK-GI-NEXT:    mov w3, v7.s[1]
; CHECK-GI-NEXT:    mov w4, v7.s[2]
; CHECK-GI-NEXT:    udiv w10, w8, w9
; CHECK-GI-NEXT:    mov w8, v4.s[2]
; CHECK-GI-NEXT:    mov w9, v5.s[2]
; CHECK-GI-NEXT:    ushll2 v5.4s, v2.8h, #0
; CHECK-GI-NEXT:    mov v16.s[0], w11
; CHECK-GI-NEXT:    ushll v2.4s, v2.4h, #0
; CHECK-GI-NEXT:    fmov w13, s5
; CHECK-GI-NEXT:    mov w14, v5.s[1]
; CHECK-GI-NEXT:    mov w15, v5.s[2]
; CHECK-GI-NEXT:    mov w16, v5.s[3]
; CHECK-GI-NEXT:    udiv w9, w8, w9
; CHECK-GI-NEXT:    mov w8, v4.s[3]
; CHECK-GI-NEXT:    ushll2 v4.4s, v0.8h, #0
; CHECK-GI-NEXT:    mov v16.s[1], w10
; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
; CHECK-GI-NEXT:    udiv w8, w8, w12
; CHECK-GI-NEXT:    fmov w12, s4
; CHECK-GI-NEXT:    mov v16.s[2], w9
; CHECK-GI-NEXT:    udiv w13, w12, w13
; CHECK-GI-NEXT:    mov w12, v4.s[1]
; CHECK-GI-NEXT:    mov v16.s[3], w8
; CHECK-GI-NEXT:    mls v0.4s, v16.4s, v2.4s
; CHECK-GI-NEXT:    udiv w14, w12, w14
; CHECK-GI-NEXT:    mov w12, v4.s[2]
; CHECK-GI-NEXT:    mov v17.s[0], w13
; CHECK-GI-NEXT:    mov w13, v7.s[3]
; CHECK-GI-NEXT:    udiv w15, w12, w15
; CHECK-GI-NEXT:    mov w12, v4.s[3]
; CHECK-GI-NEXT:    mov v17.s[1], w14
; CHECK-GI-NEXT:    udiv w12, w12, w16
; CHECK-GI-NEXT:    fmov w16, s6
; CHECK-GI-NEXT:    mov v17.s[2], w15
; CHECK-GI-NEXT:    udiv w16, w16, w17
; CHECK-GI-NEXT:    mov w17, v6.s[1]
; CHECK-GI-NEXT:    mov v17.s[3], w12
; CHECK-GI-NEXT:    mls v4.4s, v17.4s, v5.4s
; CHECK-GI-NEXT:    udiv w17, w17, w18
; CHECK-GI-NEXT:    mov w18, v6.s[2]
; CHECK-GI-NEXT:    mov v18.s[0], w16
; CHECK-GI-NEXT:    uzp1 v0.8h, v0.8h, v4.8h
; CHECK-GI-NEXT:    udiv w18, w18, w0
; CHECK-GI-NEXT:    mov w0, v6.s[3]
; CHECK-GI-NEXT:    ushll2 v6.4s, v1.8h, #0
; CHECK-GI-NEXT:    mov v18.s[1], w17
; CHECK-GI-NEXT:    ushll v1.4s, v1.4h, #0
; CHECK-GI-NEXT:    mov w11, v6.s[3]
; CHECK-GI-NEXT:    udiv w0, w0, w1
; CHECK-GI-NEXT:    fmov w1, s6
; CHECK-GI-NEXT:    mov v18.s[2], w18
; CHECK-GI-NEXT:    udiv w1, w1, w2
; CHECK-GI-NEXT:    mov w2, v6.s[1]
; CHECK-GI-NEXT:    mov v18.s[3], w0
; CHECK-GI-NEXT:    mls v1.4s, v18.4s, v3.4s
; CHECK-GI-NEXT:    udiv w2, w2, w3
; CHECK-GI-NEXT:    mov w3, v6.s[2]
; CHECK-GI-NEXT:    mov v19.s[0], w1
; CHECK-GI-NEXT:    udiv w3, w3, w4
; CHECK-GI-NEXT:    mov v19.s[1], w2
; CHECK-GI-NEXT:    udiv w10, w11, w13
; CHECK-GI-NEXT:    mov v19.s[2], w3
; CHECK-GI-NEXT:    mov v19.s[3], w10
; CHECK-GI-NEXT:    mls v6.4s, v19.4s, v7.4s
; CHECK-GI-NEXT:    uzp1 v1.8h, v1.8h, v6.8h
; CHECK-GI-NEXT:    ret
entry:
  %s = urem <16 x i16> %d, %e
  ret <16 x i16> %s
}

; Signed remainder of <2 x i32>: returns %d srem %e element-wise.
; The default llc run uses a scalar sdiv + msub per lane; the -global-isel run
; uses a scalar sdiv per lane followed by one vector mls.
define <2 x i32> @sv2i32(<2 x i32> %d, <2 x i32> %e) {
; CHECK-SD-LABEL: sv2i32:
; CHECK-SD:       // %bb.0: // %entry
; CHECK-SD-NEXT:    // kill: def $d1 killed $d1 def $q1
; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
; CHECK-SD-NEXT:    fmov w8, s1
; CHECK-SD-NEXT:    fmov w9, s0
; CHECK-SD-NEXT:    mov w11, v1.s[1]
; CHECK-SD-NEXT:    mov w12, v0.s[1]
; CHECK-SD-NEXT:    sdiv w10, w9, w8
; CHECK-SD-NEXT:    sdiv w13, w12, w11
; CHECK-SD-NEXT:    msub w8, w10, w8, w9
; CHECK-SD-NEXT:    fmov s0, w8
; CHECK-SD-NEXT:    msub w9, w13, w11, w12
; CHECK-SD-NEXT:    mov v0.s[1], w9
; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 killed $q0
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: sv2i32:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 def $q1
; CHECK-GI-NEXT:    fmov w8, s0
; CHECK-GI-NEXT:    fmov w9, s1
; CHECK-GI-NEXT:    mov w10, v1.s[1]
; CHECK-GI-NEXT:    sdiv w8, w8, w9
; CHECK-GI-NEXT:    mov w9, v0.s[1]
; CHECK-GI-NEXT:    sdiv w9, w9, w10
; CHECK-GI-NEXT:    mov v2.s[0], w8
; CHECK-GI-NEXT:    mov v2.s[1], w9
; CHECK-GI-NEXT:    mls v0.2s, v2.2s, v1.2s
; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
; CHECK-GI-NEXT:    ret
entry:
  %s = srem <2 x i32> %d, %e
  ret <2 x i32> %s
}

; Signed remainder of <3 x i32>: returns %d srem %e element-wise.
; Both expectations compute each of the three lanes with a scalar sdiv + msub
; and insert the results into v0.s[0..2]. The -global-isel run first copies
; lanes 1 and 2 into separate s registers before moving them to w registers.
define <3 x i32> @sv3i32(<3 x i32> %d, <3 x i32> %e) {
; CHECK-SD-LABEL: sv3i32:
; CHECK-SD:       // %bb.0: // %entry
; CHECK-SD-NEXT:    fmov w11, s1
; CHECK-SD-NEXT:    fmov w12, s0
; CHECK-SD-NEXT:    mov w8, v1.s[1]
; CHECK-SD-NEXT:    mov w9, v0.s[1]
; CHECK-SD-NEXT:    mov w14, v1.s[2]
; CHECK-SD-NEXT:    mov w15, v0.s[2]
; CHECK-SD-NEXT:    sdiv w13, w12, w11
; CHECK-SD-NEXT:    sdiv w10, w9, w8
; CHECK-SD-NEXT:    msub w11, w13, w11, w12
; CHECK-SD-NEXT:    fmov s0, w11
; CHECK-SD-NEXT:    sdiv w16, w15, w14
; CHECK-SD-NEXT:    msub w8, w10, w8, w9
; CHECK-SD-NEXT:    mov v0.s[1], w8
; CHECK-SD-NEXT:    msub w8, w16, w14, w15
; CHECK-SD-NEXT:    mov v0.s[2], w8
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: sv3i32:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    fmov w8, s0
; CHECK-GI-NEXT:    fmov w9, s1
; CHECK-GI-NEXT:    mov s2, v0.s[1]
; CHECK-GI-NEXT:    mov s3, v1.s[1]
; CHECK-GI-NEXT:    mov s0, v0.s[2]
; CHECK-GI-NEXT:    mov s1, v1.s[2]
; CHECK-GI-NEXT:    sdiv w10, w8, w9
; CHECK-GI-NEXT:    fmov w11, s2
; CHECK-GI-NEXT:    fmov w12, s3
; CHECK-GI-NEXT:    fmov w14, s0
; CHECK-GI-NEXT:    fmov w15, s1
; CHECK-GI-NEXT:    sdiv w13, w11, w12
; CHECK-GI-NEXT:    msub w8, w10, w9, w8
; CHECK-GI-NEXT:    mov v0.s[0], w8
; CHECK-GI-NEXT:    sdiv w9, w14, w15
; CHECK-GI-NEXT:    msub w8, w13, w12, w11
; CHECK-GI-NEXT:    mov v0.s[1], w8
; CHECK-GI-NEXT:    msub w8, w9, w15, w14
; CHECK-GI-NEXT:    mov v0.s[2], w8
; CHECK-GI-NEXT:    ret
entry:
  %s = srem <3 x i32> %d, %e
  ret <3 x i32> %s
}

; Signed remainder of <4 x i32>: returns %d srem %e element-wise.
; The default llc run uses a scalar sdiv + msub per lane; the -global-isel run
; uses a scalar sdiv per lane, gathers the quotients in v2 and finishes with a
; single vector mls.
define <4 x i32> @sv4i32(<4 x i32> %d, <4 x i32> %e) {
; CHECK-SD-LABEL: sv4i32:
; CHECK-SD:       // %bb.0: // %entry
; CHECK-SD-NEXT:    fmov w11, s1
; CHECK-SD-NEXT:    fmov w12, s0
; CHECK-SD-NEXT:    mov w8, v1.s[1]
; CHECK-SD-NEXT:    mov w9, v0.s[1]
; CHECK-SD-NEXT:    mov w14, v1.s[2]
; CHECK-SD-NEXT:    mov w15, v0.s[2]
; CHECK-SD-NEXT:    mov w17, v1.s[3]
; CHECK-SD-NEXT:    mov w18, v0.s[3]
; CHECK-SD-NEXT:    sdiv w13, w12, w11
; CHECK-SD-NEXT:    sdiv w10, w9, w8
; CHECK-SD-NEXT:    msub w11, w13, w11, w12
; CHECK-SD-NEXT:    fmov s0, w11
; CHECK-SD-NEXT:    sdiv w16, w15, w14
; CHECK-SD-NEXT:    msub w8, w10, w8, w9
; CHECK-SD-NEXT:    mov v0.s[1], w8
; CHECK-SD-NEXT:    sdiv w9, w18, w17
; CHECK-SD-NEXT:    msub w8, w16, w14, w15
; CHECK-SD-NEXT:    mov v0.s[2], w8
; CHECK-SD-NEXT:    msub w8, w9, w17, w18
; CHECK-SD-NEXT:    mov v0.s[3], w8
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: sv4i32:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    fmov w8, s0
; CHECK-GI-NEXT:    fmov w9, s1
; CHECK-GI-NEXT:    mov w10, v1.s[1]
; CHECK-GI-NEXT:    mov w11, v1.s[2]
; CHECK-GI-NEXT:    mov w12, v1.s[3]
; CHECK-GI-NEXT:    sdiv w8, w8, w9
; CHECK-GI-NEXT:    mov w9, v0.s[1]
; CHECK-GI-NEXT:    sdiv w9, w9, w10
; CHECK-GI-NEXT:    mov w10, v0.s[2]
; CHECK-GI-NEXT:    mov v2.s[0], w8
; CHECK-GI-NEXT:    sdiv w10, w10, w11
; CHECK-GI-NEXT:    mov w11, v0.s[3]
; CHECK-GI-NEXT:    mov v2.s[1], w9
; CHECK-GI-NEXT:    sdiv w8, w11, w12
; CHECK-GI-NEXT:    mov v2.s[2], w10
; CHECK-GI-NEXT:    mov v2.s[3], w8
; CHECK-GI-NEXT:    mls v0.4s, v2.4s, v1.4s
; CHECK-GI-NEXT:    ret
entry:
  %s = srem <4 x i32> %d, %e
  ret <4 x i32> %s
}

; Signed remainder of <8 x i32>: returns %d srem %e element-wise, with the
; operands and result each spanning two q registers.
; The default llc run computes all eight lanes with scalar sdiv + msub and
; saves/restores x19-x22 in a 32-byte frame. The -global-isel run uses a
; scalar sdiv per lane and one vector mls per 4 x i32 half, with no frame.
define <8 x i32> @sv8i32(<8 x i32> %d, <8 x i32> %e) {
; CHECK-SD-LABEL: sv8i32:
; CHECK-SD:       // %bb.0: // %entry
; CHECK-SD-NEXT:    stp x22, x21, [sp, #-32]! // 16-byte Folded Spill
; CHECK-SD-NEXT:    stp x20, x19, [sp, #16] // 16-byte Folded Spill
; CHECK-SD-NEXT:    .cfi_def_cfa_offset 32
; CHECK-SD-NEXT:    .cfi_offset w19, -8
; CHECK-SD-NEXT:    .cfi_offset w20, -16
; CHECK-SD-NEXT:    .cfi_offset w21, -24
; CHECK-SD-NEXT:    .cfi_offset w22, -32
; CHECK-SD-NEXT:    mov w8, v2.s[1]
; CHECK-SD-NEXT:    mov w9, v0.s[1]
; CHECK-SD-NEXT:    fmov w11, s2
; CHECK-SD-NEXT:    fmov w12, s0
; CHECK-SD-NEXT:    fmov w4, s3
; CHECK-SD-NEXT:    fmov w5, s1
; CHECK-SD-NEXT:    mov w1, v3.s[1]
; CHECK-SD-NEXT:    mov w2, v1.s[1]
; CHECK-SD-NEXT:    mov w14, v2.s[2]
; CHECK-SD-NEXT:    mov w15, v0.s[2]
; CHECK-SD-NEXT:    mov w7, v3.s[2]
; CHECK-SD-NEXT:    mov w19, v1.s[2]
; CHECK-SD-NEXT:    sdiv w10, w9, w8
; CHECK-SD-NEXT:    mov w17, v2.s[3]
; CHECK-SD-NEXT:    mov w18, v0.s[3]
; CHECK-SD-NEXT:    mov w21, v3.s[3]
; CHECK-SD-NEXT:    mov w22, v1.s[3]
; CHECK-SD-NEXT:    sdiv w13, w12, w11
; CHECK-SD-NEXT:    msub w8, w10, w8, w9
; CHECK-SD-NEXT:    sdiv w6, w5, w4
; CHECK-SD-NEXT:    msub w9, w13, w11, w12
; CHECK-SD-NEXT:    fmov s0, w9
; CHECK-SD-NEXT:    mov v0.s[1], w8
; CHECK-SD-NEXT:    sdiv w3, w2, w1
; CHECK-SD-NEXT:    msub w10, w6, w4, w5
; CHECK-SD-NEXT:    fmov s1, w10
; CHECK-SD-NEXT:    sdiv w16, w15, w14
; CHECK-SD-NEXT:    msub w11, w3, w1, w2
; CHECK-SD-NEXT:    mov v1.s[1], w11
; CHECK-SD-NEXT:    sdiv w20, w19, w7
; CHECK-SD-NEXT:    msub w9, w16, w14, w15
; CHECK-SD-NEXT:    mov v0.s[2], w9
; CHECK-SD-NEXT:    sdiv w0, w18, w17
; CHECK-SD-NEXT:    msub w8, w20, w7, w19
; CHECK-SD-NEXT:    ldp x20, x19, [sp, #16] // 16-byte Folded Reload
; CHECK-SD-NEXT:    mov v1.s[2], w8
; CHECK-SD-NEXT:    sdiv w12, w22, w21
; CHECK-SD-NEXT:    msub w10, w0, w17, w18
; CHECK-SD-NEXT:    mov v0.s[3], w10
; CHECK-SD-NEXT:    msub w8, w12, w21, w22
; CHECK-SD-NEXT:    mov v1.s[3], w8
; CHECK-SD-NEXT:    ldp x22, x21, [sp], #32 // 16-byte Folded Reload
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: sv8i32:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    fmov w8, s0
; CHECK-GI-NEXT:    fmov w9, s2
; CHECK-GI-NEXT:    mov w10, v2.s[1]
; CHECK-GI-NEXT:    mov w11, v2.s[2]
; CHECK-GI-NEXT:    mov w12, v2.s[3]
; CHECK-GI-NEXT:    fmov w13, s3
; CHECK-GI-NEXT:    mov w14, v3.s[1]
; CHECK-GI-NEXT:    mov w15, v3.s[2]
; CHECK-GI-NEXT:    sdiv w8, w8, w9
; CHECK-GI-NEXT:    mov w9, v0.s[1]
; CHECK-GI-NEXT:    sdiv w9, w9, w10
; CHECK-GI-NEXT:    mov w10, v0.s[2]
; CHECK-GI-NEXT:    mov v4.s[0], w8
; CHECK-GI-NEXT:    mov w8, v1.s[3]
; CHECK-GI-NEXT:    sdiv w10, w10, w11
; CHECK-GI-NEXT:    mov w11, v0.s[3]
; CHECK-GI-NEXT:    mov v4.s[1], w9
; CHECK-GI-NEXT:    sdiv w11, w11, w12
; CHECK-GI-NEXT:    fmov w12, s1
; CHECK-GI-NEXT:    mov v4.s[2], w10
; CHECK-GI-NEXT:    sdiv w12, w12, w13
; CHECK-GI-NEXT:    mov w13, v1.s[1]
; CHECK-GI-NEXT:    mov v4.s[3], w11
; CHECK-GI-NEXT:    mls v0.4s, v4.4s, v2.4s
; CHECK-GI-NEXT:    sdiv w13, w13, w14
; CHECK-GI-NEXT:    mov w14, v1.s[2]
; CHECK-GI-NEXT:    mov v5.s[0], w12
; CHECK-GI-NEXT:    mov w12, v3.s[3]
; CHECK-GI-NEXT:    sdiv w14, w14, w15
; CHECK-GI-NEXT:    mov v5.s[1], w13
; CHECK-GI-NEXT:    sdiv w8, w8, w12
; CHECK-GI-NEXT:    mov v5.s[2], w14
; CHECK-GI-NEXT:    mov v5.s[3], w8
; CHECK-GI-NEXT:    mls v1.4s, v5.4s, v3.4s
; CHECK-GI-NEXT:    ret
entry:
  %s = srem <8 x i32> %d, %e
  ret <8 x i32> %s
}

; Unsigned remainder of <2 x i32>: returns %d urem %e element-wise.
; The default llc run uses a scalar udiv + msub per lane; the -global-isel run
; uses a scalar udiv per lane followed by one vector mls.
define <2 x i32> @uv2i32(<2 x i32> %d, <2 x i32> %e) {
; CHECK-SD-LABEL: uv2i32:
; CHECK-SD:       // %bb.0: // %entry
; CHECK-SD-NEXT:    // kill: def $d1 killed $d1 def $q1
; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
; CHECK-SD-NEXT:    fmov w8, s1
; CHECK-SD-NEXT:    fmov w9, s0
; CHECK-SD-NEXT:    mov w11, v1.s[1]
; CHECK-SD-NEXT:    mov w12, v0.s[1]
; CHECK-SD-NEXT:    udiv w10, w9, w8
; CHECK-SD-NEXT:    udiv w13, w12, w11
; CHECK-SD-NEXT:    msub w8, w10, w8, w9
; CHECK-SD-NEXT:    fmov s0, w8
; CHECK-SD-NEXT:    msub w9, w13, w11, w12
; CHECK-SD-NEXT:    mov v0.s[1], w9
; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 killed $q0
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: uv2i32:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 def $q1
; CHECK-GI-NEXT:    fmov w8, s0
; CHECK-GI-NEXT:    fmov w9, s1
; CHECK-GI-NEXT:    mov w10, v1.s[1]
; CHECK-GI-NEXT:    udiv w8, w8, w9
; CHECK-GI-NEXT:    mov w9, v0.s[1]
; CHECK-GI-NEXT:    udiv w9, w9, w10
; CHECK-GI-NEXT:    mov v2.s[0], w8
; CHECK-GI-NEXT:    mov v2.s[1], w9
; CHECK-GI-NEXT:    mls v0.2s, v2.2s, v1.2s
; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
; CHECK-GI-NEXT:    ret
entry:
  %s = urem <2 x i32> %d, %e
  ret <2 x i32> %s
}

; Unsigned remainder of <3 x i32>: returns %d urem %e element-wise.
; Both expectations compute each of the three lanes with a scalar udiv + msub
; and insert the results into v0.s[0..2]. The -global-isel run first copies
; lanes 1 and 2 into separate s registers before moving them to w registers.
define <3 x i32> @uv3i32(<3 x i32> %d, <3 x i32> %e) {
; CHECK-SD-LABEL: uv3i32:
; CHECK-SD:       // %bb.0: // %entry
; CHECK-SD-NEXT:    fmov w11, s1
; CHECK-SD-NEXT:    fmov w12, s0
; CHECK-SD-NEXT:    mov w8, v1.s[1]
; CHECK-SD-NEXT:    mov w9, v0.s[1]
; CHECK-SD-NEXT:    mov w14, v1.s[2]
; CHECK-SD-NEXT:    mov w15, v0.s[2]
; CHECK-SD-NEXT:    udiv w13, w12, w11
; CHECK-SD-NEXT:    udiv w10, w9, w8
; CHECK-SD-NEXT:    msub w11, w13, w11, w12
; CHECK-SD-NEXT:    fmov s0, w11
; CHECK-SD-NEXT:    udiv w16, w15, w14
; CHECK-SD-NEXT:    msub w8, w10, w8, w9
; CHECK-SD-NEXT:    mov v0.s[1], w8
; CHECK-SD-NEXT:    msub w8, w16, w14, w15
; CHECK-SD-NEXT:    mov v0.s[2], w8
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: uv3i32:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    fmov w8, s0
; CHECK-GI-NEXT:    fmov w9, s1
; CHECK-GI-NEXT:    mov s2, v0.s[1]
; CHECK-GI-NEXT:    mov s3, v1.s[1]
; CHECK-GI-NEXT:    mov s0, v0.s[2]
; CHECK-GI-NEXT:    mov s1, v1.s[2]
; CHECK-GI-NEXT:    udiv w10, w8, w9
; CHECK-GI-NEXT:    fmov w11, s2
; CHECK-GI-NEXT:    fmov w12, s3
; CHECK-GI-NEXT:    fmov w14, s0
; CHECK-GI-NEXT:    fmov w15, s1
; CHECK-GI-NEXT:    udiv w13, w11, w12
; CHECK-GI-NEXT:    msub w8, w10, w9, w8
; CHECK-GI-NEXT:    mov v0.s[0], w8
; CHECK-GI-NEXT:    udiv w9, w14, w15
; CHECK-GI-NEXT:    msub w8, w13, w12, w11
; CHECK-GI-NEXT:    mov v0.s[1], w8
; CHECK-GI-NEXT:    msub w8, w9, w15, w14
; CHECK-GI-NEXT:    mov v0.s[2], w8
; CHECK-GI-NEXT:    ret
entry:
  %s = urem <3 x i32> %d, %e
  ret <3 x i32> %s
}

; Unsigned remainder of <4 x i32>: returns %d urem %e element-wise.
; The default llc run uses a scalar udiv + msub per lane; the -global-isel run
; uses a scalar udiv per lane, gathers the quotients in v2 and finishes with a
; single vector mls.
define <4 x i32> @uv4i32(<4 x i32> %d, <4 x i32> %e) {
; CHECK-SD-LABEL: uv4i32:
; CHECK-SD:       // %bb.0: // %entry
; CHECK-SD-NEXT:    fmov w11, s1
; CHECK-SD-NEXT:    fmov w12, s0
; CHECK-SD-NEXT:    mov w8, v1.s[1]
; CHECK-SD-NEXT:    mov w9, v0.s[1]
; CHECK-SD-NEXT:    mov w14, v1.s[2]
; CHECK-SD-NEXT:    mov w15, v0.s[2]
; CHECK-SD-NEXT:    mov w17, v1.s[3]
; CHECK-SD-NEXT:    mov w18, v0.s[3]
; CHECK-SD-NEXT:    udiv w13, w12, w11
; CHECK-SD-NEXT:    udiv w10, w9, w8
; CHECK-SD-NEXT:    msub w11, w13, w11, w12
; CHECK-SD-NEXT:    fmov s0, w11
; CHECK-SD-NEXT:    udiv w16, w15, w14
; CHECK-SD-NEXT:    msub w8, w10, w8, w9
; CHECK-SD-NEXT:    mov v0.s[1], w8
; CHECK-SD-NEXT:    udiv w9, w18, w17
; CHECK-SD-NEXT:    msub w8, w16, w14, w15
; CHECK-SD-NEXT:    mov v0.s[2], w8
; CHECK-SD-NEXT:    msub w8, w9, w17, w18
; CHECK-SD-NEXT:    mov v0.s[3], w8
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: uv4i32:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    fmov w8, s0
; CHECK-GI-NEXT:    fmov w9, s1
; CHECK-GI-NEXT:    mov w10, v1.s[1]
; CHECK-GI-NEXT:    mov w11, v1.s[2]
; CHECK-GI-NEXT:    mov w12, v1.s[3]
; CHECK-GI-NEXT:    udiv w8, w8, w9
; CHECK-GI-NEXT:    mov w9, v0.s[1]
; CHECK-GI-NEXT:    udiv w9, w9, w10
; CHECK-GI-NEXT:    mov w10, v0.s[2]
; CHECK-GI-NEXT:    mov v2.s[0], w8
; CHECK-GI-NEXT:    udiv w10, w10, w11
; CHECK-GI-NEXT:    mov w11, v0.s[3]
; CHECK-GI-NEXT:    mov v2.s[1], w9
; CHECK-GI-NEXT:    udiv w8, w11, w12
; CHECK-GI-NEXT:    mov v2.s[2], w10
; CHECK-GI-NEXT:    mov v2.s[3], w8
; CHECK-GI-NEXT:    mls v0.4s, v2.4s, v1.4s
; CHECK-GI-NEXT:    ret
entry:
  %s = urem <4 x i32> %d, %e
  ret <4 x i32> %s
}

; Unsigned remainder of <8 x i32>; dividend arrives in v0/v1, divisor in v2/v3.
; SelectionDAG expectation: every lane is moved to a GPR and computed as
; udiv + msub, with x19-x22 spilled to make room for the scalar temporaries.
; GlobalISel expectation: per-lane udiv, quotients re-inserted into v4/v5, then
; a single vector mls per 4-lane half produces the remainder.
define <8 x i32> @uv8i32(<8 x i32> %d, <8 x i32> %e) {
; CHECK-SD-LABEL: uv8i32:
; CHECK-SD:       // %bb.0: // %entry
; CHECK-SD-NEXT:    stp x22, x21, [sp, #-32]! // 16-byte Folded Spill
; CHECK-SD-NEXT:    stp x20, x19, [sp, #16] // 16-byte Folded Spill
; CHECK-SD-NEXT:    .cfi_def_cfa_offset 32
; CHECK-SD-NEXT:    .cfi_offset w19, -8
; CHECK-SD-NEXT:    .cfi_offset w20, -16
; CHECK-SD-NEXT:    .cfi_offset w21, -24
; CHECK-SD-NEXT:    .cfi_offset w22, -32
; CHECK-SD-NEXT:    mov w8, v2.s[1]
; CHECK-SD-NEXT:    mov w9, v0.s[1]
; CHECK-SD-NEXT:    fmov w11, s2
; CHECK-SD-NEXT:    fmov w12, s0
; CHECK-SD-NEXT:    fmov w4, s3
; CHECK-SD-NEXT:    fmov w5, s1
; CHECK-SD-NEXT:    mov w1, v3.s[1]
; CHECK-SD-NEXT:    mov w2, v1.s[1]
; CHECK-SD-NEXT:    mov w14, v2.s[2]
; CHECK-SD-NEXT:    mov w15, v0.s[2]
; CHECK-SD-NEXT:    mov w7, v3.s[2]
; CHECK-SD-NEXT:    mov w19, v1.s[2]
; CHECK-SD-NEXT:    udiv w10, w9, w8
; CHECK-SD-NEXT:    mov w17, v2.s[3]
; CHECK-SD-NEXT:    mov w18, v0.s[3]
; CHECK-SD-NEXT:    mov w21, v3.s[3]
; CHECK-SD-NEXT:    mov w22, v1.s[3]
; CHECK-SD-NEXT:    udiv w13, w12, w11
; CHECK-SD-NEXT:    msub w8, w10, w8, w9
; CHECK-SD-NEXT:    udiv w6, w5, w4
; CHECK-SD-NEXT:    msub w9, w13, w11, w12
; CHECK-SD-NEXT:    fmov s0, w9
; CHECK-SD-NEXT:    mov v0.s[1], w8
; CHECK-SD-NEXT:    udiv w3, w2, w1
; CHECK-SD-NEXT:    msub w10, w6, w4, w5
; CHECK-SD-NEXT:    fmov s1, w10
; CHECK-SD-NEXT:    udiv w16, w15, w14
; CHECK-SD-NEXT:    msub w11, w3, w1, w2
; CHECK-SD-NEXT:    mov v1.s[1], w11
; CHECK-SD-NEXT:    udiv w20, w19, w7
; CHECK-SD-NEXT:    msub w9, w16, w14, w15
; CHECK-SD-NEXT:    mov v0.s[2], w9
; CHECK-SD-NEXT:    udiv w0, w18, w17
; CHECK-SD-NEXT:    msub w8, w20, w7, w19
; CHECK-SD-NEXT:    ldp x20, x19, [sp, #16] // 16-byte Folded Reload
; CHECK-SD-NEXT:    mov v1.s[2], w8
; CHECK-SD-NEXT:    udiv w12, w22, w21
; CHECK-SD-NEXT:    msub w10, w0, w17, w18
; CHECK-SD-NEXT:    mov v0.s[3], w10
; CHECK-SD-NEXT:    msub w8, w12, w21, w22
; CHECK-SD-NEXT:    mov v1.s[3], w8
; CHECK-SD-NEXT:    ldp x22, x21, [sp], #32 // 16-byte Folded Reload
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: uv8i32:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    fmov w8, s0
; CHECK-GI-NEXT:    fmov w9, s2
; CHECK-GI-NEXT:    mov w10, v2.s[1]
; CHECK-GI-NEXT:    mov w11, v2.s[2]
; CHECK-GI-NEXT:    mov w12, v2.s[3]
; CHECK-GI-NEXT:    fmov w13, s3
; CHECK-GI-NEXT:    mov w14, v3.s[1]
; CHECK-GI-NEXT:    mov w15, v3.s[2]
; CHECK-GI-NEXT:    udiv w8, w8, w9
; CHECK-GI-NEXT:    mov w9, v0.s[1]
; CHECK-GI-NEXT:    udiv w9, w9, w10
; CHECK-GI-NEXT:    mov w10, v0.s[2]
; CHECK-GI-NEXT:    mov v4.s[0], w8
; CHECK-GI-NEXT:    mov w8, v1.s[3]
; CHECK-GI-NEXT:    udiv w10, w10, w11
; CHECK-GI-NEXT:    mov w11, v0.s[3]
; CHECK-GI-NEXT:    mov v4.s[1], w9
; CHECK-GI-NEXT:    udiv w11, w11, w12
; CHECK-GI-NEXT:    fmov w12, s1
; CHECK-GI-NEXT:    mov v4.s[2], w10
; CHECK-GI-NEXT:    udiv w12, w12, w13
; CHECK-GI-NEXT:    mov w13, v1.s[1]
; CHECK-GI-NEXT:    mov v4.s[3], w11
; CHECK-GI-NEXT:    mls v0.4s, v4.4s, v2.4s
; CHECK-GI-NEXT:    udiv w13, w13, w14
; CHECK-GI-NEXT:    mov w14, v1.s[2]
; CHECK-GI-NEXT:    mov v5.s[0], w12
; CHECK-GI-NEXT:    mov w12, v3.s[3]
; CHECK-GI-NEXT:    udiv w14, w14, w15
; CHECK-GI-NEXT:    mov v5.s[1], w13
; CHECK-GI-NEXT:    udiv w8, w8, w12
; CHECK-GI-NEXT:    mov v5.s[2], w14
; CHECK-GI-NEXT:    mov v5.s[3], w8
; CHECK-GI-NEXT:    mls v1.4s, v5.4s, v3.4s
; CHECK-GI-NEXT:    ret
entry:
  %s = urem <8 x i32> %d, %e
  ret <8 x i32> %s
}

; Signed remainder of <2 x i64>; dividend in v0, divisor in v1.
; SelectionDAG expectation: both lanes are scalarized into sdiv + msub and the
; results are re-packed into v0.
; GlobalISel expectation: per-lane sdiv, then the quotient * divisor product is
; also formed lane-by-lane with scalar mul before a single vector sub.
define <2 x i64> @sv2i64(<2 x i64> %d, <2 x i64> %e) {
; CHECK-SD-LABEL: sv2i64:
; CHECK-SD:       // %bb.0: // %entry
; CHECK-SD-NEXT:    fmov x8, d1
; CHECK-SD-NEXT:    fmov x9, d0
; CHECK-SD-NEXT:    mov x11, v1.d[1]
; CHECK-SD-NEXT:    mov x12, v0.d[1]
; CHECK-SD-NEXT:    sdiv x10, x9, x8
; CHECK-SD-NEXT:    sdiv x13, x12, x11
; CHECK-SD-NEXT:    msub x8, x10, x8, x9
; CHECK-SD-NEXT:    fmov d0, x8
; CHECK-SD-NEXT:    msub x9, x13, x11, x12
; CHECK-SD-NEXT:    mov v0.d[1], x9
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: sv2i64:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    fmov x8, d0
; CHECK-GI-NEXT:    fmov x9, d1
; CHECK-GI-NEXT:    mov x10, v1.d[1]
; CHECK-GI-NEXT:    mov x11, v0.d[1]
; CHECK-GI-NEXT:    sdiv x8, x8, x9
; CHECK-GI-NEXT:    sdiv x11, x11, x10
; CHECK-GI-NEXT:    mov v1.d[0], x8
; CHECK-GI-NEXT:    mov v1.d[1], x11
; CHECK-GI-NEXT:    fmov x8, d1
; CHECK-GI-NEXT:    mov x11, v1.d[1]
; CHECK-GI-NEXT:    mul x8, x8, x9
; CHECK-GI-NEXT:    mul x9, x11, x10
; CHECK-GI-NEXT:    mov v1.d[0], x8
; CHECK-GI-NEXT:    mov v1.d[1], x9
; CHECK-GI-NEXT:    sub v0.2d, v0.2d, v1.2d
; CHECK-GI-NEXT:    ret
entry:
  %s = srem <2 x i64> %d, %e
  ret <2 x i64> %s
}

; Signed remainder of <3 x i64>; each element travels in its own d register
; (dividend d0-d2, divisor d3-d5) and the result is returned in d0-d2.
; SelectionDAG expectation: three independent scalar sdiv + msub sequences.
; GlobalISel expectation: lanes 0 and 1 are packed into 2-element vectors
; (sdiv per lane, scalar mul per lane, vector sub), while lane 2 stays scalar
; as sdiv + msub.
define <3 x i64> @sv3i64(<3 x i64> %d, <3 x i64> %e) {
; CHECK-SD-LABEL: sv3i64:
; CHECK-SD:       // %bb.0: // %entry
; CHECK-SD-NEXT:    // kill: def $d5 killed $d5 def $q5
; CHECK-SD-NEXT:    // kill: def $d4 killed $d4 def $q4
; CHECK-SD-NEXT:    // kill: def $d3 killed $d3 def $q3
; CHECK-SD-NEXT:    // kill: def $d2 killed $d2 def $q2
; CHECK-SD-NEXT:    // kill: def $d1 killed $d1 def $q1
; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
; CHECK-SD-NEXT:    fmov x8, d3
; CHECK-SD-NEXT:    fmov x9, d0
; CHECK-SD-NEXT:    fmov x11, d4
; CHECK-SD-NEXT:    fmov x12, d1
; CHECK-SD-NEXT:    fmov x14, d5
; CHECK-SD-NEXT:    fmov x15, d2
; CHECK-SD-NEXT:    sdiv x10, x9, x8
; CHECK-SD-NEXT:    sdiv x13, x12, x11
; CHECK-SD-NEXT:    msub x8, x10, x8, x9
; CHECK-SD-NEXT:    fmov d0, x8
; CHECK-SD-NEXT:    sdiv x16, x15, x14
; CHECK-SD-NEXT:    msub x9, x13, x11, x12
; CHECK-SD-NEXT:    fmov d1, x9
; CHECK-SD-NEXT:    msub x10, x16, x14, x15
; CHECK-SD-NEXT:    fmov d2, x10
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: sv3i64:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
; CHECK-GI-NEXT:    // kill: def $d3 killed $d3 def $q3
; CHECK-GI-NEXT:    fmov x8, d0
; CHECK-GI-NEXT:    fmov x9, d3
; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 def $q1
; CHECK-GI-NEXT:    // kill: def $d4 killed $d4 def $q4
; CHECK-GI-NEXT:    fmov x10, d4
; CHECK-GI-NEXT:    mov v3.d[1], v4.d[0]
; CHECK-GI-NEXT:    mov v0.d[1], v1.d[0]
; CHECK-GI-NEXT:    sdiv x8, x8, x9
; CHECK-GI-NEXT:    fmov x9, d1
; CHECK-GI-NEXT:    fmov x11, d3
; CHECK-GI-NEXT:    mov x14, v3.d[1]
; CHECK-GI-NEXT:    sdiv x9, x9, x10
; CHECK-GI-NEXT:    mov v6.d[0], x8
; CHECK-GI-NEXT:    fmov x8, d2
; CHECK-GI-NEXT:    mov v6.d[1], x9
; CHECK-GI-NEXT:    fmov x9, d5
; CHECK-GI-NEXT:    sdiv x12, x8, x9
; CHECK-GI-NEXT:    fmov x10, d6
; CHECK-GI-NEXT:    mov x13, v6.d[1]
; CHECK-GI-NEXT:    mul x10, x10, x11
; CHECK-GI-NEXT:    mul x11, x13, x14
; CHECK-GI-NEXT:    mov v2.d[0], x10
; CHECK-GI-NEXT:    mov v2.d[1], x11
; CHECK-GI-NEXT:    msub x8, x12, x9, x8
; CHECK-GI-NEXT:    sub v0.2d, v0.2d, v2.2d
; CHECK-GI-NEXT:    mov d1, v0.d[1]
; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
; CHECK-GI-NEXT:    fmov d2, x8
; CHECK-GI-NEXT:    ret
entry:
  %s = srem <3 x i64> %d, %e
  ret <3 x i64> %s
}

; Signed remainder of <4 x i64>; dividend in v0/v1, divisor in v2/v3.
; SelectionDAG expectation: four scalar sdiv + msub pairs, results re-packed
; into v0 and v1.
; GlobalISel expectation: four scalar sdiv, the quotient * divisor products are
; formed with scalar mul per lane, then one vector sub per 2-lane half.
define <4 x i64> @sv4i64(<4 x i64> %d, <4 x i64> %e) {
; CHECK-SD-LABEL: sv4i64:
; CHECK-SD:       // %bb.0: // %entry
; CHECK-SD-NEXT:    mov x8, v2.d[1]
; CHECK-SD-NEXT:    mov x9, v0.d[1]
; CHECK-SD-NEXT:    fmov x11, d2
; CHECK-SD-NEXT:    fmov x12, d0
; CHECK-SD-NEXT:    fmov x14, d3
; CHECK-SD-NEXT:    fmov x15, d1
; CHECK-SD-NEXT:    mov x17, v3.d[1]
; CHECK-SD-NEXT:    mov x18, v1.d[1]
; CHECK-SD-NEXT:    sdiv x10, x9, x8
; CHECK-SD-NEXT:    sdiv x13, x12, x11
; CHECK-SD-NEXT:    msub x8, x10, x8, x9
; CHECK-SD-NEXT:    sdiv x16, x15, x14
; CHECK-SD-NEXT:    msub x9, x13, x11, x12
; CHECK-SD-NEXT:    fmov d0, x9
; CHECK-SD-NEXT:    mov v0.d[1], x8
; CHECK-SD-NEXT:    sdiv x0, x18, x17
; CHECK-SD-NEXT:    msub x10, x16, x14, x15
; CHECK-SD-NEXT:    fmov d1, x10
; CHECK-SD-NEXT:    msub x11, x0, x17, x18
; CHECK-SD-NEXT:    mov v1.d[1], x11
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: sv4i64:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    fmov x8, d0
; CHECK-GI-NEXT:    fmov x9, d2
; CHECK-GI-NEXT:    mov x10, v2.d[1]
; CHECK-GI-NEXT:    mov x11, v0.d[1]
; CHECK-GI-NEXT:    fmov x12, d1
; CHECK-GI-NEXT:    fmov x13, d3
; CHECK-GI-NEXT:    mov x14, v3.d[1]
; CHECK-GI-NEXT:    mov x15, v1.d[1]
; CHECK-GI-NEXT:    sdiv x8, x8, x9
; CHECK-GI-NEXT:    sdiv x12, x12, x13
; CHECK-GI-NEXT:    mov v2.d[0], x8
; CHECK-GI-NEXT:    sdiv x11, x11, x10
; CHECK-GI-NEXT:    mov v3.d[0], x12
; CHECK-GI-NEXT:    sdiv x15, x15, x14
; CHECK-GI-NEXT:    mov v2.d[1], x11
; CHECK-GI-NEXT:    fmov x8, d2
; CHECK-GI-NEXT:    mov x11, v2.d[1]
; CHECK-GI-NEXT:    mul x8, x8, x9
; CHECK-GI-NEXT:    mul x10, x11, x10
; CHECK-GI-NEXT:    mov v2.d[0], x8
; CHECK-GI-NEXT:    mov v3.d[1], x15
; CHECK-GI-NEXT:    mov v2.d[1], x10
; CHECK-GI-NEXT:    fmov x9, d3
; CHECK-GI-NEXT:    mov x12, v3.d[1]
; CHECK-GI-NEXT:    sub v0.2d, v0.2d, v2.2d
; CHECK-GI-NEXT:    mul x9, x9, x13
; CHECK-GI-NEXT:    mul x11, x12, x14
; CHECK-GI-NEXT:    mov v3.d[0], x9
; CHECK-GI-NEXT:    mov v3.d[1], x11
; CHECK-GI-NEXT:    sub v1.2d, v1.2d, v3.2d
; CHECK-GI-NEXT:    ret
entry:
  %s = srem <4 x i64> %d, %e
  ret <4 x i64> %s
}

; Unsigned remainder of <2 x i64>; dividend in v0, divisor in v1.
; SelectionDAG expectation: both lanes are scalarized into udiv + msub and the
; results are re-packed into v0.
; GlobalISel expectation: per-lane udiv, then the quotient * divisor product is
; also formed lane-by-lane with scalar mul before a single vector sub.
define <2 x i64> @uv2i64(<2 x i64> %d, <2 x i64> %e) {
; CHECK-SD-LABEL: uv2i64:
; CHECK-SD:       // %bb.0: // %entry
; CHECK-SD-NEXT:    fmov x8, d1
; CHECK-SD-NEXT:    fmov x9, d0
; CHECK-SD-NEXT:    mov x11, v1.d[1]
; CHECK-SD-NEXT:    mov x12, v0.d[1]
; CHECK-SD-NEXT:    udiv x10, x9, x8
; CHECK-SD-NEXT:    udiv x13, x12, x11
; CHECK-SD-NEXT:    msub x8, x10, x8, x9
; CHECK-SD-NEXT:    fmov d0, x8
; CHECK-SD-NEXT:    msub x9, x13, x11, x12
; CHECK-SD-NEXT:    mov v0.d[1], x9
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: uv2i64:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    fmov x8, d0
; CHECK-GI-NEXT:    fmov x9, d1
; CHECK-GI-NEXT:    mov x10, v1.d[1]
; CHECK-GI-NEXT:    mov x11, v0.d[1]
; CHECK-GI-NEXT:    udiv x8, x8, x9
; CHECK-GI-NEXT:    udiv x11, x11, x10
; CHECK-GI-NEXT:    mov v1.d[0], x8
; CHECK-GI-NEXT:    mov v1.d[1], x11
; CHECK-GI-NEXT:    fmov x8, d1
; CHECK-GI-NEXT:    mov x11, v1.d[1]
; CHECK-GI-NEXT:    mul x8, x8, x9
; CHECK-GI-NEXT:    mul x9, x11, x10
; CHECK-GI-NEXT:    mov v1.d[0], x8
; CHECK-GI-NEXT:    mov v1.d[1], x9
; CHECK-GI-NEXT:    sub v0.2d, v0.2d, v1.2d
; CHECK-GI-NEXT:    ret
entry:
  %s = urem <2 x i64> %d, %e
  ret <2 x i64> %s
}

; Unsigned remainder of <3 x i64>; each element travels in its own d register
; (dividend d0-d2, divisor d3-d5) and the result is returned in d0-d2.
; SelectionDAG expectation: three independent scalar udiv + msub sequences.
; GlobalISel expectation: lanes 0 and 1 are packed into 2-element vectors
; (udiv per lane, scalar mul per lane, vector sub), while lane 2 stays scalar
; as udiv + msub.
define <3 x i64> @uv3i64(<3 x i64> %d, <3 x i64> %e) {
; CHECK-SD-LABEL: uv3i64:
; CHECK-SD:       // %bb.0: // %entry
; CHECK-SD-NEXT:    // kill: def $d5 killed $d5 def $q5
; CHECK-SD-NEXT:    // kill: def $d4 killed $d4 def $q4
; CHECK-SD-NEXT:    // kill: def $d3 killed $d3 def $q3
; CHECK-SD-NEXT:    // kill: def $d2 killed $d2 def $q2
; CHECK-SD-NEXT:    // kill: def $d1 killed $d1 def $q1
; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
; CHECK-SD-NEXT:    fmov x8, d3
; CHECK-SD-NEXT:    fmov x9, d0
; CHECK-SD-NEXT:    fmov x11, d4
; CHECK-SD-NEXT:    fmov x12, d1
; CHECK-SD-NEXT:    fmov x14, d5
; CHECK-SD-NEXT:    fmov x15, d2
; CHECK-SD-NEXT:    udiv x10, x9, x8
; CHECK-SD-NEXT:    udiv x13, x12, x11
; CHECK-SD-NEXT:    msub x8, x10, x8, x9
; CHECK-SD-NEXT:    fmov d0, x8
; CHECK-SD-NEXT:    udiv x16, x15, x14
; CHECK-SD-NEXT:    msub x9, x13, x11, x12
; CHECK-SD-NEXT:    fmov d1, x9
; CHECK-SD-NEXT:    msub x10, x16, x14, x15
; CHECK-SD-NEXT:    fmov d2, x10
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: uv3i64:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
; CHECK-GI-NEXT:    // kill: def $d3 killed $d3 def $q3
; CHECK-GI-NEXT:    fmov x8, d0
; CHECK-GI-NEXT:    fmov x9, d3
; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 def $q1
; CHECK-GI-NEXT:    // kill: def $d4 killed $d4 def $q4
; CHECK-GI-NEXT:    fmov x10, d4
; CHECK-GI-NEXT:    mov v3.d[1], v4.d[0]
; CHECK-GI-NEXT:    mov v0.d[1], v1.d[0]
; CHECK-GI-NEXT:    udiv x8, x8, x9
; CHECK-GI-NEXT:    fmov x9, d1
; CHECK-GI-NEXT:    fmov x11, d3
; CHECK-GI-NEXT:    mov x14, v3.d[1]
; CHECK-GI-NEXT:    udiv x9, x9, x10
; CHECK-GI-NEXT:    mov v6.d[0], x8
; CHECK-GI-NEXT:    fmov x8, d2
; CHECK-GI-NEXT:    mov v6.d[1], x9
; CHECK-GI-NEXT:    fmov x9, d5
; CHECK-GI-NEXT:    udiv x12, x8, x9
; CHECK-GI-NEXT:    fmov x10, d6
; CHECK-GI-NEXT:    mov x13, v6.d[1]
; CHECK-GI-NEXT:    mul x10, x10, x11
; CHECK-GI-NEXT:    mul x11, x13, x14
; CHECK-GI-NEXT:    mov v2.d[0], x10
; CHECK-GI-NEXT:    mov v2.d[1], x11
; CHECK-GI-NEXT:    msub x8, x12, x9, x8
; CHECK-GI-NEXT:    sub v0.2d, v0.2d, v2.2d
; CHECK-GI-NEXT:    mov d1, v0.d[1]
; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
; CHECK-GI-NEXT:    fmov d2, x8
; CHECK-GI-NEXT:    ret
entry:
  %s = urem <3 x i64> %d, %e
  ret <3 x i64> %s
}

; Unsigned remainder of <4 x i64>; dividend in v0/v1, divisor in v2/v3.
; SelectionDAG expectation: four scalar udiv + msub pairs, results re-packed
; into v0 and v1.
; GlobalISel expectation: four scalar udiv, the quotient * divisor products are
; formed with scalar mul per lane, then one vector sub per 2-lane half.
define <4 x i64> @uv4i64(<4 x i64> %d, <4 x i64> %e) {
; CHECK-SD-LABEL: uv4i64:
; CHECK-SD:       // %bb.0: // %entry
; CHECK-SD-NEXT:    mov x8, v2.d[1]
; CHECK-SD-NEXT:    mov x9, v0.d[1]
; CHECK-SD-NEXT:    fmov x11, d2
; CHECK-SD-NEXT:    fmov x12, d0
; CHECK-SD-NEXT:    fmov x14, d3
; CHECK-SD-NEXT:    fmov x15, d1
; CHECK-SD-NEXT:    mov x17, v3.d[1]
; CHECK-SD-NEXT:    mov x18, v1.d[1]
; CHECK-SD-NEXT:    udiv x10, x9, x8
; CHECK-SD-NEXT:    udiv x13, x12, x11
; CHECK-SD-NEXT:    msub x8, x10, x8, x9
; CHECK-SD-NEXT:    udiv x16, x15, x14
; CHECK-SD-NEXT:    msub x9, x13, x11, x12
; CHECK-SD-NEXT:    fmov d0, x9
; CHECK-SD-NEXT:    mov v0.d[1], x8
; CHECK-SD-NEXT:    udiv x0, x18, x17
; CHECK-SD-NEXT:    msub x10, x16, x14, x15
; CHECK-SD-NEXT:    fmov d1, x10
; CHECK-SD-NEXT:    msub x11, x0, x17, x18
; CHECK-SD-NEXT:    mov v1.d[1], x11
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: uv4i64:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    fmov x8, d0
; CHECK-GI-NEXT:    fmov x9, d2
; CHECK-GI-NEXT:    mov x10, v2.d[1]
; CHECK-GI-NEXT:    mov x11, v0.d[1]
; CHECK-GI-NEXT:    fmov x12, d1
; CHECK-GI-NEXT:    fmov x13, d3
; CHECK-GI-NEXT:    mov x14, v3.d[1]
; CHECK-GI-NEXT:    mov x15, v1.d[1]
; CHECK-GI-NEXT:    udiv x8, x8, x9
; CHECK-GI-NEXT:    udiv x12, x12, x13
; CHECK-GI-NEXT:    mov v2.d[0], x8
; CHECK-GI-NEXT:    udiv x11, x11, x10
; CHECK-GI-NEXT:    mov v3.d[0], x12
; CHECK-GI-NEXT:    udiv x15, x15, x14
; CHECK-GI-NEXT:    mov v2.d[1], x11
; CHECK-GI-NEXT:    fmov x8, d2
; CHECK-GI-NEXT:    mov x11, v2.d[1]
; CHECK-GI-NEXT:    mul x8, x8, x9
; CHECK-GI-NEXT:    mul x10, x11, x10
; CHECK-GI-NEXT:    mov v2.d[0], x8
; CHECK-GI-NEXT:    mov v3.d[1], x15
; CHECK-GI-NEXT:    mov v2.d[1], x10
; CHECK-GI-NEXT:    fmov x9, d3
; CHECK-GI-NEXT:    mov x12, v3.d[1]
; CHECK-GI-NEXT:    sub v0.2d, v0.2d, v2.2d
; CHECK-GI-NEXT:    mul x9, x9, x13
; CHECK-GI-NEXT:    mul x11, x12, x14
; CHECK-GI-NEXT:    mov v3.d[0], x9
; CHECK-GI-NEXT:    mov v3.d[1], x11
; CHECK-GI-NEXT:    sub v1.2d, v1.2d, v3.2d
; CHECK-GI-NEXT:    ret
entry:
  %s = urem <4 x i64> %d, %e
  ret <4 x i64> %s
}

; Signed remainder of <2 x i128>; all operands arrive in x0-x7.
; Both selectors are expected to lower this to two calls to __modti3, one per
; element: element 0 uses x0:x1 with the divisor moved from x4:x5, element 1 is
; parked in callee-saved registers across the first call. The first result is
; held in x23:x24 and the pair of results is returned in x0-x3.
; The two outputs differ only in which callee-saved registers hold element 1.
define <2 x i128> @sv2i128(<2 x i128> %d, <2 x i128> %e) {
; CHECK-SD-LABEL: sv2i128:
; CHECK-SD:       // %bb.0: // %entry
; CHECK-SD-NEXT:    str x30, [sp, #-64]! // 8-byte Folded Spill
; CHECK-SD-NEXT:    stp x24, x23, [sp, #16] // 16-byte Folded Spill
; CHECK-SD-NEXT:    stp x22, x21, [sp, #32] // 16-byte Folded Spill
; CHECK-SD-NEXT:    stp x20, x19, [sp, #48] // 16-byte Folded Spill
; CHECK-SD-NEXT:    .cfi_def_cfa_offset 64
; CHECK-SD-NEXT:    .cfi_offset w19, -8
; CHECK-SD-NEXT:    .cfi_offset w20, -16
; CHECK-SD-NEXT:    .cfi_offset w21, -24
; CHECK-SD-NEXT:    .cfi_offset w22, -32
; CHECK-SD-NEXT:    .cfi_offset w23, -40
; CHECK-SD-NEXT:    .cfi_offset w24, -48
; CHECK-SD-NEXT:    .cfi_offset w30, -64
; CHECK-SD-NEXT:    mov x21, x3
; CHECK-SD-NEXT:    mov x22, x2
; CHECK-SD-NEXT:    mov x2, x4
; CHECK-SD-NEXT:    mov x3, x5
; CHECK-SD-NEXT:    mov x19, x7
; CHECK-SD-NEXT:    mov x20, x6
; CHECK-SD-NEXT:    bl __modti3
; CHECK-SD-NEXT:    mov x23, x0
; CHECK-SD-NEXT:    mov x24, x1
; CHECK-SD-NEXT:    mov x0, x22
; CHECK-SD-NEXT:    mov x1, x21
; CHECK-SD-NEXT:    mov x2, x20
; CHECK-SD-NEXT:    mov x3, x19
; CHECK-SD-NEXT:    bl __modti3
; CHECK-SD-NEXT:    mov x2, x0
; CHECK-SD-NEXT:    mov x3, x1
; CHECK-SD-NEXT:    mov x0, x23
; CHECK-SD-NEXT:    mov x1, x24
; CHECK-SD-NEXT:    ldp x20, x19, [sp, #48] // 16-byte Folded Reload
; CHECK-SD-NEXT:    ldp x22, x21, [sp, #32] // 16-byte Folded Reload
; CHECK-SD-NEXT:    ldp x24, x23, [sp, #16] // 16-byte Folded Reload
; CHECK-SD-NEXT:    ldr x30, [sp], #64 // 8-byte Folded Reload
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: sv2i128:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    str x30, [sp, #-64]! // 8-byte Folded Spill
; CHECK-GI-NEXT:    stp x24, x23, [sp, #16] // 16-byte Folded Spill
; CHECK-GI-NEXT:    stp x22, x21, [sp, #32] // 16-byte Folded Spill
; CHECK-GI-NEXT:    stp x20, x19, [sp, #48] // 16-byte Folded Spill
; CHECK-GI-NEXT:    .cfi_def_cfa_offset 64
; CHECK-GI-NEXT:    .cfi_offset w19, -8
; CHECK-GI-NEXT:    .cfi_offset w20, -16
; CHECK-GI-NEXT:    .cfi_offset w21, -24
; CHECK-GI-NEXT:    .cfi_offset w22, -32
; CHECK-GI-NEXT:    .cfi_offset w23, -40
; CHECK-GI-NEXT:    .cfi_offset w24, -48
; CHECK-GI-NEXT:    .cfi_offset w30, -64
; CHECK-GI-NEXT:    mov x19, x2
; CHECK-GI-NEXT:    mov x20, x3
; CHECK-GI-NEXT:    mov x2, x4
; CHECK-GI-NEXT:    mov x3, x5
; CHECK-GI-NEXT:    mov x21, x6
; CHECK-GI-NEXT:    mov x22, x7
; CHECK-GI-NEXT:    bl __modti3
; CHECK-GI-NEXT:    mov x23, x0
; CHECK-GI-NEXT:    mov x24, x1
; CHECK-GI-NEXT:    mov x0, x19
; CHECK-GI-NEXT:    mov x1, x20
; CHECK-GI-NEXT:    mov x2, x21
; CHECK-GI-NEXT:    mov x3, x22
; CHECK-GI-NEXT:    bl __modti3
; CHECK-GI-NEXT:    mov x2, x0
; CHECK-GI-NEXT:    mov x3, x1
; CHECK-GI-NEXT:    mov x0, x23
; CHECK-GI-NEXT:    mov x1, x24
; CHECK-GI-NEXT:    ldp x20, x19, [sp, #48] // 16-byte Folded Reload
; CHECK-GI-NEXT:    ldp x22, x21, [sp, #32] // 16-byte Folded Reload
; CHECK-GI-NEXT:    ldp x24, x23, [sp, #16] // 16-byte Folded Reload
; CHECK-GI-NEXT:    ldr x30, [sp], #64 // 8-byte Folded Reload
; CHECK-GI-NEXT:    ret
entry:
  %s = srem <2 x i128> %d, %e
  ret <2 x i128> %s
}

; Signed remainder of <3 x i128>.
; Both selectors are expected to lower this to three calls to __modti3, one per
; element. The dividend occupies x0-x5 and the first divisor element x6:x7; the
; remaining two divisor elements are loaded from [sp, #96] and [sp, #112],
; i.e. just above the 96-byte frame (presumably the incoming stack arguments).
; Intermediate results are kept in callee-saved registers (x19-x28 are spilled)
; and the three results are returned in x0-x5.
define <3 x i128> @sv3i128(<3 x i128> %d, <3 x i128> %e) {
; CHECK-SD-LABEL: sv3i128:
; CHECK-SD:       // %bb.0: // %entry
; CHECK-SD-NEXT:    str x30, [sp, #-96]! // 8-byte Folded Spill
; CHECK-SD-NEXT:    stp x28, x27, [sp, #16] // 16-byte Folded Spill
; CHECK-SD-NEXT:    stp x26, x25, [sp, #32] // 16-byte Folded Spill
; CHECK-SD-NEXT:    stp x24, x23, [sp, #48] // 16-byte Folded Spill
; CHECK-SD-NEXT:    stp x22, x21, [sp, #64] // 16-byte Folded Spill
; CHECK-SD-NEXT:    stp x20, x19, [sp, #80] // 16-byte Folded Spill
; CHECK-SD-NEXT:    .cfi_def_cfa_offset 96
; CHECK-SD-NEXT:    .cfi_offset w19, -8
; CHECK-SD-NEXT:    .cfi_offset w20, -16
; CHECK-SD-NEXT:    .cfi_offset w21, -24
; CHECK-SD-NEXT:    .cfi_offset w22, -32
; CHECK-SD-NEXT:    .cfi_offset w23, -40
; CHECK-SD-NEXT:    .cfi_offset w24, -48
; CHECK-SD-NEXT:    .cfi_offset w25, -56
; CHECK-SD-NEXT:    .cfi_offset w26, -64
; CHECK-SD-NEXT:    .cfi_offset w27, -72
; CHECK-SD-NEXT:    .cfi_offset w28, -80
; CHECK-SD-NEXT:    .cfi_offset w30, -96
; CHECK-SD-NEXT:    ldp x23, x24, [sp, #112]
; CHECK-SD-NEXT:    mov x21, x3
; CHECK-SD-NEXT:    ldp x25, x26, [sp, #96]
; CHECK-SD-NEXT:    mov x22, x2
; CHECK-SD-NEXT:    mov x2, x6
; CHECK-SD-NEXT:    mov x3, x7
; CHECK-SD-NEXT:    mov x19, x5
; CHECK-SD-NEXT:    mov x20, x4
; CHECK-SD-NEXT:    bl __modti3
; CHECK-SD-NEXT:    mov x27, x0
; CHECK-SD-NEXT:    mov x28, x1
; CHECK-SD-NEXT:    mov x0, x22
; CHECK-SD-NEXT:    mov x1, x21
; CHECK-SD-NEXT:    mov x2, x25
; CHECK-SD-NEXT:    mov x3, x26
; CHECK-SD-NEXT:    bl __modti3
; CHECK-SD-NEXT:    mov x21, x0
; CHECK-SD-NEXT:    mov x22, x1
; CHECK-SD-NEXT:    mov x0, x20
; CHECK-SD-NEXT:    mov x1, x19
; CHECK-SD-NEXT:    mov x2, x23
; CHECK-SD-NEXT:    mov x3, x24
; CHECK-SD-NEXT:    bl __modti3
; CHECK-SD-NEXT:    mov x4, x0
; CHECK-SD-NEXT:    mov x5, x1
; CHECK-SD-NEXT:    mov x0, x27
; CHECK-SD-NEXT:    mov x1, x28
; CHECK-SD-NEXT:    mov x2, x21
; CHECK-SD-NEXT:    mov x3, x22
; CHECK-SD-NEXT:    ldp x20, x19, [sp, #80] // 16-byte Folded Reload
; CHECK-SD-NEXT:    ldp x22, x21, [sp, #64] // 16-byte Folded Reload
; CHECK-SD-NEXT:    ldp x24, x23, [sp, #48] // 16-byte Folded Reload
; CHECK-SD-NEXT:    ldp x26, x25, [sp, #32] // 16-byte Folded Reload
; CHECK-SD-NEXT:    ldp x28, x27, [sp, #16] // 16-byte Folded Reload
; CHECK-SD-NEXT:    ldr x30, [sp], #96 // 8-byte Folded Reload
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: sv3i128:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    str x30, [sp, #-96]! // 8-byte Folded Spill
; CHECK-GI-NEXT:    stp x28, x27, [sp, #16] // 16-byte Folded Spill
; CHECK-GI-NEXT:    stp x26, x25, [sp, #32] // 16-byte Folded Spill
; CHECK-GI-NEXT:    stp x24, x23, [sp, #48] // 16-byte Folded Spill
; CHECK-GI-NEXT:    stp x22, x21, [sp, #64] // 16-byte Folded Spill
; CHECK-GI-NEXT:    stp x20, x19, [sp, #80] // 16-byte Folded Spill
; CHECK-GI-NEXT:    .cfi_def_cfa_offset 96
; CHECK-GI-NEXT:    .cfi_offset w19, -8
; CHECK-GI-NEXT:    .cfi_offset w20, -16
; CHECK-GI-NEXT:    .cfi_offset w21, -24
; CHECK-GI-NEXT:    .cfi_offset w22, -32
; CHECK-GI-NEXT:    .cfi_offset w23, -40
; CHECK-GI-NEXT:    .cfi_offset w24, -48
; CHECK-GI-NEXT:    .cfi_offset w25, -56
; CHECK-GI-NEXT:    .cfi_offset w26, -64
; CHECK-GI-NEXT:    .cfi_offset w27, -72
; CHECK-GI-NEXT:    .cfi_offset w28, -80
; CHECK-GI-NEXT:    .cfi_offset w30, -96
; CHECK-GI-NEXT:    ldp x23, x24, [sp, #96]
; CHECK-GI-NEXT:    mov x19, x2
; CHECK-GI-NEXT:    ldp x25, x26, [sp, #112]
; CHECK-GI-NEXT:    mov x20, x3
; CHECK-GI-NEXT:    mov x2, x6
; CHECK-GI-NEXT:    mov x3, x7
; CHECK-GI-NEXT:    mov x21, x4
; CHECK-GI-NEXT:    mov x22, x5
; CHECK-GI-NEXT:    bl __modti3
; CHECK-GI-NEXT:    mov x27, x0
; CHECK-GI-NEXT:    mov x28, x1
; CHECK-GI-NEXT:    mov x0, x19
; CHECK-GI-NEXT:    mov x1, x20
; CHECK-GI-NEXT:    mov x2, x23
; CHECK-GI-NEXT:    mov x3, x24
; CHECK-GI-NEXT:    bl __modti3
; CHECK-GI-NEXT:    mov x19, x0
; CHECK-GI-NEXT:    mov x20, x1
; CHECK-GI-NEXT:    mov x0, x21
; CHECK-GI-NEXT:    mov x1, x22
; CHECK-GI-NEXT:    mov x2, x25
; CHECK-GI-NEXT:    mov x3, x26
; CHECK-GI-NEXT:    bl __modti3
; CHECK-GI-NEXT:    mov x4, x0
; CHECK-GI-NEXT:    mov x5, x1
; CHECK-GI-NEXT:    mov x0, x27
; CHECK-GI-NEXT:    mov x1, x28
; CHECK-GI-NEXT:    mov x2, x19
; CHECK-GI-NEXT:    mov x3, x20
; CHECK-GI-NEXT:    ldp x20, x19, [sp, #80] // 16-byte Folded Reload
; CHECK-GI-NEXT:    ldp x22, x21, [sp, #64] // 16-byte Folded Reload
; CHECK-GI-NEXT:    ldp x24, x23, [sp, #48] // 16-byte Folded Reload
; CHECK-GI-NEXT:    ldp x26, x25, [sp, #32] // 16-byte Folded Reload
; CHECK-GI-NEXT:    ldp x28, x27, [sp, #16] // 16-byte Folded Reload
; CHECK-GI-NEXT:    ldr x30, [sp], #96 // 8-byte Folded Reload
; CHECK-GI-NEXT:    ret
entry:
  %s = srem <3 x i128> %d, %e
  ret <3 x i128> %s
}

; Signed remainder of <4 x i128>.
; Both selectors are expected to lower this to four calls to __modti3, one per
; element. The dividend occupies x0-x7; the whole divisor is loaded from
; [sp, #128] .. [sp, #176], i.e. just above the 128-byte frame (presumably the
; incoming stack arguments). Callee-saved registers alone are not enough here:
; x6/x7 and part of the last divisor element are additionally spilled to the
; local slots at [sp, #8] .. [sp, #16] and reloaded for the fourth call.
; The four results are returned in x0-x7.
define <4 x i128> @sv4i128(<4 x i128> %d, <4 x i128> %e) {
; CHECK-SD-LABEL: sv4i128:
; CHECK-SD:       // %bb.0: // %entry
; CHECK-SD-NEXT:    sub sp, sp, #128
; CHECK-SD-NEXT:    stp x29, x30, [sp, #32] // 16-byte Folded Spill
; CHECK-SD-NEXT:    stp x28, x27, [sp, #48] // 16-byte Folded Spill
; CHECK-SD-NEXT:    stp x26, x25, [sp, #64] // 16-byte Folded Spill
; CHECK-SD-NEXT:    stp x24, x23, [sp, #80] // 16-byte Folded Spill
; CHECK-SD-NEXT:    stp x22, x21, [sp, #96] // 16-byte Folded Spill
; CHECK-SD-NEXT:    stp x20, x19, [sp, #112] // 16-byte Folded Spill
; CHECK-SD-NEXT:    .cfi_def_cfa_offset 128
; CHECK-SD-NEXT:    .cfi_offset w19, -8
; CHECK-SD-NEXT:    .cfi_offset w20, -16
; CHECK-SD-NEXT:    .cfi_offset w21, -24
; CHECK-SD-NEXT:    .cfi_offset w22, -32
; CHECK-SD-NEXT:    .cfi_offset w23, -40
; CHECK-SD-NEXT:    .cfi_offset w24, -48
; CHECK-SD-NEXT:    .cfi_offset w25, -56
; CHECK-SD-NEXT:    .cfi_offset w26, -64
; CHECK-SD-NEXT:    .cfi_offset w27, -72
; CHECK-SD-NEXT:    .cfi_offset w28, -80
; CHECK-SD-NEXT:    .cfi_offset w30, -88
; CHECK-SD-NEXT:    .cfi_offset w29, -96
; CHECK-SD-NEXT:    mov x23, x3
; CHECK-SD-NEXT:    mov x24, x2
; CHECK-SD-NEXT:    stp x6, x7, [sp, #16] // 16-byte Folded Spill
; CHECK-SD-NEXT:    ldp x8, x26, [sp, #176]
; CHECK-SD-NEXT:    mov x21, x5
; CHECK-SD-NEXT:    ldp x2, x3, [sp, #128]
; CHECK-SD-NEXT:    mov x22, x4
; CHECK-SD-NEXT:    ldp x27, x28, [sp, #160]
; CHECK-SD-NEXT:    ldp x29, x19, [sp, #144]
; CHECK-SD-NEXT:    str x8, [sp, #8] // 8-byte Folded Spill
; CHECK-SD-NEXT:    bl __modti3
; CHECK-SD-NEXT:    mov x20, x0
; CHECK-SD-NEXT:    mov x25, x1
; CHECK-SD-NEXT:    mov x0, x24
; CHECK-SD-NEXT:    mov x1, x23
; CHECK-SD-NEXT:    mov x2, x29
; CHECK-SD-NEXT:    mov x3, x19
; CHECK-SD-NEXT:    bl __modti3
; CHECK-SD-NEXT:    mov x19, x0
; CHECK-SD-NEXT:    mov x23, x1
; CHECK-SD-NEXT:    mov x0, x22
; CHECK-SD-NEXT:    mov x1, x21
; CHECK-SD-NEXT:    mov x2, x27
; CHECK-SD-NEXT:    mov x3, x28
; CHECK-SD-NEXT:    bl __modti3
; CHECK-SD-NEXT:    mov x21, x0
; CHECK-SD-NEXT:    mov x22, x1
; CHECK-SD-NEXT:    ldr x2, [sp, #8] // 8-byte Folded Reload
; CHECK-SD-NEXT:    ldp x0, x1, [sp, #16] // 16-byte Folded Reload
; CHECK-SD-NEXT:    mov x3, x26
; CHECK-SD-NEXT:    bl __modti3
; CHECK-SD-NEXT:    mov x6, x0
; CHECK-SD-NEXT:    mov x7, x1
; CHECK-SD-NEXT:    mov x0, x20
; CHECK-SD-NEXT:    mov x1, x25
; CHECK-SD-NEXT:    mov x2, x19
; CHECK-SD-NEXT:    mov x3, x23
; CHECK-SD-NEXT:    mov x4, x21
; CHECK-SD-NEXT:    mov x5, x22
; CHECK-SD-NEXT:    ldp x20, x19, [sp, #112] // 16-byte Folded Reload
; CHECK-SD-NEXT:    ldp x22, x21, [sp, #96] // 16-byte Folded Reload
; CHECK-SD-NEXT:    ldp x24, x23, [sp, #80] // 16-byte Folded Reload
; CHECK-SD-NEXT:    ldp x26, x25, [sp, #64] // 16-byte Folded Reload
; CHECK-SD-NEXT:    ldp x28, x27, [sp, #48] // 16-byte Folded Reload
; CHECK-SD-NEXT:    ldp x29, x30, [sp, #32] // 16-byte Folded Reload
; CHECK-SD-NEXT:    add sp, sp, #128
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: sv4i128:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    sub sp, sp, #128
; CHECK-GI-NEXT:    stp x29, x30, [sp, #32] // 16-byte Folded Spill
; CHECK-GI-NEXT:    stp x28, x27, [sp, #48] // 16-byte Folded Spill
; CHECK-GI-NEXT:    stp x26, x25, [sp, #64] // 16-byte Folded Spill
; CHECK-GI-NEXT:    stp x24, x23, [sp, #80] // 16-byte Folded Spill
; CHECK-GI-NEXT:    stp x22, x21, [sp, #96] // 16-byte Folded Spill
; CHECK-GI-NEXT:    stp x20, x19, [sp, #112] // 16-byte Folded Spill
; CHECK-GI-NEXT:    .cfi_def_cfa_offset 128
; CHECK-GI-NEXT:    .cfi_offset w19, -8
; CHECK-GI-NEXT:    .cfi_offset w20, -16
; CHECK-GI-NEXT:    .cfi_offset w21, -24
; CHECK-GI-NEXT:    .cfi_offset w22, -32
; CHECK-GI-NEXT:    .cfi_offset w23, -40
; CHECK-GI-NEXT:    .cfi_offset w24, -48
; CHECK-GI-NEXT:    .cfi_offset w25, -56
; CHECK-GI-NEXT:    .cfi_offset w26, -64
; CHECK-GI-NEXT:    .cfi_offset w27, -72
; CHECK-GI-NEXT:    .cfi_offset w28, -80
; CHECK-GI-NEXT:    .cfi_offset w30, -88
; CHECK-GI-NEXT:    .cfi_offset w29, -96
; CHECK-GI-NEXT:    mov x19, x2
; CHECK-GI-NEXT:    mov x20, x3
; CHECK-GI-NEXT:    mov x21, x4
; CHECK-GI-NEXT:    ldp x2, x3, [sp, #128]
; CHECK-GI-NEXT:    mov x22, x5
; CHECK-GI-NEXT:    ldp x9, x8, [sp, #176]
; CHECK-GI-NEXT:    mov x23, x7
; CHECK-GI-NEXT:    ldp x24, x25, [sp, #144]
; CHECK-GI-NEXT:    ldp x26, x27, [sp, #160]
; CHECK-GI-NEXT:    stp x9, x6, [sp, #16] // 16-byte Folded Spill
; CHECK-GI-NEXT:    str x8, [sp, #8] // 8-byte Folded Spill
; CHECK-GI-NEXT:    bl __modti3
; CHECK-GI-NEXT:    mov x28, x0
; CHECK-GI-NEXT:    mov x29, x1
; CHECK-GI-NEXT:    mov x0, x19
; CHECK-GI-NEXT:    mov x1, x20
; CHECK-GI-NEXT:    mov x2, x24
; CHECK-GI-NEXT:    mov x3, x25
; CHECK-GI-NEXT:    bl __modti3
; CHECK-GI-NEXT:    mov x19, x0
; CHECK-GI-NEXT:    mov x20, x1
; CHECK-GI-NEXT:    mov x0, x21
; CHECK-GI-NEXT:    mov x1, x22
; CHECK-GI-NEXT:    mov x2, x26
; CHECK-GI-NEXT:    mov x3, x27
; CHECK-GI-NEXT:    bl __modti3
; CHECK-GI-NEXT:    mov x21, x0
; CHECK-GI-NEXT:    ldp x2, x0, [sp, #16] // 16-byte Folded Reload
; CHECK-GI-NEXT:    ldr x3, [sp, #8] // 8-byte Folded Reload
; CHECK-GI-NEXT:    mov x22, x1
; CHECK-GI-NEXT:    mov x1, x23
; CHECK-GI-NEXT:    bl __modti3
; CHECK-GI-NEXT:    mov x6, x0
; CHECK-GI-NEXT:    mov x7, x1
; CHECK-GI-NEXT:    mov x0, x28
; CHECK-GI-NEXT:    mov x1, x29
; CHECK-GI-NEXT:    mov x2, x19
; CHECK-GI-NEXT:    mov x3, x20
; CHECK-GI-NEXT:    mov x4, x21
; CHECK-GI-NEXT:    mov x5, x22
; CHECK-GI-NEXT:    ldp x20, x19, [sp, #112] // 16-byte Folded Reload
; CHECK-GI-NEXT:    ldp x22, x21, [sp, #96] // 16-byte Folded Reload
; CHECK-GI-NEXT:    ldp x24, x23, [sp, #80] // 16-byte Folded Reload
; CHECK-GI-NEXT:    ldp x26, x25, [sp, #64] // 16-byte Folded Reload
; CHECK-GI-NEXT:    ldp x28, x27, [sp, #48] // 16-byte Folded Reload
; CHECK-GI-NEXT:    ldp x29, x30, [sp, #32] // 16-byte Folded Reload
; CHECK-GI-NEXT:    add sp, sp, #128
; CHECK-GI-NEXT:    ret
entry:
  %s = srem <4 x i128> %d, %e
  ret <4 x i128> %s
}

; Unsigned remainder of <2 x i128>; all operands arrive in x0-x7.
; Both selectors are expected to lower this to two calls to __umodti3, one per
; element: element 0 uses x0:x1 with the divisor moved from x4:x5, element 1 is
; parked in callee-saved registers across the first call. The first result is
; held in x23:x24 and the pair of results is returned in x0-x3.
; The two outputs differ only in which callee-saved registers hold element 1.
define <2 x i128> @uv2i128(<2 x i128> %d, <2 x i128> %e) {
; CHECK-SD-LABEL: uv2i128:
; CHECK-SD:       // %bb.0: // %entry
; CHECK-SD-NEXT:    str x30, [sp, #-64]! // 8-byte Folded Spill
; CHECK-SD-NEXT:    stp x24, x23, [sp, #16] // 16-byte Folded Spill
; CHECK-SD-NEXT:    stp x22, x21, [sp, #32] // 16-byte Folded Spill
; CHECK-SD-NEXT:    stp x20, x19, [sp, #48] // 16-byte Folded Spill
; CHECK-SD-NEXT:    .cfi_def_cfa_offset 64
; CHECK-SD-NEXT:    .cfi_offset w19, -8
; CHECK-SD-NEXT:    .cfi_offset w20, -16
; CHECK-SD-NEXT:    .cfi_offset w21, -24
; CHECK-SD-NEXT:    .cfi_offset w22, -32
; CHECK-SD-NEXT:    .cfi_offset w23, -40
; CHECK-SD-NEXT:    .cfi_offset w24, -48
; CHECK-SD-NEXT:    .cfi_offset w30, -64
; CHECK-SD-NEXT:    mov x21, x3
; CHECK-SD-NEXT:    mov x22, x2
; CHECK-SD-NEXT:    mov x2, x4
; CHECK-SD-NEXT:    mov x3, x5
; CHECK-SD-NEXT:    mov x19, x7
; CHECK-SD-NEXT:    mov x20, x6
; CHECK-SD-NEXT:    bl __umodti3
; CHECK-SD-NEXT:    mov x23, x0
; CHECK-SD-NEXT:    mov x24, x1
; CHECK-SD-NEXT:    mov x0, x22
; CHECK-SD-NEXT:    mov x1, x21
; CHECK-SD-NEXT:    mov x2, x20
; CHECK-SD-NEXT:    mov x3, x19
; CHECK-SD-NEXT:    bl __umodti3
; CHECK-SD-NEXT:    mov x2, x0
; CHECK-SD-NEXT:    mov x3, x1
; CHECK-SD-NEXT:    mov x0, x23
; CHECK-SD-NEXT:    mov x1, x24
; CHECK-SD-NEXT:    ldp x20, x19, [sp, #48] // 16-byte Folded Reload
; CHECK-SD-NEXT:    ldp x22, x21, [sp, #32] // 16-byte Folded Reload
; CHECK-SD-NEXT:    ldp x24, x23, [sp, #16] // 16-byte Folded Reload
; CHECK-SD-NEXT:    ldr x30, [sp], #64 // 8-byte Folded Reload
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: uv2i128:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    str x30, [sp, #-64]! // 8-byte Folded Spill
; CHECK-GI-NEXT:    stp x24, x23, [sp, #16] // 16-byte Folded Spill
; CHECK-GI-NEXT:    stp x22, x21, [sp, #32] // 16-byte Folded Spill
; CHECK-GI-NEXT:    stp x20, x19, [sp, #48] // 16-byte Folded Spill
; CHECK-GI-NEXT:    .cfi_def_cfa_offset 64
; CHECK-GI-NEXT:    .cfi_offset w19, -8
; CHECK-GI-NEXT:    .cfi_offset w20, -16
; CHECK-GI-NEXT:    .cfi_offset w21, -24
; CHECK-GI-NEXT:    .cfi_offset w22, -32
; CHECK-GI-NEXT:    .cfi_offset w23, -40
; CHECK-GI-NEXT:    .cfi_offset w24, -48
; CHECK-GI-NEXT:    .cfi_offset w30, -64
; CHECK-GI-NEXT:    mov x19, x2
; CHECK-GI-NEXT:    mov x20, x3
; CHECK-GI-NEXT:    mov x2, x4
; CHECK-GI-NEXT:    mov x3, x5
; CHECK-GI-NEXT:    mov x21, x6
; CHECK-GI-NEXT:    mov x22, x7
; CHECK-GI-NEXT:    bl __umodti3
; CHECK-GI-NEXT:    mov x23, x0
; CHECK-GI-NEXT:    mov x24, x1
; CHECK-GI-NEXT:    mov x0, x19
; CHECK-GI-NEXT:    mov x1, x20
; CHECK-GI-NEXT:    mov x2, x21
; CHECK-GI-NEXT:    mov x3, x22
; CHECK-GI-NEXT:    bl __umodti3
; CHECK-GI-NEXT:    mov x2, x0
; CHECK-GI-NEXT:    mov x3, x1
; CHECK-GI-NEXT:    mov x0, x23
; CHECK-GI-NEXT:    mov x1, x24
; CHECK-GI-NEXT:    ldp x20, x19, [sp, #48] // 16-byte Folded Reload
; CHECK-GI-NEXT:    ldp x22, x21, [sp, #32] // 16-byte Folded Reload
; CHECK-GI-NEXT:    ldp x24, x23, [sp, #16] // 16-byte Folded Reload
; CHECK-GI-NEXT:    ldr x30, [sp], #64 // 8-byte Folded Reload
; CHECK-GI-NEXT:    ret
entry:
  %s = urem <2 x i128> %d, %e
  ret <2 x i128> %s
}

; uv3i128: unsigned remainder (urem) of two <3 x i128> vectors.
; For both instruction selectors the expected output scalarises the operation
; into three calls to __umodti3, one per lane. Lane 0 of the divisor arrives in
; x6/x7; lanes 1 and 2 are read from the caller's stack area at [sp, #96] and
; [sp, #112], i.e. just above the 96-byte frame this function allocates. The
; three 128-bit remainders are returned in x0-x5.
; The assertion lines below are autogenerated (see the NOTE at the top of this
; file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
define <3 x i128> @uv3i128(<3 x i128> %d, <3 x i128> %e) {
; CHECK-SD-LABEL: uv3i128:
; CHECK-SD:       // %bb.0: // %entry
; CHECK-SD-NEXT:    str x30, [sp, #-96]! // 8-byte Folded Spill
; CHECK-SD-NEXT:    stp x28, x27, [sp, #16] // 16-byte Folded Spill
; CHECK-SD-NEXT:    stp x26, x25, [sp, #32] // 16-byte Folded Spill
; CHECK-SD-NEXT:    stp x24, x23, [sp, #48] // 16-byte Folded Spill
; CHECK-SD-NEXT:    stp x22, x21, [sp, #64] // 16-byte Folded Spill
; CHECK-SD-NEXT:    stp x20, x19, [sp, #80] // 16-byte Folded Spill
; CHECK-SD-NEXT:    .cfi_def_cfa_offset 96
; CHECK-SD-NEXT:    .cfi_offset w19, -8
; CHECK-SD-NEXT:    .cfi_offset w20, -16
; CHECK-SD-NEXT:    .cfi_offset w21, -24
; CHECK-SD-NEXT:    .cfi_offset w22, -32
; CHECK-SD-NEXT:    .cfi_offset w23, -40
; CHECK-SD-NEXT:    .cfi_offset w24, -48
; CHECK-SD-NEXT:    .cfi_offset w25, -56
; CHECK-SD-NEXT:    .cfi_offset w26, -64
; CHECK-SD-NEXT:    .cfi_offset w27, -72
; CHECK-SD-NEXT:    .cfi_offset w28, -80
; CHECK-SD-NEXT:    .cfi_offset w30, -96
; CHECK-SD-NEXT:    ldp x23, x24, [sp, #112]
; CHECK-SD-NEXT:    mov x21, x3
; CHECK-SD-NEXT:    ldp x25, x26, [sp, #96]
; CHECK-SD-NEXT:    mov x22, x2
; CHECK-SD-NEXT:    mov x2, x6
; CHECK-SD-NEXT:    mov x3, x7
; CHECK-SD-NEXT:    mov x19, x5
; CHECK-SD-NEXT:    mov x20, x4
; CHECK-SD-NEXT:    bl __umodti3
; CHECK-SD-NEXT:    mov x27, x0
; CHECK-SD-NEXT:    mov x28, x1
; CHECK-SD-NEXT:    mov x0, x22
; CHECK-SD-NEXT:    mov x1, x21
; CHECK-SD-NEXT:    mov x2, x25
; CHECK-SD-NEXT:    mov x3, x26
; CHECK-SD-NEXT:    bl __umodti3
; CHECK-SD-NEXT:    mov x21, x0
; CHECK-SD-NEXT:    mov x22, x1
; CHECK-SD-NEXT:    mov x0, x20
; CHECK-SD-NEXT:    mov x1, x19
; CHECK-SD-NEXT:    mov x2, x23
; CHECK-SD-NEXT:    mov x3, x24
; CHECK-SD-NEXT:    bl __umodti3
; CHECK-SD-NEXT:    mov x4, x0
; CHECK-SD-NEXT:    mov x5, x1
; CHECK-SD-NEXT:    mov x0, x27
; CHECK-SD-NEXT:    mov x1, x28
; CHECK-SD-NEXT:    mov x2, x21
; CHECK-SD-NEXT:    mov x3, x22
; CHECK-SD-NEXT:    ldp x20, x19, [sp, #80] // 16-byte Folded Reload
; CHECK-SD-NEXT:    ldp x22, x21, [sp, #64] // 16-byte Folded Reload
; CHECK-SD-NEXT:    ldp x24, x23, [sp, #48] // 16-byte Folded Reload
; CHECK-SD-NEXT:    ldp x26, x25, [sp, #32] // 16-byte Folded Reload
; CHECK-SD-NEXT:    ldp x28, x27, [sp, #16] // 16-byte Folded Reload
; CHECK-SD-NEXT:    ldr x30, [sp], #96 // 8-byte Folded Reload
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: uv3i128:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    str x30, [sp, #-96]! // 8-byte Folded Spill
; CHECK-GI-NEXT:    stp x28, x27, [sp, #16] // 16-byte Folded Spill
; CHECK-GI-NEXT:    stp x26, x25, [sp, #32] // 16-byte Folded Spill
; CHECK-GI-NEXT:    stp x24, x23, [sp, #48] // 16-byte Folded Spill
; CHECK-GI-NEXT:    stp x22, x21, [sp, #64] // 16-byte Folded Spill
; CHECK-GI-NEXT:    stp x20, x19, [sp, #80] // 16-byte Folded Spill
; CHECK-GI-NEXT:    .cfi_def_cfa_offset 96
; CHECK-GI-NEXT:    .cfi_offset w19, -8
; CHECK-GI-NEXT:    .cfi_offset w20, -16
; CHECK-GI-NEXT:    .cfi_offset w21, -24
; CHECK-GI-NEXT:    .cfi_offset w22, -32
; CHECK-GI-NEXT:    .cfi_offset w23, -40
; CHECK-GI-NEXT:    .cfi_offset w24, -48
; CHECK-GI-NEXT:    .cfi_offset w25, -56
; CHECK-GI-NEXT:    .cfi_offset w26, -64
; CHECK-GI-NEXT:    .cfi_offset w27, -72
; CHECK-GI-NEXT:    .cfi_offset w28, -80
; CHECK-GI-NEXT:    .cfi_offset w30, -96
; CHECK-GI-NEXT:    ldp x23, x24, [sp, #96]
; CHECK-GI-NEXT:    mov x19, x2
; CHECK-GI-NEXT:    ldp x25, x26, [sp, #112]
; CHECK-GI-NEXT:    mov x20, x3
; CHECK-GI-NEXT:    mov x2, x6
; CHECK-GI-NEXT:    mov x3, x7
; CHECK-GI-NEXT:    mov x21, x4
; CHECK-GI-NEXT:    mov x22, x5
; CHECK-GI-NEXT:    bl __umodti3
; CHECK-GI-NEXT:    mov x27, x0
; CHECK-GI-NEXT:    mov x28, x1
; CHECK-GI-NEXT:    mov x0, x19
; CHECK-GI-NEXT:    mov x1, x20
; CHECK-GI-NEXT:    mov x2, x23
; CHECK-GI-NEXT:    mov x3, x24
; CHECK-GI-NEXT:    bl __umodti3
; CHECK-GI-NEXT:    mov x19, x0
; CHECK-GI-NEXT:    mov x20, x1
; CHECK-GI-NEXT:    mov x0, x21
; CHECK-GI-NEXT:    mov x1, x22
; CHECK-GI-NEXT:    mov x2, x25
; CHECK-GI-NEXT:    mov x3, x26
; CHECK-GI-NEXT:    bl __umodti3
; CHECK-GI-NEXT:    mov x4, x0
; CHECK-GI-NEXT:    mov x5, x1
; CHECK-GI-NEXT:    mov x0, x27
; CHECK-GI-NEXT:    mov x1, x28
; CHECK-GI-NEXT:    mov x2, x19
; CHECK-GI-NEXT:    mov x3, x20
; CHECK-GI-NEXT:    ldp x20, x19, [sp, #80] // 16-byte Folded Reload
; CHECK-GI-NEXT:    ldp x22, x21, [sp, #64] // 16-byte Folded Reload
; CHECK-GI-NEXT:    ldp x24, x23, [sp, #48] // 16-byte Folded Reload
; CHECK-GI-NEXT:    ldp x26, x25, [sp, #32] // 16-byte Folded Reload
; CHECK-GI-NEXT:    ldp x28, x27, [sp, #16] // 16-byte Folded Reload
; CHECK-GI-NEXT:    ldr x30, [sp], #96 // 8-byte Folded Reload
; CHECK-GI-NEXT:    ret
entry:
  %s = urem <3 x i128> %d, %e
  ret <3 x i128> %s
}

; uv4i128: unsigned remainder (urem) of two <4 x i128> vectors.
; For both instruction selectors the expected output scalarises the operation
; into four calls to __umodti3, one per lane. The dividend occupies x0-x7, so
; every lane of the divisor is read from the caller's stack area starting at
; [sp, #128], i.e. just above the 128-byte frame this function allocates.
; Operands of the final call that are not kept in callee-saved registers are
; spilled to the local slots at [sp, #8] and [sp, #16] and reloaded before
; that call. The four 128-bit remainders are returned in x0-x7.
; The assertion lines below are autogenerated (see the NOTE at the top of this
; file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
define <4 x i128> @uv4i128(<4 x i128> %d, <4 x i128> %e) {
; CHECK-SD-LABEL: uv4i128:
; CHECK-SD:       // %bb.0: // %entry
; CHECK-SD-NEXT:    sub sp, sp, #128
; CHECK-SD-NEXT:    stp x29, x30, [sp, #32] // 16-byte Folded Spill
; CHECK-SD-NEXT:    stp x28, x27, [sp, #48] // 16-byte Folded Spill
; CHECK-SD-NEXT:    stp x26, x25, [sp, #64] // 16-byte Folded Spill
; CHECK-SD-NEXT:    stp x24, x23, [sp, #80] // 16-byte Folded Spill
; CHECK-SD-NEXT:    stp x22, x21, [sp, #96] // 16-byte Folded Spill
; CHECK-SD-NEXT:    stp x20, x19, [sp, #112] // 16-byte Folded Spill
; CHECK-SD-NEXT:    .cfi_def_cfa_offset 128
; CHECK-SD-NEXT:    .cfi_offset w19, -8
; CHECK-SD-NEXT:    .cfi_offset w20, -16
; CHECK-SD-NEXT:    .cfi_offset w21, -24
; CHECK-SD-NEXT:    .cfi_offset w22, -32
; CHECK-SD-NEXT:    .cfi_offset w23, -40
; CHECK-SD-NEXT:    .cfi_offset w24, -48
; CHECK-SD-NEXT:    .cfi_offset w25, -56
; CHECK-SD-NEXT:    .cfi_offset w26, -64
; CHECK-SD-NEXT:    .cfi_offset w27, -72
; CHECK-SD-NEXT:    .cfi_offset w28, -80
; CHECK-SD-NEXT:    .cfi_offset w30, -88
; CHECK-SD-NEXT:    .cfi_offset w29, -96
; CHECK-SD-NEXT:    mov x23, x3
; CHECK-SD-NEXT:    mov x24, x2
; CHECK-SD-NEXT:    stp x6, x7, [sp, #16] // 16-byte Folded Spill
; CHECK-SD-NEXT:    ldp x8, x26, [sp, #176]
; CHECK-SD-NEXT:    mov x21, x5
; CHECK-SD-NEXT:    ldp x2, x3, [sp, #128]
; CHECK-SD-NEXT:    mov x22, x4
; CHECK-SD-NEXT:    ldp x27, x28, [sp, #160]
; CHECK-SD-NEXT:    ldp x29, x19, [sp, #144]
; CHECK-SD-NEXT:    str x8, [sp, #8] // 8-byte Folded Spill
; CHECK-SD-NEXT:    bl __umodti3
; CHECK-SD-NEXT:    mov x20, x0
; CHECK-SD-NEXT:    mov x25, x1
; CHECK-SD-NEXT:    mov x0, x24
; CHECK-SD-NEXT:    mov x1, x23
; CHECK-SD-NEXT:    mov x2, x29
; CHECK-SD-NEXT:    mov x3, x19
; CHECK-SD-NEXT:    bl __umodti3
; CHECK-SD-NEXT:    mov x19, x0
; CHECK-SD-NEXT:    mov x23, x1
; CHECK-SD-NEXT:    mov x0, x22
; CHECK-SD-NEXT:    mov x1, x21
; CHECK-SD-NEXT:    mov x2, x27
; CHECK-SD-NEXT:    mov x3, x28
; CHECK-SD-NEXT:    bl __umodti3
; CHECK-SD-NEXT:    mov x21, x0
; CHECK-SD-NEXT:    mov x22, x1
; CHECK-SD-NEXT:    ldr x2, [sp, #8] // 8-byte Folded Reload
; CHECK-SD-NEXT:    ldp x0, x1, [sp, #16] // 16-byte Folded Reload
; CHECK-SD-NEXT:    mov x3, x26
; CHECK-SD-NEXT:    bl __umodti3
; CHECK-SD-NEXT:    mov x6, x0
; CHECK-SD-NEXT:    mov x7, x1
; CHECK-SD-NEXT:    mov x0, x20
; CHECK-SD-NEXT:    mov x1, x25
; CHECK-SD-NEXT:    mov x2, x19
; CHECK-SD-NEXT:    mov x3, x23
; CHECK-SD-NEXT:    mov x4, x21
; CHECK-SD-NEXT:    mov x5, x22
; CHECK-SD-NEXT:    ldp x20, x19, [sp, #112] // 16-byte Folded Reload
; CHECK-SD-NEXT:    ldp x22, x21, [sp, #96] // 16-byte Folded Reload
; CHECK-SD-NEXT:    ldp x24, x23, [sp, #80] // 16-byte Folded Reload
; CHECK-SD-NEXT:    ldp x26, x25, [sp, #64] // 16-byte Folded Reload
; CHECK-SD-NEXT:    ldp x28, x27, [sp, #48] // 16-byte Folded Reload
; CHECK-SD-NEXT:    ldp x29, x30, [sp, #32] // 16-byte Folded Reload
; CHECK-SD-NEXT:    add sp, sp, #128
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: uv4i128:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    sub sp, sp, #128
; CHECK-GI-NEXT:    stp x29, x30, [sp, #32] // 16-byte Folded Spill
; CHECK-GI-NEXT:    stp x28, x27, [sp, #48] // 16-byte Folded Spill
; CHECK-GI-NEXT:    stp x26, x25, [sp, #64] // 16-byte Folded Spill
; CHECK-GI-NEXT:    stp x24, x23, [sp, #80] // 16-byte Folded Spill
; CHECK-GI-NEXT:    stp x22, x21, [sp, #96] // 16-byte Folded Spill
; CHECK-GI-NEXT:    stp x20, x19, [sp, #112] // 16-byte Folded Spill
; CHECK-GI-NEXT:    .cfi_def_cfa_offset 128
; CHECK-GI-NEXT:    .cfi_offset w19, -8
; CHECK-GI-NEXT:    .cfi_offset w20, -16
; CHECK-GI-NEXT:    .cfi_offset w21, -24
; CHECK-GI-NEXT:    .cfi_offset w22, -32
; CHECK-GI-NEXT:    .cfi_offset w23, -40
; CHECK-GI-NEXT:    .cfi_offset w24, -48
; CHECK-GI-NEXT:    .cfi_offset w25, -56
; CHECK-GI-NEXT:    .cfi_offset w26, -64
; CHECK-GI-NEXT:    .cfi_offset w27, -72
; CHECK-GI-NEXT:    .cfi_offset w28, -80
; CHECK-GI-NEXT:    .cfi_offset w30, -88
; CHECK-GI-NEXT:    .cfi_offset w29, -96
; CHECK-GI-NEXT:    mov x19, x2
; CHECK-GI-NEXT:    mov x20, x3
; CHECK-GI-NEXT:    mov x21, x4
; CHECK-GI-NEXT:    ldp x2, x3, [sp, #128]
; CHECK-GI-NEXT:    mov x22, x5
; CHECK-GI-NEXT:    ldp x9, x8, [sp, #176]
; CHECK-GI-NEXT:    mov x23, x7
; CHECK-GI-NEXT:    ldp x24, x25, [sp, #144]
; CHECK-GI-NEXT:    ldp x26, x27, [sp, #160]
; CHECK-GI-NEXT:    stp x9, x6, [sp, #16] // 16-byte Folded Spill
; CHECK-GI-NEXT:    str x8, [sp, #8] // 8-byte Folded Spill
; CHECK-GI-NEXT:    bl __umodti3
; CHECK-GI-NEXT:    mov x28, x0
; CHECK-GI-NEXT:    mov x29, x1
; CHECK-GI-NEXT:    mov x0, x19
; CHECK-GI-NEXT:    mov x1, x20
; CHECK-GI-NEXT:    mov x2, x24
; CHECK-GI-NEXT:    mov x3, x25
; CHECK-GI-NEXT:    bl __umodti3
; CHECK-GI-NEXT:    mov x19, x0
; CHECK-GI-NEXT:    mov x20, x1
; CHECK-GI-NEXT:    mov x0, x21
; CHECK-GI-NEXT:    mov x1, x22
; CHECK-GI-NEXT:    mov x2, x26
; CHECK-GI-NEXT:    mov x3, x27
; CHECK-GI-NEXT:    bl __umodti3
; CHECK-GI-NEXT:    mov x21, x0
; CHECK-GI-NEXT:    ldp x2, x0, [sp, #16] // 16-byte Folded Reload
; CHECK-GI-NEXT:    ldr x3, [sp, #8] // 8-byte Folded Reload
; CHECK-GI-NEXT:    mov x22, x1
; CHECK-GI-NEXT:    mov x1, x23
; CHECK-GI-NEXT:    bl __umodti3
; CHECK-GI-NEXT:    mov x6, x0
; CHECK-GI-NEXT:    mov x7, x1
; CHECK-GI-NEXT:    mov x0, x28
; CHECK-GI-NEXT:    mov x1, x29
; CHECK-GI-NEXT:    mov x2, x19
; CHECK-GI-NEXT:    mov x3, x20
; CHECK-GI-NEXT:    mov x4, x21
; CHECK-GI-NEXT:    mov x5, x22
; CHECK-GI-NEXT:    ldp x20, x19, [sp, #112] // 16-byte Folded Reload
; CHECK-GI-NEXT:    ldp x22, x21, [sp, #96] // 16-byte Folded Reload
; CHECK-GI-NEXT:    ldp x24, x23, [sp, #80] // 16-byte Folded Reload
; CHECK-GI-NEXT:    ldp x26, x25, [sp, #64] // 16-byte Folded Reload
; CHECK-GI-NEXT:    ldp x28, x27, [sp, #48] // 16-byte Folded Reload
; CHECK-GI-NEXT:    ldp x29, x30, [sp, #32] // 16-byte Folded Reload
; CHECK-GI-NEXT:    add sp, sp, #128
; CHECK-GI-NEXT:    ret
entry:
  %s = urem <4 x i128> %d, %e
  ret <4 x i128> %s
}