llvm/llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse2,-bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-SSE2,X64-NO-BMI2,X64-NO-SHLD-NO-BMI2,FALLBACK0
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse2,-bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-SSE2,X64-NO-BMI2,X64-HAVE-SHLD-NO-BMI2,FALLBACK1
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse2,+bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-SSE2,X64-HAVE-BMI2,X64-NO-SHLD-HAVE-BMI2,FALLBACK2
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse2,+bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-SSE2,X64-HAVE-BMI2,X64-HAVE-SHLD-HAVE-BMI2,FALLBACK3
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse4.2,-bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-SSE42,X64-NO-BMI2,X64-NO-SHLD-NO-BMI2,FALLBACK4
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse4.2,-bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-SSE42,X64-NO-BMI2,X64-HAVE-SHLD-NO-BMI2,FALLBACK5
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse4.2,+bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-SSE42,X64-HAVE-BMI2,X64-NO-SHLD-HAVE-BMI2,FALLBACK6
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse4.2,+bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-SSE42,X64-HAVE-BMI2,X64-HAVE-SHLD-HAVE-BMI2,FALLBACK7
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx,-bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-AVX,X64-AVX1,X64-NO-BMI2,X64-NO-SHLD-NO-BMI2,FALLBACK8
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx,-bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-AVX,X64-AVX1,X64-NO-BMI2,X64-HAVE-SHLD-NO-BMI2,FALLBACK9
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx,+bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-AVX,X64-AVX1,X64-HAVE-BMI2,X64-NO-SHLD-HAVE-BMI2,FALLBACK10
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx,+bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-AVX,X64-AVX1,X64-HAVE-BMI2,X64-HAVE-SHLD-HAVE-BMI2,FALLBACK11
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512vl,-bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-AVX,X64-AVX512,X64-NO-BMI2,X64-NO-SHLD-NO-BMI2,FALLBACK12
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512vl,-bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-AVX,X64-AVX512,X64-NO-BMI2,X64-HAVE-SHLD-NO-BMI2,FALLBACK13
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512vl,+bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-AVX,X64-AVX512,X64-HAVE-BMI2,X64-NO-SHLD-HAVE-BMI2,FALLBACK14
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512vl,+bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-AVX,X64-AVX512,X64-HAVE-BMI2,X64-HAVE-SHLD-HAVE-BMI2,FALLBACK15
; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+sse2,-bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-SSE2,X86-NO-BMI2,X86-NO-SHLD-NO-BMI2,FALLBACK16
; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+sse2,-bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-SSE2,X86-NO-BMI2,X86-HAVE-SHLD-NO-BMI2,FALLBACK17
; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+sse2,+bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-SSE2,X86-HAVE-BMI2,X86-NO-SHLD-HAVE-BMI2,FALLBACK18
; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+sse2,+bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-SSE2,X86-HAVE-BMI2,X86-HAVE-SHLD-HAVE-BMI2,FALLBACK19
; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+sse4.2,-bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-SSE42,X86-NO-BMI2,X86-NO-SHLD-NO-BMI2,FALLBACK20
; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+sse4.2,-bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-SSE42,X86-NO-BMI2,X86-HAVE-SHLD-NO-BMI2,FALLBACK21
; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+sse4.2,+bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-SSE42,X86-HAVE-BMI2,X86-NO-SHLD-HAVE-BMI2,FALLBACK22
; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+sse4.2,+bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-SSE42,X86-HAVE-BMI2,X86-HAVE-SHLD-HAVE-BMI2,FALLBACK23
; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+avx,-bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-AVX,X86-AVX1,X86-NO-BMI2,X86-NO-SHLD-NO-BMI2,FALLBACK24
; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+avx,-bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-AVX,X86-AVX1,X86-NO-BMI2,X86-HAVE-SHLD-NO-BMI2,FALLBACK25
; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+avx,+bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-AVX,X86-AVX1,X86-HAVE-BMI2,X86-NO-SHLD-HAVE-BMI2,FALLBACK26
; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+avx,+bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-AVX,X86-AVX1,X86-HAVE-BMI2,X86-HAVE-SHLD-HAVE-BMI2,FALLBACK27
; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+avx512vl,-bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-AVX,X86-AVX512,X86-NO-BMI2,X86-NO-SHLD-NO-BMI2,FALLBACK28
; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+avx512vl,-bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-AVX,X86-AVX512,X86-NO-BMI2,X86-HAVE-SHLD-NO-BMI2,FALLBACK29
; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+avx512vl,+bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-AVX,X86-AVX512,X86-HAVE-BMI2,X86-NO-SHLD-HAVE-BMI2,FALLBACK30
; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+avx512vl,+bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-AVX,X86-AVX512,X86-HAVE-BMI2,X86-HAVE-SHLD-HAVE-BMI2,FALLBACK31

; Logical right shift of a 4-byte value by a byte-granular amount.
; Loads an i32 from %src.ptr and an i32 byte offset from %byteOff.ptr (both
; align 1), scales the offset to a bit count with `shl 3`, applies `lshr`, and
; stores the i32 result to %dst (align 1).
; In the expected assembly below only the low byte of the offset is read
; (movzbl) and scaled with `shlb $3`. With BMI2 the shift is emitted as shrxl
; with the source folded as a memory operand; without BMI2 it is shrl by %cl.
define void @lshr_4bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-NO-BMI2-LABEL: lshr_4bytes:
; X64-NO-BMI2:       # %bb.0:
; X64-NO-BMI2-NEXT:    movl (%rdi), %eax
; X64-NO-BMI2-NEXT:    movzbl (%rsi), %ecx
; X64-NO-BMI2-NEXT:    shlb $3, %cl
; X64-NO-BMI2-NEXT:    shrl %cl, %eax
; X64-NO-BMI2-NEXT:    movl %eax, (%rdx)
; X64-NO-BMI2-NEXT:    retq
;
; X64-HAVE-BMI2-LABEL: lshr_4bytes:
; X64-HAVE-BMI2:       # %bb.0:
; X64-HAVE-BMI2-NEXT:    movzbl (%rsi), %eax
; X64-HAVE-BMI2-NEXT:    shlb $3, %al
; X64-HAVE-BMI2-NEXT:    shrxl %eax, (%rdi), %eax
; X64-HAVE-BMI2-NEXT:    movl %eax, (%rdx)
; X64-HAVE-BMI2-NEXT:    retq
;
; X86-NO-BMI2-LABEL: lshr_4bytes:
; X86-NO-BMI2:       # %bb.0:
; X86-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NO-BMI2-NEXT:    movl (%edx), %edx
; X86-NO-BMI2-NEXT:    movzbl (%ecx), %ecx
; X86-NO-BMI2-NEXT:    shlb $3, %cl
; X86-NO-BMI2-NEXT:    shrl %cl, %edx
; X86-NO-BMI2-NEXT:    movl %edx, (%eax)
; X86-NO-BMI2-NEXT:    retl
;
; X86-HAVE-BMI2-LABEL: lshr_4bytes:
; X86-HAVE-BMI2:       # %bb.0:
; X86-HAVE-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-HAVE-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-HAVE-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-HAVE-BMI2-NEXT:    movzbl (%edx), %edx
; X86-HAVE-BMI2-NEXT:    shlb $3, %dl
; X86-HAVE-BMI2-NEXT:    shrxl %edx, (%ecx), %ecx
; X86-HAVE-BMI2-NEXT:    movl %ecx, (%eax)
; X86-HAVE-BMI2-NEXT:    retl
  %src = load i32, ptr %src.ptr, align 1
  %byteOff = load i32, ptr %byteOff.ptr, align 1
  ; Convert the byte offset into a bit count (multiply by 8).
  %bitOff = shl i32 %byteOff, 3
  %res = lshr i32 %src, %bitOff
  store i32 %res, ptr %dst, align 1
  ret void
}
; Left shift of a 4-byte value by a byte-granular amount.
; Loads an i32 from %src.ptr and an i32 byte offset from %byteOff.ptr (both
; align 1), scales the offset to a bit count with `shl 3`, applies `shl`, and
; stores the i32 result to %dst (align 1).
; In the expected assembly below only the low byte of the offset is read
; (movzbl) and scaled with `shlb $3`. With BMI2 the shift is emitted as shlxl
; with the source folded as a memory operand; without BMI2 it is shll by %cl.
define void @shl_4bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-NO-BMI2-LABEL: shl_4bytes:
; X64-NO-BMI2:       # %bb.0:
; X64-NO-BMI2-NEXT:    movl (%rdi), %eax
; X64-NO-BMI2-NEXT:    movzbl (%rsi), %ecx
; X64-NO-BMI2-NEXT:    shlb $3, %cl
; X64-NO-BMI2-NEXT:    shll %cl, %eax
; X64-NO-BMI2-NEXT:    movl %eax, (%rdx)
; X64-NO-BMI2-NEXT:    retq
;
; X64-HAVE-BMI2-LABEL: shl_4bytes:
; X64-HAVE-BMI2:       # %bb.0:
; X64-HAVE-BMI2-NEXT:    movzbl (%rsi), %eax
; X64-HAVE-BMI2-NEXT:    shlb $3, %al
; X64-HAVE-BMI2-NEXT:    shlxl %eax, (%rdi), %eax
; X64-HAVE-BMI2-NEXT:    movl %eax, (%rdx)
; X64-HAVE-BMI2-NEXT:    retq
;
; X86-NO-BMI2-LABEL: shl_4bytes:
; X86-NO-BMI2:       # %bb.0:
; X86-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NO-BMI2-NEXT:    movl (%edx), %edx
; X86-NO-BMI2-NEXT:    movzbl (%ecx), %ecx
; X86-NO-BMI2-NEXT:    shlb $3, %cl
; X86-NO-BMI2-NEXT:    shll %cl, %edx
; X86-NO-BMI2-NEXT:    movl %edx, (%eax)
; X86-NO-BMI2-NEXT:    retl
;
; X86-HAVE-BMI2-LABEL: shl_4bytes:
; X86-HAVE-BMI2:       # %bb.0:
; X86-HAVE-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-HAVE-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-HAVE-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-HAVE-BMI2-NEXT:    movzbl (%edx), %edx
; X86-HAVE-BMI2-NEXT:    shlb $3, %dl
; X86-HAVE-BMI2-NEXT:    shlxl %edx, (%ecx), %ecx
; X86-HAVE-BMI2-NEXT:    movl %ecx, (%eax)
; X86-HAVE-BMI2-NEXT:    retl
  %src = load i32, ptr %src.ptr, align 1
  %byteOff = load i32, ptr %byteOff.ptr, align 1
  ; Convert the byte offset into a bit count (multiply by 8).
  %bitOff = shl i32 %byteOff, 3
  %res = shl i32 %src, %bitOff
  store i32 %res, ptr %dst, align 1
  ret void
}
; Arithmetic right shift of a 4-byte value by a byte-granular amount.
; Loads an i32 from %src.ptr and an i32 byte offset from %byteOff.ptr (both
; align 1), scales the offset to a bit count with `shl 3`, applies `ashr`, and
; stores the i32 result to %dst (align 1).
; In the expected assembly below only the low byte of the offset is read
; (movzbl) and scaled with `shlb $3`. With BMI2 the shift is emitted as sarxl
; with the source folded as a memory operand; without BMI2 it is sarl by %cl.
define void @ashr_4bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-NO-BMI2-LABEL: ashr_4bytes:
; X64-NO-BMI2:       # %bb.0:
; X64-NO-BMI2-NEXT:    movl (%rdi), %eax
; X64-NO-BMI2-NEXT:    movzbl (%rsi), %ecx
; X64-NO-BMI2-NEXT:    shlb $3, %cl
; X64-NO-BMI2-NEXT:    sarl %cl, %eax
; X64-NO-BMI2-NEXT:    movl %eax, (%rdx)
; X64-NO-BMI2-NEXT:    retq
;
; X64-HAVE-BMI2-LABEL: ashr_4bytes:
; X64-HAVE-BMI2:       # %bb.0:
; X64-HAVE-BMI2-NEXT:    movzbl (%rsi), %eax
; X64-HAVE-BMI2-NEXT:    shlb $3, %al
; X64-HAVE-BMI2-NEXT:    sarxl %eax, (%rdi), %eax
; X64-HAVE-BMI2-NEXT:    movl %eax, (%rdx)
; X64-HAVE-BMI2-NEXT:    retq
;
; X86-NO-BMI2-LABEL: ashr_4bytes:
; X86-NO-BMI2:       # %bb.0:
; X86-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NO-BMI2-NEXT:    movl (%edx), %edx
; X86-NO-BMI2-NEXT:    movzbl (%ecx), %ecx
; X86-NO-BMI2-NEXT:    shlb $3, %cl
; X86-NO-BMI2-NEXT:    sarl %cl, %edx
; X86-NO-BMI2-NEXT:    movl %edx, (%eax)
; X86-NO-BMI2-NEXT:    retl
;
; X86-HAVE-BMI2-LABEL: ashr_4bytes:
; X86-HAVE-BMI2:       # %bb.0:
; X86-HAVE-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-HAVE-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-HAVE-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-HAVE-BMI2-NEXT:    movzbl (%edx), %edx
; X86-HAVE-BMI2-NEXT:    shlb $3, %dl
; X86-HAVE-BMI2-NEXT:    sarxl %edx, (%ecx), %ecx
; X86-HAVE-BMI2-NEXT:    movl %ecx, (%eax)
; X86-HAVE-BMI2-NEXT:    retl
  %src = load i32, ptr %src.ptr, align 1
  %byteOff = load i32, ptr %byteOff.ptr, align 1
  ; Convert the byte offset into a bit count (multiply by 8).
  %bitOff = shl i32 %byteOff, 3
  %res = ashr i32 %src, %bitOff
  store i32 %res, ptr %dst, align 1
  ret void
}

; Logical right shift of an 8-byte value by a byte-granular amount.
; Loads an i64 from %src.ptr and an i64 byte offset from %byteOff.ptr (both
; align 1), scales the offset to a bit count with `shl 3`, applies `lshr`, and
; stores the i64 result to %dst (align 1).
; Expected assembly, as recorded below:
;  * x86_64 targets use a single 64-bit shift (shrq by %cl, or shrxq with a
;    folded memory operand when BMI2 is enabled).
;  * i686 targets split the value into two 32-bit halves. The low result word
;    is formed either with shrdl (configurations run with -slow-shld) or with
;    a shift / `leal (hi,hi)` / notb / shift / orl sequence (+slow-shld). The
;    shifted high half is computed separately, and `testb $32` on the bit
;    count drives a cmovne/cmove pair that picks between the combined word
;    and the shifted high half for the low word, and between the shifted high
;    half and zero (xorl) for the high word.
define void @lshr_8bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-NO-BMI2-LABEL: lshr_8bytes:
; X64-NO-BMI2:       # %bb.0:
; X64-NO-BMI2-NEXT:    movq (%rdi), %rax
; X64-NO-BMI2-NEXT:    movzbl (%rsi), %ecx
; X64-NO-BMI2-NEXT:    shlb $3, %cl
; X64-NO-BMI2-NEXT:    shrq %cl, %rax
; X64-NO-BMI2-NEXT:    movq %rax, (%rdx)
; X64-NO-BMI2-NEXT:    retq
;
; X64-HAVE-BMI2-LABEL: lshr_8bytes:
; X64-HAVE-BMI2:       # %bb.0:
; X64-HAVE-BMI2-NEXT:    movzbl (%rsi), %eax
; X64-HAVE-BMI2-NEXT:    shlb $3, %al
; X64-HAVE-BMI2-NEXT:    shrxq %rax, (%rdi), %rax
; X64-HAVE-BMI2-NEXT:    movq %rax, (%rdx)
; X64-HAVE-BMI2-NEXT:    retq
;
; X86-NO-SHLD-NO-BMI2-LABEL: lshr_8bytes:
; X86-NO-SHLD-NO-BMI2:       # %bb.0:
; X86-NO-SHLD-NO-BMI2-NEXT:    pushl %ebx
; X86-NO-SHLD-NO-BMI2-NEXT:    pushl %edi
; X86-NO-SHLD-NO-BMI2-NEXT:    pushl %esi
; X86-NO-SHLD-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NO-SHLD-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NO-SHLD-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NO-SHLD-NO-BMI2-NEXT:    movl (%ecx), %ebx
; X86-NO-SHLD-NO-BMI2-NEXT:    movl 4(%ecx), %esi
; X86-NO-SHLD-NO-BMI2-NEXT:    movzbl (%eax), %eax
; X86-NO-SHLD-NO-BMI2-NEXT:    shlb $3, %al
; X86-NO-SHLD-NO-BMI2-NEXT:    movl %eax, %ecx
; X86-NO-SHLD-NO-BMI2-NEXT:    shrl %cl, %ebx
; X86-NO-SHLD-NO-BMI2-NEXT:    leal (%esi,%esi), %edi
; X86-NO-SHLD-NO-BMI2-NEXT:    notb %cl
; X86-NO-SHLD-NO-BMI2-NEXT:    shll %cl, %edi
; X86-NO-SHLD-NO-BMI2-NEXT:    orl %ebx, %edi
; X86-NO-SHLD-NO-BMI2-NEXT:    movl %eax, %ecx
; X86-NO-SHLD-NO-BMI2-NEXT:    shrl %cl, %esi
; X86-NO-SHLD-NO-BMI2-NEXT:    xorl %ecx, %ecx
; X86-NO-SHLD-NO-BMI2-NEXT:    testb $32, %al
; X86-NO-SHLD-NO-BMI2-NEXT:    cmovnel %esi, %edi
; X86-NO-SHLD-NO-BMI2-NEXT:    cmovel %esi, %ecx
; X86-NO-SHLD-NO-BMI2-NEXT:    movl %ecx, 4(%edx)
; X86-NO-SHLD-NO-BMI2-NEXT:    movl %edi, (%edx)
; X86-NO-SHLD-NO-BMI2-NEXT:    popl %esi
; X86-NO-SHLD-NO-BMI2-NEXT:    popl %edi
; X86-NO-SHLD-NO-BMI2-NEXT:    popl %ebx
; X86-NO-SHLD-NO-BMI2-NEXT:    retl
;
; X86-HAVE-SHLD-NO-BMI2-LABEL: lshr_8bytes:
; X86-HAVE-SHLD-NO-BMI2:       # %bb.0:
; X86-HAVE-SHLD-NO-BMI2-NEXT:    pushl %edi
; X86-HAVE-SHLD-NO-BMI2-NEXT:    pushl %esi
; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %esi
; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl (%esi), %edx
; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl 4(%esi), %esi
; X86-HAVE-SHLD-NO-BMI2-NEXT:    movzbl (%ecx), %ecx
; X86-HAVE-SHLD-NO-BMI2-NEXT:    shlb $3, %cl
; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl %esi, %edi
; X86-HAVE-SHLD-NO-BMI2-NEXT:    shrl %cl, %edi
; X86-HAVE-SHLD-NO-BMI2-NEXT:    shrdl %cl, %esi, %edx
; X86-HAVE-SHLD-NO-BMI2-NEXT:    xorl %esi, %esi
; X86-HAVE-SHLD-NO-BMI2-NEXT:    testb $32, %cl
; X86-HAVE-SHLD-NO-BMI2-NEXT:    cmovnel %edi, %edx
; X86-HAVE-SHLD-NO-BMI2-NEXT:    cmovel %edi, %esi
; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl %esi, 4(%eax)
; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl %edx, (%eax)
; X86-HAVE-SHLD-NO-BMI2-NEXT:    popl %esi
; X86-HAVE-SHLD-NO-BMI2-NEXT:    popl %edi
; X86-HAVE-SHLD-NO-BMI2-NEXT:    retl
;
; X86-NO-SHLD-HAVE-BMI2-LABEL: lshr_8bytes:
; X86-NO-SHLD-HAVE-BMI2:       # %bb.0:
; X86-NO-SHLD-HAVE-BMI2-NEXT:    pushl %ebx
; X86-NO-SHLD-HAVE-BMI2-NEXT:    pushl %edi
; X86-NO-SHLD-HAVE-BMI2-NEXT:    pushl %esi
; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl 4(%edx), %esi
; X86-NO-SHLD-HAVE-BMI2-NEXT:    movzbl (%ecx), %ecx
; X86-NO-SHLD-HAVE-BMI2-NEXT:    shlb $3, %cl
; X86-NO-SHLD-HAVE-BMI2-NEXT:    shrxl %ecx, (%edx), %edx
; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl %ecx, %ebx
; X86-NO-SHLD-HAVE-BMI2-NEXT:    notb %bl
; X86-NO-SHLD-HAVE-BMI2-NEXT:    leal (%esi,%esi), %edi
; X86-NO-SHLD-HAVE-BMI2-NEXT:    shlxl %ebx, %edi, %edi
; X86-NO-SHLD-HAVE-BMI2-NEXT:    orl %edx, %edi
; X86-NO-SHLD-HAVE-BMI2-NEXT:    shrxl %ecx, %esi, %edx
; X86-NO-SHLD-HAVE-BMI2-NEXT:    xorl %esi, %esi
; X86-NO-SHLD-HAVE-BMI2-NEXT:    testb $32, %cl
; X86-NO-SHLD-HAVE-BMI2-NEXT:    cmovnel %edx, %edi
; X86-NO-SHLD-HAVE-BMI2-NEXT:    cmovel %edx, %esi
; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl %esi, 4(%eax)
; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl %edi, (%eax)
; X86-NO-SHLD-HAVE-BMI2-NEXT:    popl %esi
; X86-NO-SHLD-HAVE-BMI2-NEXT:    popl %edi
; X86-NO-SHLD-HAVE-BMI2-NEXT:    popl %ebx
; X86-NO-SHLD-HAVE-BMI2-NEXT:    retl
;
; X86-HAVE-SHLD-HAVE-BMI2-LABEL: lshr_8bytes:
; X86-HAVE-SHLD-HAVE-BMI2:       # %bb.0:
; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    pushl %edi
; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    pushl %esi
; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %esi
; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl (%esi), %edx
; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl 4(%esi), %esi
; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movzbl (%ecx), %ecx
; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    shlb $3, %cl
; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    shrdl %cl, %esi, %edx
; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    shrxl %ecx, %esi, %esi
; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    xorl %edi, %edi
; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    testb $32, %cl
; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    cmovnel %esi, %edx
; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    cmovel %esi, %edi
; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl %edi, 4(%eax)
; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl %edx, (%eax)
; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    popl %esi
; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    popl %edi
; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    retl
  %src = load i64, ptr %src.ptr, align 1
  %byteOff = load i64, ptr %byteOff.ptr, align 1
  ; Convert the byte offset into a bit count (multiply by 8).
  %bitOff = shl i64 %byteOff, 3
  %res = lshr i64 %src, %bitOff
  store i64 %res, ptr %dst, align 1
  ret void
}
; Left shift of an 8-byte value by a byte-granular amount.
; Loads an i64 from %src.ptr and an i64 byte offset from %byteOff.ptr (both
; align 1), scales the offset to a bit count with `shl 3`, applies `shl`, and
; stores the i64 result to %dst (align 1).
; Expected assembly, as recorded below:
;  * x86_64 targets use a single 64-bit shift (shlq by %cl, or shlxq with a
;    folded memory operand when BMI2 is enabled).
;  * i686 targets split the value into two 32-bit halves. The high result
;    word is formed either with shldl (configurations run with -slow-shld) or
;    with a shift / `shrl` by one / notb / shift / orl sequence (+slow-shld).
;    The shifted low half is computed separately, and `testb $32` on the bit
;    count drives a cmovne/cmove pair that picks between the combined word
;    and the shifted low half for the high word, and between the shifted low
;    half and zero (xorl) for the low word.
define void @shl_8bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-NO-BMI2-LABEL: shl_8bytes:
; X64-NO-BMI2:       # %bb.0:
; X64-NO-BMI2-NEXT:    movq (%rdi), %rax
; X64-NO-BMI2-NEXT:    movzbl (%rsi), %ecx
; X64-NO-BMI2-NEXT:    shlb $3, %cl
; X64-NO-BMI2-NEXT:    shlq %cl, %rax
; X64-NO-BMI2-NEXT:    movq %rax, (%rdx)
; X64-NO-BMI2-NEXT:    retq
;
; X64-HAVE-BMI2-LABEL: shl_8bytes:
; X64-HAVE-BMI2:       # %bb.0:
; X64-HAVE-BMI2-NEXT:    movzbl (%rsi), %eax
; X64-HAVE-BMI2-NEXT:    shlb $3, %al
; X64-HAVE-BMI2-NEXT:    shlxq %rax, (%rdi), %rax
; X64-HAVE-BMI2-NEXT:    movq %rax, (%rdx)
; X64-HAVE-BMI2-NEXT:    retq
;
; X86-NO-SHLD-NO-BMI2-LABEL: shl_8bytes:
; X86-NO-SHLD-NO-BMI2:       # %bb.0:
; X86-NO-SHLD-NO-BMI2-NEXT:    pushl %ebx
; X86-NO-SHLD-NO-BMI2-NEXT:    pushl %edi
; X86-NO-SHLD-NO-BMI2-NEXT:    pushl %esi
; X86-NO-SHLD-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NO-SHLD-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NO-SHLD-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NO-SHLD-NO-BMI2-NEXT:    movl (%ecx), %esi
; X86-NO-SHLD-NO-BMI2-NEXT:    movl 4(%ecx), %ebx
; X86-NO-SHLD-NO-BMI2-NEXT:    movzbl (%eax), %eax
; X86-NO-SHLD-NO-BMI2-NEXT:    shlb $3, %al
; X86-NO-SHLD-NO-BMI2-NEXT:    movl %eax, %ecx
; X86-NO-SHLD-NO-BMI2-NEXT:    shll %cl, %ebx
; X86-NO-SHLD-NO-BMI2-NEXT:    movl %esi, %edi
; X86-NO-SHLD-NO-BMI2-NEXT:    shrl %edi
; X86-NO-SHLD-NO-BMI2-NEXT:    notb %cl
; X86-NO-SHLD-NO-BMI2-NEXT:    shrl %cl, %edi
; X86-NO-SHLD-NO-BMI2-NEXT:    orl %ebx, %edi
; X86-NO-SHLD-NO-BMI2-NEXT:    movl %eax, %ecx
; X86-NO-SHLD-NO-BMI2-NEXT:    shll %cl, %esi
; X86-NO-SHLD-NO-BMI2-NEXT:    xorl %ecx, %ecx
; X86-NO-SHLD-NO-BMI2-NEXT:    testb $32, %al
; X86-NO-SHLD-NO-BMI2-NEXT:    cmovnel %esi, %edi
; X86-NO-SHLD-NO-BMI2-NEXT:    cmovel %esi, %ecx
; X86-NO-SHLD-NO-BMI2-NEXT:    movl %ecx, (%edx)
; X86-NO-SHLD-NO-BMI2-NEXT:    movl %edi, 4(%edx)
; X86-NO-SHLD-NO-BMI2-NEXT:    popl %esi
; X86-NO-SHLD-NO-BMI2-NEXT:    popl %edi
; X86-NO-SHLD-NO-BMI2-NEXT:    popl %ebx
; X86-NO-SHLD-NO-BMI2-NEXT:    retl
;
; X86-HAVE-SHLD-NO-BMI2-LABEL: shl_8bytes:
; X86-HAVE-SHLD-NO-BMI2:       # %bb.0:
; X86-HAVE-SHLD-NO-BMI2-NEXT:    pushl %edi
; X86-HAVE-SHLD-NO-BMI2-NEXT:    pushl %esi
; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl (%edx), %esi
; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl 4(%edx), %edx
; X86-HAVE-SHLD-NO-BMI2-NEXT:    movzbl (%ecx), %ecx
; X86-HAVE-SHLD-NO-BMI2-NEXT:    shlb $3, %cl
; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl %esi, %edi
; X86-HAVE-SHLD-NO-BMI2-NEXT:    shll %cl, %edi
; X86-HAVE-SHLD-NO-BMI2-NEXT:    shldl %cl, %esi, %edx
; X86-HAVE-SHLD-NO-BMI2-NEXT:    xorl %esi, %esi
; X86-HAVE-SHLD-NO-BMI2-NEXT:    testb $32, %cl
; X86-HAVE-SHLD-NO-BMI2-NEXT:    cmovnel %edi, %edx
; X86-HAVE-SHLD-NO-BMI2-NEXT:    cmovel %edi, %esi
; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl %edx, 4(%eax)
; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl %esi, (%eax)
; X86-HAVE-SHLD-NO-BMI2-NEXT:    popl %esi
; X86-HAVE-SHLD-NO-BMI2-NEXT:    popl %edi
; X86-HAVE-SHLD-NO-BMI2-NEXT:    retl
;
; X86-NO-SHLD-HAVE-BMI2-LABEL: shl_8bytes:
; X86-NO-SHLD-HAVE-BMI2:       # %bb.0:
; X86-NO-SHLD-HAVE-BMI2-NEXT:    pushl %ebx
; X86-NO-SHLD-HAVE-BMI2-NEXT:    pushl %edi
; X86-NO-SHLD-HAVE-BMI2-NEXT:    pushl %esi
; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl (%edx), %esi
; X86-NO-SHLD-HAVE-BMI2-NEXT:    movzbl (%ecx), %ecx
; X86-NO-SHLD-HAVE-BMI2-NEXT:    shlb $3, %cl
; X86-NO-SHLD-HAVE-BMI2-NEXT:    shlxl %ecx, 4(%edx), %edx
; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl %ecx, %ebx
; X86-NO-SHLD-HAVE-BMI2-NEXT:    notb %bl
; X86-NO-SHLD-HAVE-BMI2-NEXT:    shlxl %ecx, %esi, %edi
; X86-NO-SHLD-HAVE-BMI2-NEXT:    shrl %esi
; X86-NO-SHLD-HAVE-BMI2-NEXT:    shrxl %ebx, %esi, %esi
; X86-NO-SHLD-HAVE-BMI2-NEXT:    orl %edx, %esi
; X86-NO-SHLD-HAVE-BMI2-NEXT:    xorl %edx, %edx
; X86-NO-SHLD-HAVE-BMI2-NEXT:    testb $32, %cl
; X86-NO-SHLD-HAVE-BMI2-NEXT:    cmovnel %edi, %esi
; X86-NO-SHLD-HAVE-BMI2-NEXT:    cmovel %edi, %edx
; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl %edx, (%eax)
; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl %esi, 4(%eax)
; X86-NO-SHLD-HAVE-BMI2-NEXT:    popl %esi
; X86-NO-SHLD-HAVE-BMI2-NEXT:    popl %edi
; X86-NO-SHLD-HAVE-BMI2-NEXT:    popl %ebx
; X86-NO-SHLD-HAVE-BMI2-NEXT:    retl
;
; X86-HAVE-SHLD-HAVE-BMI2-LABEL: shl_8bytes:
; X86-HAVE-SHLD-HAVE-BMI2:       # %bb.0:
; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    pushl %edi
; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    pushl %esi
; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl (%edx), %esi
; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl 4(%edx), %edx
; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movzbl (%ecx), %ecx
; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    shlb $3, %cl
; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    shldl %cl, %esi, %edx
; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    shlxl %ecx, %esi, %esi
; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    xorl %edi, %edi
; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    testb $32, %cl
; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    cmovnel %esi, %edx
; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    cmovel %esi, %edi
; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl %edx, 4(%eax)
; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl %edi, (%eax)
; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    popl %esi
; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    popl %edi
; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    retl
  %src = load i64, ptr %src.ptr, align 1
  %byteOff = load i64, ptr %byteOff.ptr, align 1
  ; Convert the byte offset into a bit count (multiply by 8).
  %bitOff = shl i64 %byteOff, 3
  %res = shl i64 %src, %bitOff
  store i64 %res, ptr %dst, align 1
  ret void
}
; Arithmetic right shift of an 8-byte value by a byte-granular amount.
; Loads an i64 from %src.ptr and an i64 byte offset from %byteOff.ptr (both
; align 1), scales the offset to a bit count with `shl 3`, applies `ashr`, and
; stores the i64 result to %dst (align 1).
; Expected assembly, as recorded below:
;  * x86_64 targets use a single 64-bit shift (sarq by %cl, or sarxq with a
;    folded memory operand when BMI2 is enabled).
;  * i686 targets split the value into two 32-bit halves. The low result word
;    is formed either with shrdl (configurations run with -slow-shld) or with
;    a shift / `leal (hi,hi)` / notb / shift / orl sequence (+slow-shld). The
;    high half is shifted arithmetically (sarl / sarxl), and `testb $32` on
;    the bit count drives a cmovne/cmove pair that picks between the combined
;    word and the shifted high half for the low word, and between the shifted
;    high half and the `sarl $31` sign fill for the high word.
define void @ashr_8bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-NO-BMI2-LABEL: ashr_8bytes:
; X64-NO-BMI2:       # %bb.0:
; X64-NO-BMI2-NEXT:    movq (%rdi), %rax
; X64-NO-BMI2-NEXT:    movzbl (%rsi), %ecx
; X64-NO-BMI2-NEXT:    shlb $3, %cl
; X64-NO-BMI2-NEXT:    sarq %cl, %rax
; X64-NO-BMI2-NEXT:    movq %rax, (%rdx)
; X64-NO-BMI2-NEXT:    retq
;
; X64-HAVE-BMI2-LABEL: ashr_8bytes:
; X64-HAVE-BMI2:       # %bb.0:
; X64-HAVE-BMI2-NEXT:    movzbl (%rsi), %eax
; X64-HAVE-BMI2-NEXT:    shlb $3, %al
; X64-HAVE-BMI2-NEXT:    sarxq %rax, (%rdi), %rax
; X64-HAVE-BMI2-NEXT:    movq %rax, (%rdx)
; X64-HAVE-BMI2-NEXT:    retq
;
; X86-NO-SHLD-NO-BMI2-LABEL: ashr_8bytes:
; X86-NO-SHLD-NO-BMI2:       # %bb.0:
; X86-NO-SHLD-NO-BMI2-NEXT:    pushl %ebx
; X86-NO-SHLD-NO-BMI2-NEXT:    pushl %edi
; X86-NO-SHLD-NO-BMI2-NEXT:    pushl %esi
; X86-NO-SHLD-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NO-SHLD-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NO-SHLD-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NO-SHLD-NO-BMI2-NEXT:    movl (%ecx), %ebx
; X86-NO-SHLD-NO-BMI2-NEXT:    movl 4(%ecx), %esi
; X86-NO-SHLD-NO-BMI2-NEXT:    movzbl (%eax), %eax
; X86-NO-SHLD-NO-BMI2-NEXT:    shlb $3, %al
; X86-NO-SHLD-NO-BMI2-NEXT:    movl %eax, %ecx
; X86-NO-SHLD-NO-BMI2-NEXT:    shrl %cl, %ebx
; X86-NO-SHLD-NO-BMI2-NEXT:    leal (%esi,%esi), %edi
; X86-NO-SHLD-NO-BMI2-NEXT:    notb %cl
; X86-NO-SHLD-NO-BMI2-NEXT:    shll %cl, %edi
; X86-NO-SHLD-NO-BMI2-NEXT:    orl %ebx, %edi
; X86-NO-SHLD-NO-BMI2-NEXT:    movl %esi, %ebx
; X86-NO-SHLD-NO-BMI2-NEXT:    movl %eax, %ecx
; X86-NO-SHLD-NO-BMI2-NEXT:    sarl %cl, %ebx
; X86-NO-SHLD-NO-BMI2-NEXT:    sarl $31, %esi
; X86-NO-SHLD-NO-BMI2-NEXT:    testb $32, %al
; X86-NO-SHLD-NO-BMI2-NEXT:    cmovnel %ebx, %edi
; X86-NO-SHLD-NO-BMI2-NEXT:    cmovel %ebx, %esi
; X86-NO-SHLD-NO-BMI2-NEXT:    movl %esi, 4(%edx)
; X86-NO-SHLD-NO-BMI2-NEXT:    movl %edi, (%edx)
; X86-NO-SHLD-NO-BMI2-NEXT:    popl %esi
; X86-NO-SHLD-NO-BMI2-NEXT:    popl %edi
; X86-NO-SHLD-NO-BMI2-NEXT:    popl %ebx
; X86-NO-SHLD-NO-BMI2-NEXT:    retl
;
; X86-HAVE-SHLD-NO-BMI2-LABEL: ashr_8bytes:
; X86-HAVE-SHLD-NO-BMI2:       # %bb.0:
; X86-HAVE-SHLD-NO-BMI2-NEXT:    pushl %edi
; X86-HAVE-SHLD-NO-BMI2-NEXT:    pushl %esi
; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %esi
; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl (%esi), %edx
; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl 4(%esi), %esi
; X86-HAVE-SHLD-NO-BMI2-NEXT:    movzbl (%ecx), %ecx
; X86-HAVE-SHLD-NO-BMI2-NEXT:    shlb $3, %cl
; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl %esi, %edi
; X86-HAVE-SHLD-NO-BMI2-NEXT:    sarl %cl, %edi
; X86-HAVE-SHLD-NO-BMI2-NEXT:    shrdl %cl, %esi, %edx
; X86-HAVE-SHLD-NO-BMI2-NEXT:    sarl $31, %esi
; X86-HAVE-SHLD-NO-BMI2-NEXT:    testb $32, %cl
; X86-HAVE-SHLD-NO-BMI2-NEXT:    cmovnel %edi, %edx
; X86-HAVE-SHLD-NO-BMI2-NEXT:    cmovel %edi, %esi
; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl %esi, 4(%eax)
; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl %edx, (%eax)
; X86-HAVE-SHLD-NO-BMI2-NEXT:    popl %esi
; X86-HAVE-SHLD-NO-BMI2-NEXT:    popl %edi
; X86-HAVE-SHLD-NO-BMI2-NEXT:    retl
;
; X86-NO-SHLD-HAVE-BMI2-LABEL: ashr_8bytes:
; X86-NO-SHLD-HAVE-BMI2:       # %bb.0:
; X86-NO-SHLD-HAVE-BMI2-NEXT:    pushl %ebx
; X86-NO-SHLD-HAVE-BMI2-NEXT:    pushl %edi
; X86-NO-SHLD-HAVE-BMI2-NEXT:    pushl %esi
; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %esi
; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl 4(%esi), %ecx
; X86-NO-SHLD-HAVE-BMI2-NEXT:    movzbl (%edx), %edx
; X86-NO-SHLD-HAVE-BMI2-NEXT:    shlb $3, %dl
; X86-NO-SHLD-HAVE-BMI2-NEXT:    shrxl %edx, (%esi), %esi
; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl %edx, %ebx
; X86-NO-SHLD-HAVE-BMI2-NEXT:    notb %bl
; X86-NO-SHLD-HAVE-BMI2-NEXT:    leal (%ecx,%ecx), %edi
; X86-NO-SHLD-HAVE-BMI2-NEXT:    shlxl %ebx, %edi, %edi
; X86-NO-SHLD-HAVE-BMI2-NEXT:    orl %esi, %edi
; X86-NO-SHLD-HAVE-BMI2-NEXT:    sarxl %edx, %ecx, %esi
; X86-NO-SHLD-HAVE-BMI2-NEXT:    sarl $31, %ecx
; X86-NO-SHLD-HAVE-BMI2-NEXT:    testb $32, %dl
; X86-NO-SHLD-HAVE-BMI2-NEXT:    cmovnel %esi, %edi
; X86-NO-SHLD-HAVE-BMI2-NEXT:    cmovel %esi, %ecx
; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl %ecx, 4(%eax)
; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl %edi, (%eax)
; X86-NO-SHLD-HAVE-BMI2-NEXT:    popl %esi
; X86-NO-SHLD-HAVE-BMI2-NEXT:    popl %edi
; X86-NO-SHLD-HAVE-BMI2-NEXT:    popl %ebx
; X86-NO-SHLD-HAVE-BMI2-NEXT:    retl
;
; X86-HAVE-SHLD-HAVE-BMI2-LABEL: ashr_8bytes:
; X86-HAVE-SHLD-HAVE-BMI2:       # %bb.0:
; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    pushl %edi
; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    pushl %esi
; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %esi
; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl (%esi), %edx
; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl 4(%esi), %esi
; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movzbl (%ecx), %ecx
; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    shlb $3, %cl
; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    shrdl %cl, %esi, %edx
; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    sarxl %ecx, %esi, %edi
; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    sarl $31, %esi
; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    testb $32, %cl
; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    cmovnel %edi, %edx
; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    cmovel %edi, %esi
; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl %esi, 4(%eax)
; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl %edx, (%eax)
; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    popl %esi
; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    popl %edi
; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    retl
  %src = load i64, ptr %src.ptr, align 1
  %byteOff = load i64, ptr %byteOff.ptr, align 1
  ; Convert the byte offset into a bit count (multiply by 8).
  %bitOff = shl i64 %byteOff, 3
  %res = ashr i64 %src, %bitOff
  store i64 %res, ptr %dst, align 1
  ret void
}

define void @lshr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-NO-SHLD-NO-BMI2-LABEL: lshr_16bytes:
; X64-NO-SHLD-NO-BMI2:       # %bb.0:
; X64-NO-SHLD-NO-BMI2-NEXT:    movq (%rdi), %r8
; X64-NO-SHLD-NO-BMI2-NEXT:    movq 8(%rdi), %rdi
; X64-NO-SHLD-NO-BMI2-NEXT:    movzbl (%rsi), %eax
; X64-NO-SHLD-NO-BMI2-NEXT:    shlb $3, %al
; X64-NO-SHLD-NO-BMI2-NEXT:    movl %eax, %ecx
; X64-NO-SHLD-NO-BMI2-NEXT:    shrq %cl, %r8
; X64-NO-SHLD-NO-BMI2-NEXT:    leaq (%rdi,%rdi), %rsi
; X64-NO-SHLD-NO-BMI2-NEXT:    notb %cl
; X64-NO-SHLD-NO-BMI2-NEXT:    shlq %cl, %rsi
; X64-NO-SHLD-NO-BMI2-NEXT:    orq %r8, %rsi
; X64-NO-SHLD-NO-BMI2-NEXT:    movl %eax, %ecx
; X64-NO-SHLD-NO-BMI2-NEXT:    shrq %cl, %rdi
; X64-NO-SHLD-NO-BMI2-NEXT:    xorl %ecx, %ecx
; X64-NO-SHLD-NO-BMI2-NEXT:    testb $64, %al
; X64-NO-SHLD-NO-BMI2-NEXT:    cmovneq %rdi, %rsi
; X64-NO-SHLD-NO-BMI2-NEXT:    cmoveq %rdi, %rcx
; X64-NO-SHLD-NO-BMI2-NEXT:    movq %rcx, 8(%rdx)
; X64-NO-SHLD-NO-BMI2-NEXT:    movq %rsi, (%rdx)
; X64-NO-SHLD-NO-BMI2-NEXT:    retq
;
; X64-HAVE-SHLD-NO-BMI2-LABEL: lshr_16bytes:
; X64-HAVE-SHLD-NO-BMI2:       # %bb.0:
; X64-HAVE-SHLD-NO-BMI2-NEXT:    movq (%rdi), %rax
; X64-HAVE-SHLD-NO-BMI2-NEXT:    movq 8(%rdi), %rdi
; X64-HAVE-SHLD-NO-BMI2-NEXT:    movzbl (%rsi), %ecx
; X64-HAVE-SHLD-NO-BMI2-NEXT:    shlb $3, %cl
; X64-HAVE-SHLD-NO-BMI2-NEXT:    movq %rdi, %rsi
; X64-HAVE-SHLD-NO-BMI2-NEXT:    shrq %cl, %rsi
; X64-HAVE-SHLD-NO-BMI2-NEXT:    shrdq %cl, %rdi, %rax
; X64-HAVE-SHLD-NO-BMI2-NEXT:    xorl %edi, %edi
; X64-HAVE-SHLD-NO-BMI2-NEXT:    testb $64, %cl
; X64-HAVE-SHLD-NO-BMI2-NEXT:    cmovneq %rsi, %rax
; X64-HAVE-SHLD-NO-BMI2-NEXT:    cmoveq %rsi, %rdi
; X64-HAVE-SHLD-NO-BMI2-NEXT:    movq %rdi, 8(%rdx)
; X64-HAVE-SHLD-NO-BMI2-NEXT:    movq %rax, (%rdx)
; X64-HAVE-SHLD-NO-BMI2-NEXT:    retq
;
; X64-NO-SHLD-HAVE-BMI2-LABEL: lshr_16bytes:
; X64-NO-SHLD-HAVE-BMI2:       # %bb.0:
; X64-NO-SHLD-HAVE-BMI2-NEXT:    movq 8(%rdi), %rax
; X64-NO-SHLD-HAVE-BMI2-NEXT:    movzbl (%rsi), %ecx
; X64-NO-SHLD-HAVE-BMI2-NEXT:    shlb $3, %cl
; X64-NO-SHLD-HAVE-BMI2-NEXT:    shrxq %rcx, (%rdi), %rsi
; X64-NO-SHLD-HAVE-BMI2-NEXT:    movl %ecx, %edi
; X64-NO-SHLD-HAVE-BMI2-NEXT:    notb %dil
; X64-NO-SHLD-HAVE-BMI2-NEXT:    leaq (%rax,%rax), %r8
; X64-NO-SHLD-HAVE-BMI2-NEXT:    shlxq %rdi, %r8, %rdi
; X64-NO-SHLD-HAVE-BMI2-NEXT:    orq %rsi, %rdi
; X64-NO-SHLD-HAVE-BMI2-NEXT:    shrxq %rcx, %rax, %rax
; X64-NO-SHLD-HAVE-BMI2-NEXT:    xorl %esi, %esi
; X64-NO-SHLD-HAVE-BMI2-NEXT:    testb $64, %cl
; X64-NO-SHLD-HAVE-BMI2-NEXT:    cmovneq %rax, %rdi
; X64-NO-SHLD-HAVE-BMI2-NEXT:    cmoveq %rax, %rsi
; X64-NO-SHLD-HAVE-BMI2-NEXT:    movq %rsi, 8(%rdx)
; X64-NO-SHLD-HAVE-BMI2-NEXT:    movq %rdi, (%rdx)
; X64-NO-SHLD-HAVE-BMI2-NEXT:    retq
;
; X64-HAVE-SHLD-HAVE-BMI2-LABEL: lshr_16bytes:
; X64-HAVE-SHLD-HAVE-BMI2:       # %bb.0:
; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    movq (%rdi), %rax
; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    movq 8(%rdi), %rdi
; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    movzbl (%rsi), %ecx
; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    shlb $3, %cl
; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    shrdq %cl, %rdi, %rax
; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    shrxq %rcx, %rdi, %rsi
; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    xorl %edi, %edi
; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    testb $64, %cl
; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    cmovneq %rsi, %rax
; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    cmoveq %rsi, %rdi
; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    movq %rdi, 8(%rdx)
; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    movq %rax, (%rdx)
; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    retq
;
; FALLBACK16-LABEL: lshr_16bytes:
; FALLBACK16:       # %bb.0:
; FALLBACK16-NEXT:    pushl %ebp
; FALLBACK16-NEXT:    pushl %ebx
; FALLBACK16-NEXT:    pushl %edi
; FALLBACK16-NEXT:    pushl %esi
; FALLBACK16-NEXT:    subl $60, %esp
; FALLBACK16-NEXT:    movl {{[0-9]+}}(%esp), %eax
; FALLBACK16-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; FALLBACK16-NEXT:    movl (%ecx), %edx
; FALLBACK16-NEXT:    movl 4(%ecx), %esi
; FALLBACK16-NEXT:    movl 8(%ecx), %edi
; FALLBACK16-NEXT:    movl 12(%ecx), %ecx
; FALLBACK16-NEXT:    movb (%eax), %ah
; FALLBACK16-NEXT:    movb %ah, %al
; FALLBACK16-NEXT:    shlb $3, %al
; FALLBACK16-NEXT:    xorps %xmm0, %xmm0
; FALLBACK16-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK16-NEXT:    movl %edi, {{[0-9]+}}(%esp)
; FALLBACK16-NEXT:    movl %esi, {{[0-9]+}}(%esp)
; FALLBACK16-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK16-NEXT:    andb $12, %ah
; FALLBACK16-NEXT:    movzbl %ah, %ebp
; FALLBACK16-NEXT:    movl 20(%esp,%ebp), %esi
; FALLBACK16-NEXT:    movl %esi, %ebx
; FALLBACK16-NEXT:    movl %eax, %ecx
; FALLBACK16-NEXT:    shrl %cl, %ebx
; FALLBACK16-NEXT:    movl %eax, %edx
; FALLBACK16-NEXT:    notb %dl
; FALLBACK16-NEXT:    movl 24(%esp,%ebp), %ecx
; FALLBACK16-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK16-NEXT:    leal (%ecx,%ecx), %edi
; FALLBACK16-NEXT:    movl %edx, %ecx
; FALLBACK16-NEXT:    shll %cl, %edi
; FALLBACK16-NEXT:    orl %ebx, %edi
; FALLBACK16-NEXT:    movl 16(%esp,%ebp), %ebx
; FALLBACK16-NEXT:    movl %eax, %ecx
; FALLBACK16-NEXT:    shrl %cl, %ebx
; FALLBACK16-NEXT:    addl %esi, %esi
; FALLBACK16-NEXT:    movl %edx, %ecx
; FALLBACK16-NEXT:    shll %cl, %esi
; FALLBACK16-NEXT:    orl %ebx, %esi
; FALLBACK16-NEXT:    movl %eax, %ecx
; FALLBACK16-NEXT:    shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; FALLBACK16-NEXT:    movl 28(%esp,%ebp), %ebx
; FALLBACK16-NEXT:    leal (%ebx,%ebx), %ebp
; FALLBACK16-NEXT:    movl %edx, %ecx
; FALLBACK16-NEXT:    shll %cl, %ebp
; FALLBACK16-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
; FALLBACK16-NEXT:    movl {{[0-9]+}}(%esp), %edx
; FALLBACK16-NEXT:    movl %eax, %ecx
; FALLBACK16-NEXT:    shrl %cl, %ebx
; FALLBACK16-NEXT:    movl %ebx, 12(%edx)
; FALLBACK16-NEXT:    movl %ebp, 8(%edx)
; FALLBACK16-NEXT:    movl %esi, (%edx)
; FALLBACK16-NEXT:    movl %edi, 4(%edx)
; FALLBACK16-NEXT:    addl $60, %esp
; FALLBACK16-NEXT:    popl %esi
; FALLBACK16-NEXT:    popl %edi
; FALLBACK16-NEXT:    popl %ebx
; FALLBACK16-NEXT:    popl %ebp
; FALLBACK16-NEXT:    retl
;
; FALLBACK17-LABEL: lshr_16bytes:
; FALLBACK17:       # %bb.0:
; FALLBACK17-NEXT:    pushl %ebp
; FALLBACK17-NEXT:    pushl %ebx
; FALLBACK17-NEXT:    pushl %edi
; FALLBACK17-NEXT:    pushl %esi
; FALLBACK17-NEXT:    subl $44, %esp
; FALLBACK17-NEXT:    movl {{[0-9]+}}(%esp), %eax
; FALLBACK17-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; FALLBACK17-NEXT:    movl {{[0-9]+}}(%esp), %edx
; FALLBACK17-NEXT:    movl (%edx), %esi
; FALLBACK17-NEXT:    movl 4(%edx), %edi
; FALLBACK17-NEXT:    movl 8(%edx), %ebx
; FALLBACK17-NEXT:    movl 12(%edx), %edx
; FALLBACK17-NEXT:    movb (%ecx), %ch
; FALLBACK17-NEXT:    movb %ch, %cl
; FALLBACK17-NEXT:    shlb $3, %cl
; FALLBACK17-NEXT:    xorps %xmm0, %xmm0
; FALLBACK17-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
; FALLBACK17-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK17-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
; FALLBACK17-NEXT:    movl %edi, {{[0-9]+}}(%esp)
; FALLBACK17-NEXT:    movl %esi, (%esp)
; FALLBACK17-NEXT:    andb $12, %ch
; FALLBACK17-NEXT:    movzbl %ch, %ebx
; FALLBACK17-NEXT:    movl 8(%esp,%ebx), %esi
; FALLBACK17-NEXT:    movl (%esp,%ebx), %edx
; FALLBACK17-NEXT:    movl 4(%esp,%ebx), %ebp
; FALLBACK17-NEXT:    movl %ebp, %edi
; FALLBACK17-NEXT:    shrdl %cl, %esi, %edi
; FALLBACK17-NEXT:    movl 12(%esp,%ebx), %ebx
; FALLBACK17-NEXT:    shrdl %cl, %ebx, %esi
; FALLBACK17-NEXT:    shrdl %cl, %ebp, %edx
; FALLBACK17-NEXT:    shrl %cl, %ebx
; FALLBACK17-NEXT:    movl %esi, 8(%eax)
; FALLBACK17-NEXT:    movl %ebx, 12(%eax)
; FALLBACK17-NEXT:    movl %edx, (%eax)
; FALLBACK17-NEXT:    movl %edi, 4(%eax)
; FALLBACK17-NEXT:    addl $44, %esp
; FALLBACK17-NEXT:    popl %esi
; FALLBACK17-NEXT:    popl %edi
; FALLBACK17-NEXT:    popl %ebx
; FALLBACK17-NEXT:    popl %ebp
; FALLBACK17-NEXT:    retl
;
; FALLBACK18-LABEL: lshr_16bytes:
; FALLBACK18:       # %bb.0:
; FALLBACK18-NEXT:    pushl %ebp
; FALLBACK18-NEXT:    pushl %ebx
; FALLBACK18-NEXT:    pushl %edi
; FALLBACK18-NEXT:    pushl %esi
; FALLBACK18-NEXT:    subl $44, %esp
; FALLBACK18-NEXT:    movl {{[0-9]+}}(%esp), %eax
; FALLBACK18-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; FALLBACK18-NEXT:    movl (%ecx), %edx
; FALLBACK18-NEXT:    movl 4(%ecx), %esi
; FALLBACK18-NEXT:    movl 8(%ecx), %edi
; FALLBACK18-NEXT:    movl 12(%ecx), %ecx
; FALLBACK18-NEXT:    movzbl (%eax), %ebx
; FALLBACK18-NEXT:    movl %ebx, %eax
; FALLBACK18-NEXT:    shlb $3, %al
; FALLBACK18-NEXT:    xorps %xmm0, %xmm0
; FALLBACK18-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
; FALLBACK18-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK18-NEXT:    movl %edi, {{[0-9]+}}(%esp)
; FALLBACK18-NEXT:    movl %esi, {{[0-9]+}}(%esp)
; FALLBACK18-NEXT:    movl %edx, (%esp)
; FALLBACK18-NEXT:    andb $12, %bl
; FALLBACK18-NEXT:    movzbl %bl, %esi
; FALLBACK18-NEXT:    movl 4(%esp,%esi), %edi
; FALLBACK18-NEXT:    movl 8(%esp,%esi), %ebx
; FALLBACK18-NEXT:    shrxl %eax, %edi, %ebp
; FALLBACK18-NEXT:    movl %eax, %edx
; FALLBACK18-NEXT:    notb %dl
; FALLBACK18-NEXT:    leal (%ebx,%ebx), %ecx
; FALLBACK18-NEXT:    shlxl %edx, %ecx, %ecx
; FALLBACK18-NEXT:    orl %ebp, %ecx
; FALLBACK18-NEXT:    shrxl %eax, (%esp,%esi), %ebp
; FALLBACK18-NEXT:    addl %edi, %edi
; FALLBACK18-NEXT:    shlxl %edx, %edi, %edi
; FALLBACK18-NEXT:    orl %ebp, %edi
; FALLBACK18-NEXT:    shrxl %eax, %ebx, %ebx
; FALLBACK18-NEXT:    movl 12(%esp,%esi), %esi
; FALLBACK18-NEXT:    shrxl %eax, %esi, %eax
; FALLBACK18-NEXT:    addl %esi, %esi
; FALLBACK18-NEXT:    shlxl %edx, %esi, %edx
; FALLBACK18-NEXT:    orl %ebx, %edx
; FALLBACK18-NEXT:    movl {{[0-9]+}}(%esp), %esi
; FALLBACK18-NEXT:    movl %eax, 12(%esi)
; FALLBACK18-NEXT:    movl %edx, 8(%esi)
; FALLBACK18-NEXT:    movl %edi, (%esi)
; FALLBACK18-NEXT:    movl %ecx, 4(%esi)
; FALLBACK18-NEXT:    addl $44, %esp
; FALLBACK18-NEXT:    popl %esi
; FALLBACK18-NEXT:    popl %edi
; FALLBACK18-NEXT:    popl %ebx
; FALLBACK18-NEXT:    popl %ebp
; FALLBACK18-NEXT:    retl
;
; FALLBACK19-LABEL: lshr_16bytes:
; FALLBACK19:       # %bb.0:
; FALLBACK19-NEXT:    pushl %ebp
; FALLBACK19-NEXT:    pushl %ebx
; FALLBACK19-NEXT:    pushl %edi
; FALLBACK19-NEXT:    pushl %esi
; FALLBACK19-NEXT:    subl $44, %esp
; FALLBACK19-NEXT:    movl {{[0-9]+}}(%esp), %ebp
; FALLBACK19-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; FALLBACK19-NEXT:    movl {{[0-9]+}}(%esp), %edx
; FALLBACK19-NEXT:    movl (%edx), %esi
; FALLBACK19-NEXT:    movl 4(%edx), %edi
; FALLBACK19-NEXT:    movl 8(%edx), %ebx
; FALLBACK19-NEXT:    movl 12(%edx), %edx
; FALLBACK19-NEXT:    movzbl (%ecx), %eax
; FALLBACK19-NEXT:    movl %eax, %ecx
; FALLBACK19-NEXT:    shlb $3, %cl
; FALLBACK19-NEXT:    xorps %xmm0, %xmm0
; FALLBACK19-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
; FALLBACK19-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK19-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
; FALLBACK19-NEXT:    movl %edi, {{[0-9]+}}(%esp)
; FALLBACK19-NEXT:    movl %esi, (%esp)
; FALLBACK19-NEXT:    andb $12, %al
; FALLBACK19-NEXT:    movzbl %al, %eax
; FALLBACK19-NEXT:    movl 8(%esp,%eax), %ebx
; FALLBACK19-NEXT:    movl (%esp,%eax), %edx
; FALLBACK19-NEXT:    movl 4(%esp,%eax), %esi
; FALLBACK19-NEXT:    movl %esi, %edi
; FALLBACK19-NEXT:    shrdl %cl, %ebx, %edi
; FALLBACK19-NEXT:    movl 12(%esp,%eax), %eax
; FALLBACK19-NEXT:    shrdl %cl, %eax, %ebx
; FALLBACK19-NEXT:    movl %ebx, 8(%ebp)
; FALLBACK19-NEXT:    shrxl %ecx, %eax, %eax
; FALLBACK19-NEXT:    movl %eax, 12(%ebp)
; FALLBACK19-NEXT:    # kill: def $cl killed $cl killed $ecx
; FALLBACK19-NEXT:    shrdl %cl, %esi, %edx
; FALLBACK19-NEXT:    movl %edx, (%ebp)
; FALLBACK19-NEXT:    movl %edi, 4(%ebp)
; FALLBACK19-NEXT:    addl $44, %esp
; FALLBACK19-NEXT:    popl %esi
; FALLBACK19-NEXT:    popl %edi
; FALLBACK19-NEXT:    popl %ebx
; FALLBACK19-NEXT:    popl %ebp
; FALLBACK19-NEXT:    retl
;
; FALLBACK20-LABEL: lshr_16bytes:
; FALLBACK20:       # %bb.0:
; FALLBACK20-NEXT:    pushl %ebp
; FALLBACK20-NEXT:    pushl %ebx
; FALLBACK20-NEXT:    pushl %edi
; FALLBACK20-NEXT:    pushl %esi
; FALLBACK20-NEXT:    subl $60, %esp
; FALLBACK20-NEXT:    movl {{[0-9]+}}(%esp), %eax
; FALLBACK20-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; FALLBACK20-NEXT:    movups (%ecx), %xmm0
; FALLBACK20-NEXT:    movzbl (%eax), %ecx
; FALLBACK20-NEXT:    movl %ecx, %eax
; FALLBACK20-NEXT:    shlb $3, %al
; FALLBACK20-NEXT:    xorps %xmm1, %xmm1
; FALLBACK20-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
; FALLBACK20-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
; FALLBACK20-NEXT:    andb $12, %cl
; FALLBACK20-NEXT:    movzbl %cl, %edi
; FALLBACK20-NEXT:    movl 16(%esp,%edi), %ebx
; FALLBACK20-NEXT:    movl 20(%esp,%edi), %esi
; FALLBACK20-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK20-NEXT:    movl %eax, %ecx
; FALLBACK20-NEXT:    shrl %cl, %ebx
; FALLBACK20-NEXT:    movl %eax, %edx
; FALLBACK20-NEXT:    notb %dl
; FALLBACK20-NEXT:    addl %esi, %esi
; FALLBACK20-NEXT:    movl %edx, %ecx
; FALLBACK20-NEXT:    shll %cl, %esi
; FALLBACK20-NEXT:    orl %ebx, %esi
; FALLBACK20-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK20-NEXT:    movl 24(%esp,%edi), %ebx
; FALLBACK20-NEXT:    movl %ebx, %esi
; FALLBACK20-NEXT:    movl %eax, %ecx
; FALLBACK20-NEXT:    shrl %cl, %esi
; FALLBACK20-NEXT:    movl 28(%esp,%edi), %edi
; FALLBACK20-NEXT:    leal (%edi,%edi), %ebp
; FALLBACK20-NEXT:    movl %edx, %ecx
; FALLBACK20-NEXT:    shll %cl, %ebp
; FALLBACK20-NEXT:    orl %esi, %ebp
; FALLBACK20-NEXT:    movl %eax, %ecx
; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; FALLBACK20-NEXT:    shrl %cl, %esi
; FALLBACK20-NEXT:    addl %ebx, %ebx
; FALLBACK20-NEXT:    movl %edx, %ecx
; FALLBACK20-NEXT:    shll %cl, %ebx
; FALLBACK20-NEXT:    orl %esi, %ebx
; FALLBACK20-NEXT:    movl {{[0-9]+}}(%esp), %edx
; FALLBACK20-NEXT:    movl %eax, %ecx
; FALLBACK20-NEXT:    shrl %cl, %edi
; FALLBACK20-NEXT:    movl %edi, 12(%edx)
; FALLBACK20-NEXT:    movl %ebx, 4(%edx)
; FALLBACK20-NEXT:    movl %ebp, 8(%edx)
; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK20-NEXT:    movl %eax, (%edx)
; FALLBACK20-NEXT:    addl $60, %esp
; FALLBACK20-NEXT:    popl %esi
; FALLBACK20-NEXT:    popl %edi
; FALLBACK20-NEXT:    popl %ebx
; FALLBACK20-NEXT:    popl %ebp
; FALLBACK20-NEXT:    retl
;
; FALLBACK21-LABEL: lshr_16bytes:
; FALLBACK21:       # %bb.0:
; FALLBACK21-NEXT:    pushl %ebp
; FALLBACK21-NEXT:    pushl %ebx
; FALLBACK21-NEXT:    pushl %edi
; FALLBACK21-NEXT:    pushl %esi
; FALLBACK21-NEXT:    subl $44, %esp
; FALLBACK21-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; FALLBACK21-NEXT:    movl {{[0-9]+}}(%esp), %edx
; FALLBACK21-NEXT:    movups (%edx), %xmm0
; FALLBACK21-NEXT:    movzbl (%ecx), %edx
; FALLBACK21-NEXT:    movl %edx, %ecx
; FALLBACK21-NEXT:    shlb $3, %cl
; FALLBACK21-NEXT:    xorps %xmm1, %xmm1
; FALLBACK21-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
; FALLBACK21-NEXT:    movaps %xmm0, (%esp)
; FALLBACK21-NEXT:    andb $12, %dl
; FALLBACK21-NEXT:    movzbl %dl, %ebx
; FALLBACK21-NEXT:    movl 12(%esp,%ebx), %edx
; FALLBACK21-NEXT:    movl 8(%esp,%ebx), %ebp
; FALLBACK21-NEXT:    movl %ebp, %edi
; FALLBACK21-NEXT:    shrdl %cl, %edx, %edi
; FALLBACK21-NEXT:    movl (%esp,%ebx), %esi
; FALLBACK21-NEXT:    movl 4(%esp,%ebx), %eax
; FALLBACK21-NEXT:    movl %eax, %ebx
; FALLBACK21-NEXT:    shrdl %cl, %ebp, %ebx
; FALLBACK21-NEXT:    movl {{[0-9]+}}(%esp), %ebp
; FALLBACK21-NEXT:    movl %ebx, 4(%ebp)
; FALLBACK21-NEXT:    movl %edi, 8(%ebp)
; FALLBACK21-NEXT:    shrdl %cl, %eax, %esi
; FALLBACK21-NEXT:    shrl %cl, %edx
; FALLBACK21-NEXT:    movl %edx, 12(%ebp)
; FALLBACK21-NEXT:    movl %esi, (%ebp)
; FALLBACK21-NEXT:    addl $44, %esp
; FALLBACK21-NEXT:    popl %esi
; FALLBACK21-NEXT:    popl %edi
; FALLBACK21-NEXT:    popl %ebx
; FALLBACK21-NEXT:    popl %ebp
; FALLBACK21-NEXT:    retl
;
; FALLBACK22-LABEL: lshr_16bytes:
; FALLBACK22:       # %bb.0:
; FALLBACK22-NEXT:    pushl %ebp
; FALLBACK22-NEXT:    pushl %ebx
; FALLBACK22-NEXT:    pushl %edi
; FALLBACK22-NEXT:    pushl %esi
; FALLBACK22-NEXT:    subl $44, %esp
; FALLBACK22-NEXT:    movl {{[0-9]+}}(%esp), %eax
; FALLBACK22-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; FALLBACK22-NEXT:    movups (%ecx), %xmm0
; FALLBACK22-NEXT:    movzbl (%eax), %ecx
; FALLBACK22-NEXT:    movl %ecx, %eax
; FALLBACK22-NEXT:    shlb $3, %al
; FALLBACK22-NEXT:    xorps %xmm1, %xmm1
; FALLBACK22-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
; FALLBACK22-NEXT:    movaps %xmm0, (%esp)
; FALLBACK22-NEXT:    andb $12, %cl
; FALLBACK22-NEXT:    movzbl %cl, %edi
; FALLBACK22-NEXT:    shrxl %eax, (%esp,%edi), %ebx
; FALLBACK22-NEXT:    movl %eax, %ecx
; FALLBACK22-NEXT:    notb %cl
; FALLBACK22-NEXT:    movl 4(%esp,%edi), %ebp
; FALLBACK22-NEXT:    movl 8(%esp,%edi), %esi
; FALLBACK22-NEXT:    leal (%ebp,%ebp), %edx
; FALLBACK22-NEXT:    shlxl %ecx, %edx, %edx
; FALLBACK22-NEXT:    orl %ebx, %edx
; FALLBACK22-NEXT:    shrxl %eax, %esi, %ebx
; FALLBACK22-NEXT:    shrxl %eax, %ebp, %ebp
; FALLBACK22-NEXT:    movl 12(%esp,%edi), %edi
; FALLBACK22-NEXT:    shrxl %eax, %edi, %eax
; FALLBACK22-NEXT:    addl %edi, %edi
; FALLBACK22-NEXT:    shlxl %ecx, %edi, %edi
; FALLBACK22-NEXT:    orl %ebx, %edi
; FALLBACK22-NEXT:    addl %esi, %esi
; FALLBACK22-NEXT:    shlxl %ecx, %esi, %ecx
; FALLBACK22-NEXT:    orl %ebp, %ecx
; FALLBACK22-NEXT:    movl {{[0-9]+}}(%esp), %esi
; FALLBACK22-NEXT:    movl %eax, 12(%esi)
; FALLBACK22-NEXT:    movl %ecx, 4(%esi)
; FALLBACK22-NEXT:    movl %edi, 8(%esi)
; FALLBACK22-NEXT:    movl %edx, (%esi)
; FALLBACK22-NEXT:    addl $44, %esp
; FALLBACK22-NEXT:    popl %esi
; FALLBACK22-NEXT:    popl %edi
; FALLBACK22-NEXT:    popl %ebx
; FALLBACK22-NEXT:    popl %ebp
; FALLBACK22-NEXT:    retl
;
; FALLBACK23-LABEL: lshr_16bytes:
; FALLBACK23:       # %bb.0:
; FALLBACK23-NEXT:    pushl %ebp
; FALLBACK23-NEXT:    pushl %ebx
; FALLBACK23-NEXT:    pushl %edi
; FALLBACK23-NEXT:    pushl %esi
; FALLBACK23-NEXT:    subl $44, %esp
; FALLBACK23-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; FALLBACK23-NEXT:    movl {{[0-9]+}}(%esp), %edx
; FALLBACK23-NEXT:    movups (%edx), %xmm0
; FALLBACK23-NEXT:    movzbl (%ecx), %edx
; FALLBACK23-NEXT:    movl %edx, %ecx
; FALLBACK23-NEXT:    shlb $3, %cl
; FALLBACK23-NEXT:    xorps %xmm1, %xmm1
; FALLBACK23-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
; FALLBACK23-NEXT:    movaps %xmm0, (%esp)
; FALLBACK23-NEXT:    andb $12, %dl
; FALLBACK23-NEXT:    movzbl %dl, %ebx
; FALLBACK23-NEXT:    movl 12(%esp,%ebx), %edx
; FALLBACK23-NEXT:    movl 8(%esp,%ebx), %ebp
; FALLBACK23-NEXT:    movl %ebp, %edi
; FALLBACK23-NEXT:    shrdl %cl, %edx, %edi
; FALLBACK23-NEXT:    movl (%esp,%ebx), %esi
; FALLBACK23-NEXT:    movl 4(%esp,%ebx), %eax
; FALLBACK23-NEXT:    movl %eax, %ebx
; FALLBACK23-NEXT:    shrdl %cl, %ebp, %ebx
; FALLBACK23-NEXT:    movl {{[0-9]+}}(%esp), %ebp
; FALLBACK23-NEXT:    movl %ebx, 4(%ebp)
; FALLBACK23-NEXT:    movl %edi, 8(%ebp)
; FALLBACK23-NEXT:    shrxl %ecx, %edx, %edx
; FALLBACK23-NEXT:    movl %edx, 12(%ebp)
; FALLBACK23-NEXT:    # kill: def $cl killed $cl killed $ecx
; FALLBACK23-NEXT:    shrdl %cl, %eax, %esi
; FALLBACK23-NEXT:    movl %esi, (%ebp)
; FALLBACK23-NEXT:    addl $44, %esp
; FALLBACK23-NEXT:    popl %esi
; FALLBACK23-NEXT:    popl %edi
; FALLBACK23-NEXT:    popl %ebx
; FALLBACK23-NEXT:    popl %ebp
; FALLBACK23-NEXT:    retl
;
; FALLBACK24-LABEL: lshr_16bytes:
; FALLBACK24:       # %bb.0:
; FALLBACK24-NEXT:    pushl %ebp
; FALLBACK24-NEXT:    pushl %ebx
; FALLBACK24-NEXT:    pushl %edi
; FALLBACK24-NEXT:    pushl %esi
; FALLBACK24-NEXT:    subl $60, %esp
; FALLBACK24-NEXT:    movl {{[0-9]+}}(%esp), %eax
; FALLBACK24-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; FALLBACK24-NEXT:    vmovups (%ecx), %xmm0
; FALLBACK24-NEXT:    movzbl (%eax), %ecx
; FALLBACK24-NEXT:    movl %ecx, %eax
; FALLBACK24-NEXT:    shlb $3, %al
; FALLBACK24-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; FALLBACK24-NEXT:    vmovaps %xmm1, {{[0-9]+}}(%esp)
; FALLBACK24-NEXT:    vmovaps %xmm0, {{[0-9]+}}(%esp)
; FALLBACK24-NEXT:    andb $12, %cl
; FALLBACK24-NEXT:    movzbl %cl, %edi
; FALLBACK24-NEXT:    movl 16(%esp,%edi), %ebx
; FALLBACK24-NEXT:    movl 20(%esp,%edi), %esi
; FALLBACK24-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK24-NEXT:    movl %eax, %ecx
; FALLBACK24-NEXT:    shrl %cl, %ebx
; FALLBACK24-NEXT:    movl %eax, %edx
; FALLBACK24-NEXT:    notb %dl
; FALLBACK24-NEXT:    addl %esi, %esi
; FALLBACK24-NEXT:    movl %edx, %ecx
; FALLBACK24-NEXT:    shll %cl, %esi
; FALLBACK24-NEXT:    orl %ebx, %esi
; FALLBACK24-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK24-NEXT:    movl 24(%esp,%edi), %ebx
; FALLBACK24-NEXT:    movl %ebx, %esi
; FALLBACK24-NEXT:    movl %eax, %ecx
; FALLBACK24-NEXT:    shrl %cl, %esi
; FALLBACK24-NEXT:    movl 28(%esp,%edi), %edi
; FALLBACK24-NEXT:    leal (%edi,%edi), %ebp
; FALLBACK24-NEXT:    movl %edx, %ecx
; FALLBACK24-NEXT:    shll %cl, %ebp
; FALLBACK24-NEXT:    orl %esi, %ebp
; FALLBACK24-NEXT:    movl %eax, %ecx
; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; FALLBACK24-NEXT:    shrl %cl, %esi
; FALLBACK24-NEXT:    addl %ebx, %ebx
; FALLBACK24-NEXT:    movl %edx, %ecx
; FALLBACK24-NEXT:    shll %cl, %ebx
; FALLBACK24-NEXT:    orl %esi, %ebx
; FALLBACK24-NEXT:    movl {{[0-9]+}}(%esp), %edx
; FALLBACK24-NEXT:    movl %eax, %ecx
; FALLBACK24-NEXT:    shrl %cl, %edi
; FALLBACK24-NEXT:    movl %edi, 12(%edx)
; FALLBACK24-NEXT:    movl %ebx, 4(%edx)
; FALLBACK24-NEXT:    movl %ebp, 8(%edx)
; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK24-NEXT:    movl %eax, (%edx)
; FALLBACK24-NEXT:    addl $60, %esp
; FALLBACK24-NEXT:    popl %esi
; FALLBACK24-NEXT:    popl %edi
; FALLBACK24-NEXT:    popl %ebx
; FALLBACK24-NEXT:    popl %ebp
; FALLBACK24-NEXT:    retl
;
; FALLBACK25-LABEL: lshr_16bytes:
; FALLBACK25:       # %bb.0:
; FALLBACK25-NEXT:    pushl %ebp
; FALLBACK25-NEXT:    pushl %ebx
; FALLBACK25-NEXT:    pushl %edi
; FALLBACK25-NEXT:    pushl %esi
; FALLBACK25-NEXT:    subl $44, %esp
; FALLBACK25-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; FALLBACK25-NEXT:    movl {{[0-9]+}}(%esp), %edx
; FALLBACK25-NEXT:    vmovups (%edx), %xmm0
; FALLBACK25-NEXT:    movzbl (%ecx), %edx
; FALLBACK25-NEXT:    movl %edx, %ecx
; FALLBACK25-NEXT:    shlb $3, %cl
; FALLBACK25-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; FALLBACK25-NEXT:    vmovaps %xmm1, {{[0-9]+}}(%esp)
; FALLBACK25-NEXT:    vmovaps %xmm0, (%esp)
; FALLBACK25-NEXT:    andb $12, %dl
; FALLBACK25-NEXT:    movzbl %dl, %ebx
; FALLBACK25-NEXT:    movl 12(%esp,%ebx), %edx
; FALLBACK25-NEXT:    movl 8(%esp,%ebx), %ebp
; FALLBACK25-NEXT:    movl %ebp, %edi
; FALLBACK25-NEXT:    shrdl %cl, %edx, %edi
; FALLBACK25-NEXT:    movl (%esp,%ebx), %esi
; FALLBACK25-NEXT:    movl 4(%esp,%ebx), %eax
; FALLBACK25-NEXT:    movl %eax, %ebx
; FALLBACK25-NEXT:    shrdl %cl, %ebp, %ebx
; FALLBACK25-NEXT:    movl {{[0-9]+}}(%esp), %ebp
; FALLBACK25-NEXT:    movl %ebx, 4(%ebp)
; FALLBACK25-NEXT:    movl %edi, 8(%ebp)
; FALLBACK25-NEXT:    shrdl %cl, %eax, %esi
; FALLBACK25-NEXT:    shrl %cl, %edx
; FALLBACK25-NEXT:    movl %edx, 12(%ebp)
; FALLBACK25-NEXT:    movl %esi, (%ebp)
; FALLBACK25-NEXT:    addl $44, %esp
; FALLBACK25-NEXT:    popl %esi
; FALLBACK25-NEXT:    popl %edi
; FALLBACK25-NEXT:    popl %ebx
; FALLBACK25-NEXT:    popl %ebp
; FALLBACK25-NEXT:    retl
;
; FALLBACK26-LABEL: lshr_16bytes:
; FALLBACK26:       # %bb.0:
; FALLBACK26-NEXT:    pushl %ebp
; FALLBACK26-NEXT:    pushl %ebx
; FALLBACK26-NEXT:    pushl %edi
; FALLBACK26-NEXT:    pushl %esi
; FALLBACK26-NEXT:    subl $44, %esp
; FALLBACK26-NEXT:    movl {{[0-9]+}}(%esp), %eax
; FALLBACK26-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; FALLBACK26-NEXT:    vmovups (%ecx), %xmm0
; FALLBACK26-NEXT:    movzbl (%eax), %ecx
; FALLBACK26-NEXT:    movl %ecx, %eax
; FALLBACK26-NEXT:    shlb $3, %al
; FALLBACK26-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; FALLBACK26-NEXT:    vmovaps %xmm1, {{[0-9]+}}(%esp)
; FALLBACK26-NEXT:    vmovaps %xmm0, (%esp)
; FALLBACK26-NEXT:    andb $12, %cl
; FALLBACK26-NEXT:    movzbl %cl, %edi
; FALLBACK26-NEXT:    shrxl %eax, (%esp,%edi), %ebx
; FALLBACK26-NEXT:    movl %eax, %ecx
; FALLBACK26-NEXT:    notb %cl
; FALLBACK26-NEXT:    movl 4(%esp,%edi), %ebp
; FALLBACK26-NEXT:    movl 8(%esp,%edi), %esi
; FALLBACK26-NEXT:    leal (%ebp,%ebp), %edx
; FALLBACK26-NEXT:    shlxl %ecx, %edx, %edx
; FALLBACK26-NEXT:    orl %ebx, %edx
; FALLBACK26-NEXT:    shrxl %eax, %esi, %ebx
; FALLBACK26-NEXT:    shrxl %eax, %ebp, %ebp
; FALLBACK26-NEXT:    movl 12(%esp,%edi), %edi
; FALLBACK26-NEXT:    shrxl %eax, %edi, %eax
; FALLBACK26-NEXT:    addl %edi, %edi
; FALLBACK26-NEXT:    shlxl %ecx, %edi, %edi
; FALLBACK26-NEXT:    orl %ebx, %edi
; FALLBACK26-NEXT:    addl %esi, %esi
; FALLBACK26-NEXT:    shlxl %ecx, %esi, %ecx
; FALLBACK26-NEXT:    orl %ebp, %ecx
; FALLBACK26-NEXT:    movl {{[0-9]+}}(%esp), %esi
; FALLBACK26-NEXT:    movl %eax, 12(%esi)
; FALLBACK26-NEXT:    movl %ecx, 4(%esi)
; FALLBACK26-NEXT:    movl %edi, 8(%esi)
; FALLBACK26-NEXT:    movl %edx, (%esi)
; FALLBACK26-NEXT:    addl $44, %esp
; FALLBACK26-NEXT:    popl %esi
; FALLBACK26-NEXT:    popl %edi
; FALLBACK26-NEXT:    popl %ebx
; FALLBACK26-NEXT:    popl %ebp
; FALLBACK26-NEXT:    retl
;
; FALLBACK27-LABEL: lshr_16bytes:
; FALLBACK27:       # %bb.0:
; FALLBACK27-NEXT:    pushl %ebp
; FALLBACK27-NEXT:    pushl %ebx
; FALLBACK27-NEXT:    pushl %edi
; FALLBACK27-NEXT:    pushl %esi
; FALLBACK27-NEXT:    subl $44, %esp
; FALLBACK27-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; FALLBACK27-NEXT:    movl {{[0-9]+}}(%esp), %edx
; FALLBACK27-NEXT:    vmovups (%edx), %xmm0
; FALLBACK27-NEXT:    movzbl (%ecx), %edx
; FALLBACK27-NEXT:    movl %edx, %ecx
; FALLBACK27-NEXT:    shlb $3, %cl
; FALLBACK27-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; FALLBACK27-NEXT:    vmovaps %xmm1, {{[0-9]+}}(%esp)
; FALLBACK27-NEXT:    vmovaps %xmm0, (%esp)
; FALLBACK27-NEXT:    andb $12, %dl
; FALLBACK27-NEXT:    movzbl %dl, %ebx
; FALLBACK27-NEXT:    movl 12(%esp,%ebx), %edx
; FALLBACK27-NEXT:    movl 8(%esp,%ebx), %ebp
; FALLBACK27-NEXT:    movl %ebp, %edi
; FALLBACK27-NEXT:    shrdl %cl, %edx, %edi
; FALLBACK27-NEXT:    movl (%esp,%ebx), %esi
; FALLBACK27-NEXT:    movl 4(%esp,%ebx), %eax
; FALLBACK27-NEXT:    movl %eax, %ebx
; FALLBACK27-NEXT:    shrdl %cl, %ebp, %ebx
; FALLBACK27-NEXT:    movl {{[0-9]+}}(%esp), %ebp
; FALLBACK27-NEXT:    movl %ebx, 4(%ebp)
; FALLBACK27-NEXT:    movl %edi, 8(%ebp)
; FALLBACK27-NEXT:    shrxl %ecx, %edx, %edx
; FALLBACK27-NEXT:    movl %edx, 12(%ebp)
; FALLBACK27-NEXT:    # kill: def $cl killed $cl killed $ecx
; FALLBACK27-NEXT:    shrdl %cl, %eax, %esi
; FALLBACK27-NEXT:    movl %esi, (%ebp)
; FALLBACK27-NEXT:    addl $44, %esp
; FALLBACK27-NEXT:    popl %esi
; FALLBACK27-NEXT:    popl %edi
; FALLBACK27-NEXT:    popl %ebx
; FALLBACK27-NEXT:    popl %ebp
; FALLBACK27-NEXT:    retl
;
; FALLBACK28-LABEL: lshr_16bytes:
; FALLBACK28:       # %bb.0:
; FALLBACK28-NEXT:    pushl %ebp
; FALLBACK28-NEXT:    pushl %ebx
; FALLBACK28-NEXT:    pushl %edi
; FALLBACK28-NEXT:    pushl %esi
; FALLBACK28-NEXT:    subl $60, %esp
; FALLBACK28-NEXT:    movl {{[0-9]+}}(%esp), %eax
; FALLBACK28-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; FALLBACK28-NEXT:    vmovups (%ecx), %xmm0
; FALLBACK28-NEXT:    movzbl (%eax), %ecx
; FALLBACK28-NEXT:    movl %ecx, %eax
; FALLBACK28-NEXT:    shlb $3, %al
; FALLBACK28-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; FALLBACK28-NEXT:    vmovaps %xmm1, {{[0-9]+}}(%esp)
; FALLBACK28-NEXT:    vmovaps %xmm0, {{[0-9]+}}(%esp)
; FALLBACK28-NEXT:    andb $12, %cl
; FALLBACK28-NEXT:    movzbl %cl, %edi
; FALLBACK28-NEXT:    movl 16(%esp,%edi), %ebx
; FALLBACK28-NEXT:    movl 20(%esp,%edi), %esi
; FALLBACK28-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK28-NEXT:    movl %eax, %ecx
; FALLBACK28-NEXT:    shrl %cl, %ebx
; FALLBACK28-NEXT:    movl %eax, %edx
; FALLBACK28-NEXT:    notb %dl
; FALLBACK28-NEXT:    addl %esi, %esi
; FALLBACK28-NEXT:    movl %edx, %ecx
; FALLBACK28-NEXT:    shll %cl, %esi
; FALLBACK28-NEXT:    orl %ebx, %esi
; FALLBACK28-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK28-NEXT:    movl 24(%esp,%edi), %ebx
; FALLBACK28-NEXT:    movl %ebx, %esi
; FALLBACK28-NEXT:    movl %eax, %ecx
; FALLBACK28-NEXT:    shrl %cl, %esi
; FALLBACK28-NEXT:    movl 28(%esp,%edi), %edi
; FALLBACK28-NEXT:    leal (%edi,%edi), %ebp
; FALLBACK28-NEXT:    movl %edx, %ecx
; FALLBACK28-NEXT:    shll %cl, %ebp
; FALLBACK28-NEXT:    orl %esi, %ebp
; FALLBACK28-NEXT:    movl %eax, %ecx
; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; FALLBACK28-NEXT:    shrl %cl, %esi
; FALLBACK28-NEXT:    addl %ebx, %ebx
; FALLBACK28-NEXT:    movl %edx, %ecx
; FALLBACK28-NEXT:    shll %cl, %ebx
; FALLBACK28-NEXT:    orl %esi, %ebx
; FALLBACK28-NEXT:    movl {{[0-9]+}}(%esp), %edx
; FALLBACK28-NEXT:    movl %eax, %ecx
; FALLBACK28-NEXT:    shrl %cl, %edi
; FALLBACK28-NEXT:    movl %edi, 12(%edx)
; FALLBACK28-NEXT:    movl %ebx, 4(%edx)
; FALLBACK28-NEXT:    movl %ebp, 8(%edx)
; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK28-NEXT:    movl %eax, (%edx)
; FALLBACK28-NEXT:    addl $60, %esp
; FALLBACK28-NEXT:    popl %esi
; FALLBACK28-NEXT:    popl %edi
; FALLBACK28-NEXT:    popl %ebx
; FALLBACK28-NEXT:    popl %ebp
; FALLBACK28-NEXT:    retl
;
; FALLBACK29-LABEL: lshr_16bytes:
; FALLBACK29:       # %bb.0:
; FALLBACK29-NEXT:    pushl %ebp
; FALLBACK29-NEXT:    pushl %ebx
; FALLBACK29-NEXT:    pushl %edi
; FALLBACK29-NEXT:    pushl %esi
; FALLBACK29-NEXT:    subl $44, %esp
; FALLBACK29-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; FALLBACK29-NEXT:    movl {{[0-9]+}}(%esp), %edx
; FALLBACK29-NEXT:    vmovups (%edx), %xmm0
; FALLBACK29-NEXT:    movzbl (%ecx), %edx
; FALLBACK29-NEXT:    movl %edx, %ecx
; FALLBACK29-NEXT:    shlb $3, %cl
; FALLBACK29-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; FALLBACK29-NEXT:    vmovaps %xmm1, {{[0-9]+}}(%esp)
; FALLBACK29-NEXT:    vmovaps %xmm0, (%esp)
; FALLBACK29-NEXT:    andb $12, %dl
; FALLBACK29-NEXT:    movzbl %dl, %ebx
; FALLBACK29-NEXT:    movl 12(%esp,%ebx), %edx
; FALLBACK29-NEXT:    movl 8(%esp,%ebx), %ebp
; FALLBACK29-NEXT:    movl %ebp, %edi
; FALLBACK29-NEXT:    shrdl %cl, %edx, %edi
; FALLBACK29-NEXT:    movl (%esp,%ebx), %esi
; FALLBACK29-NEXT:    movl 4(%esp,%ebx), %eax
; FALLBACK29-NEXT:    movl %eax, %ebx
; FALLBACK29-NEXT:    shrdl %cl, %ebp, %ebx
; FALLBACK29-NEXT:    movl {{[0-9]+}}(%esp), %ebp
; FALLBACK29-NEXT:    movl %ebx, 4(%ebp)
; FALLBACK29-NEXT:    movl %edi, 8(%ebp)
; FALLBACK29-NEXT:    shrdl %cl, %eax, %esi
; FALLBACK29-NEXT:    shrl %cl, %edx
; FALLBACK29-NEXT:    movl %edx, 12(%ebp)
; FALLBACK29-NEXT:    movl %esi, (%ebp)
; FALLBACK29-NEXT:    addl $44, %esp
; FALLBACK29-NEXT:    popl %esi
; FALLBACK29-NEXT:    popl %edi
; FALLBACK29-NEXT:    popl %ebx
; FALLBACK29-NEXT:    popl %ebp
; FALLBACK29-NEXT:    retl
;
; FALLBACK30-LABEL: lshr_16bytes:
; FALLBACK30:       # %bb.0:
; FALLBACK30-NEXT:    pushl %ebp
; FALLBACK30-NEXT:    pushl %ebx
; FALLBACK30-NEXT:    pushl %edi
; FALLBACK30-NEXT:    pushl %esi
; FALLBACK30-NEXT:    subl $44, %esp
; FALLBACK30-NEXT:    movl {{[0-9]+}}(%esp), %eax
; FALLBACK30-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; FALLBACK30-NEXT:    vmovups (%ecx), %xmm0
; FALLBACK30-NEXT:    movzbl (%eax), %ecx
; FALLBACK30-NEXT:    movl %ecx, %eax
; FALLBACK30-NEXT:    shlb $3, %al
; FALLBACK30-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; FALLBACK30-NEXT:    vmovaps %xmm1, {{[0-9]+}}(%esp)
; FALLBACK30-NEXT:    vmovaps %xmm0, (%esp)
; FALLBACK30-NEXT:    andb $12, %cl
; FALLBACK30-NEXT:    movzbl %cl, %edi
; FALLBACK30-NEXT:    shrxl %eax, (%esp,%edi), %ebx
; FALLBACK30-NEXT:    movl %eax, %ecx
; FALLBACK30-NEXT:    notb %cl
; FALLBACK30-NEXT:    movl 4(%esp,%edi), %ebp
; FALLBACK30-NEXT:    movl 8(%esp,%edi), %esi
; FALLBACK30-NEXT:    leal (%ebp,%ebp), %edx
; FALLBACK30-NEXT:    shlxl %ecx, %edx, %edx
; FALLBACK30-NEXT:    orl %ebx, %edx
; FALLBACK30-NEXT:    shrxl %eax, %esi, %ebx
; FALLBACK30-NEXT:    shrxl %eax, %ebp, %ebp
; FALLBACK30-NEXT:    movl 12(%esp,%edi), %edi
; FALLBACK30-NEXT:    shrxl %eax, %edi, %eax
; FALLBACK30-NEXT:    addl %edi, %edi
; FALLBACK30-NEXT:    shlxl %ecx, %edi, %edi
; FALLBACK30-NEXT:    orl %ebx, %edi
; FALLBACK30-NEXT:    addl %esi, %esi
; FALLBACK30-NEXT:    shlxl %ecx, %esi, %ecx
; FALLBACK30-NEXT:    orl %ebp, %ecx
; FALLBACK30-NEXT:    movl {{[0-9]+}}(%esp), %esi
; FALLBACK30-NEXT:    movl %eax, 12(%esi)
; FALLBACK30-NEXT:    movl %ecx, 4(%esi)
; FALLBACK30-NEXT:    movl %edi, 8(%esi)
; FALLBACK30-NEXT:    movl %edx, (%esi)
; FALLBACK30-NEXT:    addl $44, %esp
; FALLBACK30-NEXT:    popl %esi
; FALLBACK30-NEXT:    popl %edi
; FALLBACK30-NEXT:    popl %ebx
; FALLBACK30-NEXT:    popl %ebp
; FALLBACK30-NEXT:    retl
;
; FALLBACK31-LABEL: lshr_16bytes:
; FALLBACK31:       # %bb.0:
; FALLBACK31-NEXT:    pushl %ebp
; FALLBACK31-NEXT:    pushl %ebx
; FALLBACK31-NEXT:    pushl %edi
; FALLBACK31-NEXT:    pushl %esi
; FALLBACK31-NEXT:    subl $44, %esp
; FALLBACK31-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; FALLBACK31-NEXT:    movl {{[0-9]+}}(%esp), %edx
; FALLBACK31-NEXT:    vmovups (%edx), %xmm0
; FALLBACK31-NEXT:    movzbl (%ecx), %edx
; FALLBACK31-NEXT:    movl %edx, %ecx
; FALLBACK31-NEXT:    shlb $3, %cl
; FALLBACK31-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; FALLBACK31-NEXT:    vmovaps %xmm1, {{[0-9]+}}(%esp)
; FALLBACK31-NEXT:    vmovaps %xmm0, (%esp)
; FALLBACK31-NEXT:    andb $12, %dl
; FALLBACK31-NEXT:    movzbl %dl, %ebx
; FALLBACK31-NEXT:    movl 12(%esp,%ebx), %edx
; FALLBACK31-NEXT:    movl 8(%esp,%ebx), %ebp
; FALLBACK31-NEXT:    movl %ebp, %edi
; FALLBACK31-NEXT:    shrdl %cl, %edx, %edi
; FALLBACK31-NEXT:    movl (%esp,%ebx), %esi
; FALLBACK31-NEXT:    movl 4(%esp,%ebx), %eax
; FALLBACK31-NEXT:    movl %eax, %ebx
; FALLBACK31-NEXT:    shrdl %cl, %ebp, %ebx
; FALLBACK31-NEXT:    movl {{[0-9]+}}(%esp), %ebp
; FALLBACK31-NEXT:    movl %ebx, 4(%ebp)
; FALLBACK31-NEXT:    movl %edi, 8(%ebp)
; FALLBACK31-NEXT:    shrxl %ecx, %edx, %edx
; FALLBACK31-NEXT:    movl %edx, 12(%ebp)
; FALLBACK31-NEXT:    # kill: def $cl killed $cl killed $ecx
; FALLBACK31-NEXT:    shrdl %cl, %eax, %esi
; FALLBACK31-NEXT:    movl %esi, (%ebp)
; FALLBACK31-NEXT:    addl $44, %esp
; FALLBACK31-NEXT:    popl %esi
; FALLBACK31-NEXT:    popl %edi
; FALLBACK31-NEXT:    popl %ebx
; FALLBACK31-NEXT:    popl %ebp
; FALLBACK31-NEXT:    retl
  %src = load i128, ptr %src.ptr, align 1
  %byteOff = load i128, ptr %byteOff.ptr, align 1
  %bitOff = shl i128 %byteOff, 3
  %res = lshr i128 %src, %bitOff
  store i128 %res, ptr %dst, align 1
  ret void
}

; Logical right shift of a 128-bit value where the shift amount is given in
; 32-bit (dword) units: the offset loaded from %dwordOff.ptr is scaled by 32
; (shl by 5) to form the bit count, the value loaded from %src.ptr is shifted
; right by it, and the result is stored to %dst. All three memory accesses
; use align 1.
;
; What the expected output below pins down:
;  * 64-bit configurations read only the low byte of the offset (movzbl),
;    scale it with an 8-bit shift by 5, and use a test of bit 6 plus a pair
;    of cmov instructions to choose between the two 64-bit halves.
;  * 32-bit configurations place the source in a stack slot together with a
;    zeroed 16-byte region, mask the offset to the range 0..3, and reload the
;    result from (%esp + 4 * offset). With SSE4.2 or AVX this reload is a
;    single unaligned 16-byte load; with SSE2 it is four 32-bit loads.
;
; NOTE(review): reading a single byte of the i128 offset is presumably valid
; because out-of-range shift amounts make the lshr result poison - confirm
; against the LLVM Language Reference.
define void @lshr_16bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nounwind {
; X64-NO-SHLD-NO-BMI2-LABEL: lshr_16bytes_dwordOff:
; X64-NO-SHLD-NO-BMI2:       # %bb.0:
; X64-NO-SHLD-NO-BMI2-NEXT:    movq (%rdi), %r8
; X64-NO-SHLD-NO-BMI2-NEXT:    movq 8(%rdi), %rdi
; X64-NO-SHLD-NO-BMI2-NEXT:    movzbl (%rsi), %eax
; X64-NO-SHLD-NO-BMI2-NEXT:    shlb $5, %al
; X64-NO-SHLD-NO-BMI2-NEXT:    movl %eax, %ecx
; X64-NO-SHLD-NO-BMI2-NEXT:    shrq %cl, %r8
; X64-NO-SHLD-NO-BMI2-NEXT:    leaq (%rdi,%rdi), %rsi
; X64-NO-SHLD-NO-BMI2-NEXT:    notb %cl
; X64-NO-SHLD-NO-BMI2-NEXT:    shlq %cl, %rsi
; X64-NO-SHLD-NO-BMI2-NEXT:    orq %r8, %rsi
; X64-NO-SHLD-NO-BMI2-NEXT:    movl %eax, %ecx
; X64-NO-SHLD-NO-BMI2-NEXT:    shrq %cl, %rdi
; X64-NO-SHLD-NO-BMI2-NEXT:    xorl %ecx, %ecx
; X64-NO-SHLD-NO-BMI2-NEXT:    testb $64, %al
; X64-NO-SHLD-NO-BMI2-NEXT:    cmovneq %rdi, %rsi
; X64-NO-SHLD-NO-BMI2-NEXT:    cmoveq %rdi, %rcx
; X64-NO-SHLD-NO-BMI2-NEXT:    movq %rcx, 8(%rdx)
; X64-NO-SHLD-NO-BMI2-NEXT:    movq %rsi, (%rdx)
; X64-NO-SHLD-NO-BMI2-NEXT:    retq
;
; X64-HAVE-SHLD-NO-BMI2-LABEL: lshr_16bytes_dwordOff:
; X64-HAVE-SHLD-NO-BMI2:       # %bb.0:
; X64-HAVE-SHLD-NO-BMI2-NEXT:    movq (%rdi), %rax
; X64-HAVE-SHLD-NO-BMI2-NEXT:    movq 8(%rdi), %rdi
; X64-HAVE-SHLD-NO-BMI2-NEXT:    movzbl (%rsi), %ecx
; X64-HAVE-SHLD-NO-BMI2-NEXT:    shlb $5, %cl
; X64-HAVE-SHLD-NO-BMI2-NEXT:    movq %rdi, %rsi
; X64-HAVE-SHLD-NO-BMI2-NEXT:    shrq %cl, %rsi
; X64-HAVE-SHLD-NO-BMI2-NEXT:    shrdq %cl, %rdi, %rax
; X64-HAVE-SHLD-NO-BMI2-NEXT:    xorl %edi, %edi
; X64-HAVE-SHLD-NO-BMI2-NEXT:    testb $64, %cl
; X64-HAVE-SHLD-NO-BMI2-NEXT:    cmovneq %rsi, %rax
; X64-HAVE-SHLD-NO-BMI2-NEXT:    cmoveq %rsi, %rdi
; X64-HAVE-SHLD-NO-BMI2-NEXT:    movq %rdi, 8(%rdx)
; X64-HAVE-SHLD-NO-BMI2-NEXT:    movq %rax, (%rdx)
; X64-HAVE-SHLD-NO-BMI2-NEXT:    retq
;
; X64-NO-SHLD-HAVE-BMI2-LABEL: lshr_16bytes_dwordOff:
; X64-NO-SHLD-HAVE-BMI2:       # %bb.0:
; X64-NO-SHLD-HAVE-BMI2-NEXT:    movq 8(%rdi), %rax
; X64-NO-SHLD-HAVE-BMI2-NEXT:    movzbl (%rsi), %ecx
; X64-NO-SHLD-HAVE-BMI2-NEXT:    shlb $5, %cl
; X64-NO-SHLD-HAVE-BMI2-NEXT:    shrxq %rcx, (%rdi), %rsi
; X64-NO-SHLD-HAVE-BMI2-NEXT:    movl %ecx, %edi
; X64-NO-SHLD-HAVE-BMI2-NEXT:    notb %dil
; X64-NO-SHLD-HAVE-BMI2-NEXT:    leaq (%rax,%rax), %r8
; X64-NO-SHLD-HAVE-BMI2-NEXT:    shlxq %rdi, %r8, %rdi
; X64-NO-SHLD-HAVE-BMI2-NEXT:    orq %rsi, %rdi
; X64-NO-SHLD-HAVE-BMI2-NEXT:    shrxq %rcx, %rax, %rax
; X64-NO-SHLD-HAVE-BMI2-NEXT:    xorl %esi, %esi
; X64-NO-SHLD-HAVE-BMI2-NEXT:    testb $64, %cl
; X64-NO-SHLD-HAVE-BMI2-NEXT:    cmovneq %rax, %rdi
; X64-NO-SHLD-HAVE-BMI2-NEXT:    cmoveq %rax, %rsi
; X64-NO-SHLD-HAVE-BMI2-NEXT:    movq %rsi, 8(%rdx)
; X64-NO-SHLD-HAVE-BMI2-NEXT:    movq %rdi, (%rdx)
; X64-NO-SHLD-HAVE-BMI2-NEXT:    retq
;
; X64-HAVE-SHLD-HAVE-BMI2-LABEL: lshr_16bytes_dwordOff:
; X64-HAVE-SHLD-HAVE-BMI2:       # %bb.0:
; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    movq (%rdi), %rax
; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    movq 8(%rdi), %rdi
; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    movzbl (%rsi), %ecx
; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    shlb $5, %cl
; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    shrdq %cl, %rdi, %rax
; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    shrxq %rcx, %rdi, %rsi
; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    xorl %edi, %edi
; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    testb $64, %cl
; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    cmovneq %rsi, %rax
; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    cmoveq %rsi, %rdi
; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    movq %rdi, 8(%rdx)
; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    movq %rax, (%rdx)
; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    retq
;
; X86-SSE2-LABEL: lshr_16bytes_dwordOff:
; X86-SSE2:       # %bb.0:
; X86-SSE2-NEXT:    pushl %ebx
; X86-SSE2-NEXT:    pushl %edi
; X86-SSE2-NEXT:    pushl %esi
; X86-SSE2-NEXT:    subl $32, %esp
; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-SSE2-NEXT:    movl (%edx), %esi
; X86-SSE2-NEXT:    movl 4(%edx), %edi
; X86-SSE2-NEXT:    movl 8(%edx), %ebx
; X86-SSE2-NEXT:    movl 12(%edx), %edx
; X86-SSE2-NEXT:    movzbl (%ecx), %ecx
; X86-SSE2-NEXT:    xorps %xmm0, %xmm0
; X86-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT:    movl %edi, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT:    movl %esi, (%esp)
; X86-SSE2-NEXT:    andl $3, %ecx
; X86-SSE2-NEXT:    movl (%esp,%ecx,4), %edx
; X86-SSE2-NEXT:    movl 4(%esp,%ecx,4), %esi
; X86-SSE2-NEXT:    movl 12(%esp,%ecx,4), %edi
; X86-SSE2-NEXT:    movl 8(%esp,%ecx,4), %ecx
; X86-SSE2-NEXT:    movl %ecx, 8(%eax)
; X86-SSE2-NEXT:    movl %edi, 12(%eax)
; X86-SSE2-NEXT:    movl %edx, (%eax)
; X86-SSE2-NEXT:    movl %esi, 4(%eax)
; X86-SSE2-NEXT:    addl $32, %esp
; X86-SSE2-NEXT:    popl %esi
; X86-SSE2-NEXT:    popl %edi
; X86-SSE2-NEXT:    popl %ebx
; X86-SSE2-NEXT:    retl
;
; X86-SSE42-LABEL: lshr_16bytes_dwordOff:
; X86-SSE42:       # %bb.0:
; X86-SSE42-NEXT:    subl $44, %esp
; X86-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-SSE42-NEXT:    movups (%edx), %xmm0
; X86-SSE42-NEXT:    movzbl (%ecx), %ecx
; X86-SSE42-NEXT:    xorps %xmm1, %xmm1
; X86-SSE42-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
; X86-SSE42-NEXT:    movaps %xmm0, (%esp)
; X86-SSE42-NEXT:    andl $3, %ecx
; X86-SSE42-NEXT:    movups (%esp,%ecx,4), %xmm0
; X86-SSE42-NEXT:    movups %xmm0, (%eax)
; X86-SSE42-NEXT:    addl $44, %esp
; X86-SSE42-NEXT:    retl
;
; X86-AVX-LABEL: lshr_16bytes_dwordOff:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    subl $44, %esp
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-AVX-NEXT:    vmovups (%edx), %xmm0
; X86-AVX-NEXT:    movzbl (%ecx), %ecx
; X86-AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X86-AVX-NEXT:    vmovaps %xmm1, {{[0-9]+}}(%esp)
; X86-AVX-NEXT:    vmovaps %xmm0, (%esp)
; X86-AVX-NEXT:    andl $3, %ecx
; X86-AVX-NEXT:    vmovups (%esp,%ecx,4), %xmm0
; X86-AVX-NEXT:    vmovups %xmm0, (%eax)
; X86-AVX-NEXT:    addl $44, %esp
; X86-AVX-NEXT:    retl
  ; Both operands are loaded as full 128-bit integers with byte alignment.
  %src = load i128, ptr %src.ptr, align 1
  %dwordOff = load i128, ptr %dwordOff.ptr, align 1
  ; Convert the dword count into a bit count (multiply by 32).
  %bitOff = shl i128 %dwordOff, 5
  ; Zero-filling right shift of the whole 128-bit value.
  %res = lshr i128 %src, %bitOff
  store i128 %res, ptr %dst, align 1
  ret void
}

define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-NO-SHLD-NO-BMI2-LABEL: shl_16bytes:
; X64-NO-SHLD-NO-BMI2:       # %bb.0:
; X64-NO-SHLD-NO-BMI2-NEXT:    movq (%rdi), %r8
; X64-NO-SHLD-NO-BMI2-NEXT:    movq 8(%rdi), %rdi
; X64-NO-SHLD-NO-BMI2-NEXT:    movzbl (%rsi), %eax
; X64-NO-SHLD-NO-BMI2-NEXT:    shlb $3, %al
; X64-NO-SHLD-NO-BMI2-NEXT:    movl %eax, %ecx
; X64-NO-SHLD-NO-BMI2-NEXT:    shlq %cl, %rdi
; X64-NO-SHLD-NO-BMI2-NEXT:    movq %r8, %rsi
; X64-NO-SHLD-NO-BMI2-NEXT:    shrq %rsi
; X64-NO-SHLD-NO-BMI2-NEXT:    notb %cl
; X64-NO-SHLD-NO-BMI2-NEXT:    shrq %cl, %rsi
; X64-NO-SHLD-NO-BMI2-NEXT:    orq %rdi, %rsi
; X64-NO-SHLD-NO-BMI2-NEXT:    movl %eax, %ecx
; X64-NO-SHLD-NO-BMI2-NEXT:    shlq %cl, %r8
; X64-NO-SHLD-NO-BMI2-NEXT:    xorl %ecx, %ecx
; X64-NO-SHLD-NO-BMI2-NEXT:    testb $64, %al
; X64-NO-SHLD-NO-BMI2-NEXT:    cmovneq %r8, %rsi
; X64-NO-SHLD-NO-BMI2-NEXT:    cmoveq %r8, %rcx
; X64-NO-SHLD-NO-BMI2-NEXT:    movq %rcx, (%rdx)
; X64-NO-SHLD-NO-BMI2-NEXT:    movq %rsi, 8(%rdx)
; X64-NO-SHLD-NO-BMI2-NEXT:    retq
;
; X64-HAVE-SHLD-NO-BMI2-LABEL: shl_16bytes:
; X64-HAVE-SHLD-NO-BMI2:       # %bb.0:
; X64-HAVE-SHLD-NO-BMI2-NEXT:    movq (%rdi), %rax
; X64-HAVE-SHLD-NO-BMI2-NEXT:    movq 8(%rdi), %rdi
; X64-HAVE-SHLD-NO-BMI2-NEXT:    movzbl (%rsi), %ecx
; X64-HAVE-SHLD-NO-BMI2-NEXT:    shlb $3, %cl
; X64-HAVE-SHLD-NO-BMI2-NEXT:    movq %rax, %rsi
; X64-HAVE-SHLD-NO-BMI2-NEXT:    shlq %cl, %rsi
; X64-HAVE-SHLD-NO-BMI2-NEXT:    shldq %cl, %rax, %rdi
; X64-HAVE-SHLD-NO-BMI2-NEXT:    xorl %eax, %eax
; X64-HAVE-SHLD-NO-BMI2-NEXT:    testb $64, %cl
; X64-HAVE-SHLD-NO-BMI2-NEXT:    cmovneq %rsi, %rdi
; X64-HAVE-SHLD-NO-BMI2-NEXT:    cmoveq %rsi, %rax
; X64-HAVE-SHLD-NO-BMI2-NEXT:    movq %rdi, 8(%rdx)
; X64-HAVE-SHLD-NO-BMI2-NEXT:    movq %rax, (%rdx)
; X64-HAVE-SHLD-NO-BMI2-NEXT:    retq
;
; X64-NO-SHLD-HAVE-BMI2-LABEL: shl_16bytes:
; X64-NO-SHLD-HAVE-BMI2:       # %bb.0:
; X64-NO-SHLD-HAVE-BMI2-NEXT:    movq (%rdi), %rax
; X64-NO-SHLD-HAVE-BMI2-NEXT:    movzbl (%rsi), %ecx
; X64-NO-SHLD-HAVE-BMI2-NEXT:    shlb $3, %cl
; X64-NO-SHLD-HAVE-BMI2-NEXT:    shlxq %rcx, 8(%rdi), %rsi
; X64-NO-SHLD-HAVE-BMI2-NEXT:    movl %ecx, %edi
; X64-NO-SHLD-HAVE-BMI2-NEXT:    notb %dil
; X64-NO-SHLD-HAVE-BMI2-NEXT:    shlxq %rcx, %rax, %r8
; X64-NO-SHLD-HAVE-BMI2-NEXT:    shrq %rax
; X64-NO-SHLD-HAVE-BMI2-NEXT:    shrxq %rdi, %rax, %rax
; X64-NO-SHLD-HAVE-BMI2-NEXT:    orq %rsi, %rax
; X64-NO-SHLD-HAVE-BMI2-NEXT:    xorl %esi, %esi
; X64-NO-SHLD-HAVE-BMI2-NEXT:    testb $64, %cl
; X64-NO-SHLD-HAVE-BMI2-NEXT:    cmovneq %r8, %rax
; X64-NO-SHLD-HAVE-BMI2-NEXT:    cmoveq %r8, %rsi
; X64-NO-SHLD-HAVE-BMI2-NEXT:    movq %rsi, (%rdx)
; X64-NO-SHLD-HAVE-BMI2-NEXT:    movq %rax, 8(%rdx)
; X64-NO-SHLD-HAVE-BMI2-NEXT:    retq
;
; X64-HAVE-SHLD-HAVE-BMI2-LABEL: shl_16bytes:
; X64-HAVE-SHLD-HAVE-BMI2:       # %bb.0:
; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    movq (%rdi), %rax
; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    movq 8(%rdi), %rdi
; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    movzbl (%rsi), %ecx
; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    shlb $3, %cl
; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    shldq %cl, %rax, %rdi
; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    shlxq %rcx, %rax, %rax
; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    xorl %esi, %esi
; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    testb $64, %cl
; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    cmovneq %rax, %rdi
; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    cmoveq %rax, %rsi
; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    movq %rdi, 8(%rdx)
; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    movq %rsi, (%rdx)
; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    retq
;
; FALLBACK16-LABEL: shl_16bytes:
; FALLBACK16:       # %bb.0:
; FALLBACK16-NEXT:    pushl %ebp
; FALLBACK16-NEXT:    pushl %ebx
; FALLBACK16-NEXT:    pushl %edi
; FALLBACK16-NEXT:    pushl %esi
; FALLBACK16-NEXT:    subl $60, %esp
; FALLBACK16-NEXT:    movl {{[0-9]+}}(%esp), %eax
; FALLBACK16-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; FALLBACK16-NEXT:    movl (%ecx), %ebx
; FALLBACK16-NEXT:    movl 4(%ecx), %esi
; FALLBACK16-NEXT:    movl 8(%ecx), %edi
; FALLBACK16-NEXT:    movl 12(%ecx), %ecx
; FALLBACK16-NEXT:    movb (%eax), %ah
; FALLBACK16-NEXT:    movb %ah, %dh
; FALLBACK16-NEXT:    shlb $3, %dh
; FALLBACK16-NEXT:    xorps %xmm0, %xmm0
; FALLBACK16-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK16-NEXT:    movl %edi, {{[0-9]+}}(%esp)
; FALLBACK16-NEXT:    movl %esi, {{[0-9]+}}(%esp)
; FALLBACK16-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
; FALLBACK16-NEXT:    andb $12, %ah
; FALLBACK16-NEXT:    negb %ah
; FALLBACK16-NEXT:    movsbl %ah, %ebp
; FALLBACK16-NEXT:    movl 32(%esp,%ebp), %ebx
; FALLBACK16-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK16-NEXT:    movl 36(%esp,%ebp), %esi
; FALLBACK16-NEXT:    movl %esi, %edi
; FALLBACK16-NEXT:    movb %dh, %cl
; FALLBACK16-NEXT:    shll %cl, %edi
; FALLBACK16-NEXT:    movb %dh, %dl
; FALLBACK16-NEXT:    notb %dl
; FALLBACK16-NEXT:    shrl %ebx
; FALLBACK16-NEXT:    movl %edx, %ecx
; FALLBACK16-NEXT:    shrl %cl, %ebx
; FALLBACK16-NEXT:    orl %edi, %ebx
; FALLBACK16-NEXT:    movl 44(%esp,%ebp), %eax
; FALLBACK16-NEXT:    movb %dh, %cl
; FALLBACK16-NEXT:    shll %cl, %eax
; FALLBACK16-NEXT:    movl 40(%esp,%ebp), %edi
; FALLBACK16-NEXT:    movl %edi, %ebp
; FALLBACK16-NEXT:    shrl %ebp
; FALLBACK16-NEXT:    movl %edx, %ecx
; FALLBACK16-NEXT:    shrl %cl, %ebp
; FALLBACK16-NEXT:    orl %eax, %ebp
; FALLBACK16-NEXT:    movb %dh, %cl
; FALLBACK16-NEXT:    shll %cl, %edi
; FALLBACK16-NEXT:    shrl %esi
; FALLBACK16-NEXT:    movl %edx, %ecx
; FALLBACK16-NEXT:    shrl %cl, %esi
; FALLBACK16-NEXT:    orl %edi, %esi
; FALLBACK16-NEXT:    movl {{[0-9]+}}(%esp), %eax
; FALLBACK16-NEXT:    movb %dh, %cl
; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; FALLBACK16-NEXT:    shll %cl, %edx
; FALLBACK16-NEXT:    movl %edx, (%eax)
; FALLBACK16-NEXT:    movl %esi, 8(%eax)
; FALLBACK16-NEXT:    movl %ebp, 12(%eax)
; FALLBACK16-NEXT:    movl %ebx, 4(%eax)
; FALLBACK16-NEXT:    addl $60, %esp
; FALLBACK16-NEXT:    popl %esi
; FALLBACK16-NEXT:    popl %edi
; FALLBACK16-NEXT:    popl %ebx
; FALLBACK16-NEXT:    popl %ebp
; FALLBACK16-NEXT:    retl
;
; FALLBACK17-LABEL: shl_16bytes:
; FALLBACK17:       # %bb.0:
; FALLBACK17-NEXT:    pushl %ebx
; FALLBACK17-NEXT:    pushl %edi
; FALLBACK17-NEXT:    pushl %esi
; FALLBACK17-NEXT:    subl $32, %esp
; FALLBACK17-NEXT:    movl {{[0-9]+}}(%esp), %eax
; FALLBACK17-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; FALLBACK17-NEXT:    movl {{[0-9]+}}(%esp), %edx
; FALLBACK17-NEXT:    movl (%edx), %esi
; FALLBACK17-NEXT:    movl 4(%edx), %edi
; FALLBACK17-NEXT:    movl 8(%edx), %ebx
; FALLBACK17-NEXT:    movl 12(%edx), %edx
; FALLBACK17-NEXT:    movb (%ecx), %ch
; FALLBACK17-NEXT:    movb %ch, %cl
; FALLBACK17-NEXT:    shlb $3, %cl
; FALLBACK17-NEXT:    xorps %xmm0, %xmm0
; FALLBACK17-NEXT:    movaps %xmm0, (%esp)
; FALLBACK17-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK17-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
; FALLBACK17-NEXT:    movl %edi, {{[0-9]+}}(%esp)
; FALLBACK17-NEXT:    movl %esi, {{[0-9]+}}(%esp)
; FALLBACK17-NEXT:    andb $12, %ch
; FALLBACK17-NEXT:    negb %ch
; FALLBACK17-NEXT:    movsbl %ch, %edi
; FALLBACK17-NEXT:    movl 24(%esp,%edi), %esi
; FALLBACK17-NEXT:    movl 28(%esp,%edi), %edx
; FALLBACK17-NEXT:    shldl %cl, %esi, %edx
; FALLBACK17-NEXT:    movl 16(%esp,%edi), %ebx
; FALLBACK17-NEXT:    movl 20(%esp,%edi), %edi
; FALLBACK17-NEXT:    shldl %cl, %edi, %esi
; FALLBACK17-NEXT:    shldl %cl, %ebx, %edi
; FALLBACK17-NEXT:    shll %cl, %ebx
; FALLBACK17-NEXT:    movl %esi, 8(%eax)
; FALLBACK17-NEXT:    movl %edx, 12(%eax)
; FALLBACK17-NEXT:    movl %ebx, (%eax)
; FALLBACK17-NEXT:    movl %edi, 4(%eax)
; FALLBACK17-NEXT:    addl $32, %esp
; FALLBACK17-NEXT:    popl %esi
; FALLBACK17-NEXT:    popl %edi
; FALLBACK17-NEXT:    popl %ebx
; FALLBACK17-NEXT:    retl
;
; FALLBACK18-LABEL: shl_16bytes:
; FALLBACK18:       # %bb.0:
; FALLBACK18-NEXT:    pushl %ebp
; FALLBACK18-NEXT:    pushl %ebx
; FALLBACK18-NEXT:    pushl %edi
; FALLBACK18-NEXT:    pushl %esi
; FALLBACK18-NEXT:    subl $44, %esp
; FALLBACK18-NEXT:    movl {{[0-9]+}}(%esp), %eax
; FALLBACK18-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; FALLBACK18-NEXT:    movl (%ecx), %edx
; FALLBACK18-NEXT:    movl 4(%ecx), %esi
; FALLBACK18-NEXT:    movl 8(%ecx), %edi
; FALLBACK18-NEXT:    movl 12(%ecx), %ecx
; FALLBACK18-NEXT:    movzbl (%eax), %eax
; FALLBACK18-NEXT:    movl %eax, %ebx
; FALLBACK18-NEXT:    shlb $3, %bl
; FALLBACK18-NEXT:    xorps %xmm0, %xmm0
; FALLBACK18-NEXT:    movaps %xmm0, (%esp)
; FALLBACK18-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK18-NEXT:    movl %edi, {{[0-9]+}}(%esp)
; FALLBACK18-NEXT:    movl %esi, {{[0-9]+}}(%esp)
; FALLBACK18-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK18-NEXT:    andb $12, %al
; FALLBACK18-NEXT:    negb %al
; FALLBACK18-NEXT:    movsbl %al, %edx
; FALLBACK18-NEXT:    movl 16(%esp,%edx), %edi
; FALLBACK18-NEXT:    movl 20(%esp,%edx), %ecx
; FALLBACK18-NEXT:    shlxl %ebx, %ecx, %esi
; FALLBACK18-NEXT:    shlxl %ebx, %edi, %ebp
; FALLBACK18-NEXT:    movl %ebx, %eax
; FALLBACK18-NEXT:    notb %al
; FALLBACK18-NEXT:    shrl %edi
; FALLBACK18-NEXT:    shrxl %eax, %edi, %edi
; FALLBACK18-NEXT:    orl %esi, %edi
; FALLBACK18-NEXT:    shlxl %ebx, 28(%esp,%edx), %esi
; FALLBACK18-NEXT:    movl 24(%esp,%edx), %edx
; FALLBACK18-NEXT:    shlxl %ebx, %edx, %ebx
; FALLBACK18-NEXT:    shrl %edx
; FALLBACK18-NEXT:    shrxl %eax, %edx, %edx
; FALLBACK18-NEXT:    orl %esi, %edx
; FALLBACK18-NEXT:    shrl %ecx
; FALLBACK18-NEXT:    shrxl %eax, %ecx, %eax
; FALLBACK18-NEXT:    orl %ebx, %eax
; FALLBACK18-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; FALLBACK18-NEXT:    movl %ebp, (%ecx)
; FALLBACK18-NEXT:    movl %eax, 8(%ecx)
; FALLBACK18-NEXT:    movl %edx, 12(%ecx)
; FALLBACK18-NEXT:    movl %edi, 4(%ecx)
; FALLBACK18-NEXT:    addl $44, %esp
; FALLBACK18-NEXT:    popl %esi
; FALLBACK18-NEXT:    popl %edi
; FALLBACK18-NEXT:    popl %ebx
; FALLBACK18-NEXT:    popl %ebp
; FALLBACK18-NEXT:    retl
;
; FALLBACK19-LABEL: shl_16bytes:
; FALLBACK19:       # %bb.0:
; FALLBACK19-NEXT:    pushl %ebp
; FALLBACK19-NEXT:    pushl %ebx
; FALLBACK19-NEXT:    pushl %edi
; FALLBACK19-NEXT:    pushl %esi
; FALLBACK19-NEXT:    subl $44, %esp
; FALLBACK19-NEXT:    movl {{[0-9]+}}(%esp), %ebp
; FALLBACK19-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; FALLBACK19-NEXT:    movl {{[0-9]+}}(%esp), %edx
; FALLBACK19-NEXT:    movl (%edx), %esi
; FALLBACK19-NEXT:    movl 4(%edx), %edi
; FALLBACK19-NEXT:    movl 8(%edx), %ebx
; FALLBACK19-NEXT:    movl 12(%edx), %edx
; FALLBACK19-NEXT:    movzbl (%ecx), %eax
; FALLBACK19-NEXT:    movl %eax, %ecx
; FALLBACK19-NEXT:    shlb $3, %cl
; FALLBACK19-NEXT:    xorps %xmm0, %xmm0
; FALLBACK19-NEXT:    movaps %xmm0, (%esp)
; FALLBACK19-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK19-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
; FALLBACK19-NEXT:    movl %edi, {{[0-9]+}}(%esp)
; FALLBACK19-NEXT:    movl %esi, {{[0-9]+}}(%esp)
; FALLBACK19-NEXT:    andb $12, %al
; FALLBACK19-NEXT:    negb %al
; FALLBACK19-NEXT:    movsbl %al, %eax
; FALLBACK19-NEXT:    movl 24(%esp,%eax), %esi
; FALLBACK19-NEXT:    movl 28(%esp,%eax), %edx
; FALLBACK19-NEXT:    shldl %cl, %esi, %edx
; FALLBACK19-NEXT:    movl 16(%esp,%eax), %edi
; FALLBACK19-NEXT:    movl 20(%esp,%eax), %eax
; FALLBACK19-NEXT:    shldl %cl, %eax, %esi
; FALLBACK19-NEXT:    shldl %cl, %edi, %eax
; FALLBACK19-NEXT:    shlxl %ecx, %edi, %ecx
; FALLBACK19-NEXT:    movl %esi, 8(%ebp)
; FALLBACK19-NEXT:    movl %edx, 12(%ebp)
; FALLBACK19-NEXT:    movl %ecx, (%ebp)
; FALLBACK19-NEXT:    movl %eax, 4(%ebp)
; FALLBACK19-NEXT:    addl $44, %esp
; FALLBACK19-NEXT:    popl %esi
; FALLBACK19-NEXT:    popl %edi
; FALLBACK19-NEXT:    popl %ebx
; FALLBACK19-NEXT:    popl %ebp
; FALLBACK19-NEXT:    retl
;
; FALLBACK20-LABEL: shl_16bytes:
; FALLBACK20:       # %bb.0:
; FALLBACK20-NEXT:    pushl %ebp
; FALLBACK20-NEXT:    pushl %ebx
; FALLBACK20-NEXT:    pushl %edi
; FALLBACK20-NEXT:    pushl %esi
; FALLBACK20-NEXT:    subl $60, %esp
; FALLBACK20-NEXT:    movl {{[0-9]+}}(%esp), %eax
; FALLBACK20-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; FALLBACK20-NEXT:    movups (%ecx), %xmm0
; FALLBACK20-NEXT:    movzbl (%eax), %ecx
; FALLBACK20-NEXT:    movl %ecx, %eax
; FALLBACK20-NEXT:    shlb $3, %al
; FALLBACK20-NEXT:    xorps %xmm1, %xmm1
; FALLBACK20-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
; FALLBACK20-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
; FALLBACK20-NEXT:    andb $12, %cl
; FALLBACK20-NEXT:    negb %cl
; FALLBACK20-NEXT:    movsbl %cl, %edi
; FALLBACK20-NEXT:    movl 44(%esp,%edi), %ebx
; FALLBACK20-NEXT:    movl %eax, %ecx
; FALLBACK20-NEXT:    shll %cl, %ebx
; FALLBACK20-NEXT:    movl %eax, %edx
; FALLBACK20-NEXT:    notb %dl
; FALLBACK20-NEXT:    movl 40(%esp,%edi), %ebp
; FALLBACK20-NEXT:    movl %ebp, %esi
; FALLBACK20-NEXT:    shrl %esi
; FALLBACK20-NEXT:    movl %edx, %ecx
; FALLBACK20-NEXT:    shrl %cl, %esi
; FALLBACK20-NEXT:    orl %ebx, %esi
; FALLBACK20-NEXT:    movl %eax, %ecx
; FALLBACK20-NEXT:    shll %cl, %ebp
; FALLBACK20-NEXT:    movl 32(%esp,%edi), %ecx
; FALLBACK20-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK20-NEXT:    movl 36(%esp,%edi), %ebx
; FALLBACK20-NEXT:    movl %ebx, %edi
; FALLBACK20-NEXT:    shrl %edi
; FALLBACK20-NEXT:    movl %edx, %ecx
; FALLBACK20-NEXT:    shrl %cl, %edi
; FALLBACK20-NEXT:    orl %ebp, %edi
; FALLBACK20-NEXT:    movl %eax, %ecx
; FALLBACK20-NEXT:    shll %cl, %ebx
; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
; FALLBACK20-NEXT:    shrl %ebp
; FALLBACK20-NEXT:    movl %edx, %ecx
; FALLBACK20-NEXT:    shrl %cl, %ebp
; FALLBACK20-NEXT:    orl %ebx, %ebp
; FALLBACK20-NEXT:    movl {{[0-9]+}}(%esp), %edx
; FALLBACK20-NEXT:    movl %eax, %ecx
; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK20-NEXT:    shll %cl, %eax
; FALLBACK20-NEXT:    movl %eax, (%edx)
; FALLBACK20-NEXT:    movl %ebp, 4(%edx)
; FALLBACK20-NEXT:    movl %edi, 8(%edx)
; FALLBACK20-NEXT:    movl %esi, 12(%edx)
; FALLBACK20-NEXT:    addl $60, %esp
; FALLBACK20-NEXT:    popl %esi
; FALLBACK20-NEXT:    popl %edi
; FALLBACK20-NEXT:    popl %ebx
; FALLBACK20-NEXT:    popl %ebp
; FALLBACK20-NEXT:    retl
;
; FALLBACK21-LABEL: shl_16bytes:
; FALLBACK21:       # %bb.0:
; FALLBACK21-NEXT:    pushl %ebp
; FALLBACK21-NEXT:    pushl %ebx
; FALLBACK21-NEXT:    pushl %edi
; FALLBACK21-NEXT:    pushl %esi
; FALLBACK21-NEXT:    subl $44, %esp
; FALLBACK21-NEXT:    movl {{[0-9]+}}(%esp), %eax
; FALLBACK21-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; FALLBACK21-NEXT:    movl {{[0-9]+}}(%esp), %edx
; FALLBACK21-NEXT:    movups (%edx), %xmm0
; FALLBACK21-NEXT:    movzbl (%ecx), %edx
; FALLBACK21-NEXT:    movl %edx, %ecx
; FALLBACK21-NEXT:    shlb $3, %cl
; FALLBACK21-NEXT:    xorps %xmm1, %xmm1
; FALLBACK21-NEXT:    movaps %xmm1, (%esp)
; FALLBACK21-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
; FALLBACK21-NEXT:    andb $12, %dl
; FALLBACK21-NEXT:    negb %dl
; FALLBACK21-NEXT:    movsbl %dl, %edi
; FALLBACK21-NEXT:    movl 24(%esp,%edi), %esi
; FALLBACK21-NEXT:    movl 28(%esp,%edi), %edx
; FALLBACK21-NEXT:    shldl %cl, %esi, %edx
; FALLBACK21-NEXT:    movl 16(%esp,%edi), %ebx
; FALLBACK21-NEXT:    movl 20(%esp,%edi), %edi
; FALLBACK21-NEXT:    shldl %cl, %edi, %esi
; FALLBACK21-NEXT:    movl %ebx, %ebp
; FALLBACK21-NEXT:    shll %cl, %ebp
; FALLBACK21-NEXT:    shldl %cl, %ebx, %edi
; FALLBACK21-NEXT:    movl %edi, 4(%eax)
; FALLBACK21-NEXT:    movl %esi, 8(%eax)
; FALLBACK21-NEXT:    movl %edx, 12(%eax)
; FALLBACK21-NEXT:    movl %ebp, (%eax)
; FALLBACK21-NEXT:    addl $44, %esp
; FALLBACK21-NEXT:    popl %esi
; FALLBACK21-NEXT:    popl %edi
; FALLBACK21-NEXT:    popl %ebx
; FALLBACK21-NEXT:    popl %ebp
; FALLBACK21-NEXT:    retl
;
; FALLBACK22-LABEL: shl_16bytes:
; FALLBACK22:       # %bb.0:
; FALLBACK22-NEXT:    pushl %ebp
; FALLBACK22-NEXT:    pushl %ebx
; FALLBACK22-NEXT:    pushl %edi
; FALLBACK22-NEXT:    pushl %esi
; FALLBACK22-NEXT:    subl $44, %esp
; FALLBACK22-NEXT:    movl {{[0-9]+}}(%esp), %eax
; FALLBACK22-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; FALLBACK22-NEXT:    movups (%ecx), %xmm0
; FALLBACK22-NEXT:    movzbl (%eax), %ecx
; FALLBACK22-NEXT:    movl %ecx, %eax
; FALLBACK22-NEXT:    shlb $3, %al
; FALLBACK22-NEXT:    xorps %xmm1, %xmm1
; FALLBACK22-NEXT:    movaps %xmm1, (%esp)
; FALLBACK22-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
; FALLBACK22-NEXT:    andb $12, %cl
; FALLBACK22-NEXT:    negb %cl
; FALLBACK22-NEXT:    movsbl %cl, %ecx
; FALLBACK22-NEXT:    shlxl %eax, 28(%esp,%ecx), %esi
; FALLBACK22-NEXT:    movl 24(%esp,%ecx), %edx
; FALLBACK22-NEXT:    shlxl %eax, %edx, %edi
; FALLBACK22-NEXT:    movl %eax, %ebx
; FALLBACK22-NEXT:    notb %bl
; FALLBACK22-NEXT:    shrl %edx
; FALLBACK22-NEXT:    shrxl %ebx, %edx, %edx
; FALLBACK22-NEXT:    orl %esi, %edx
; FALLBACK22-NEXT:    movl 20(%esp,%ecx), %esi
; FALLBACK22-NEXT:    movl %esi, %ebp
; FALLBACK22-NEXT:    shrl %ebp
; FALLBACK22-NEXT:    shrxl %ebx, %ebp, %ebp
; FALLBACK22-NEXT:    orl %edi, %ebp
; FALLBACK22-NEXT:    shlxl %eax, %esi, %esi
; FALLBACK22-NEXT:    movl 16(%esp,%ecx), %ecx
; FALLBACK22-NEXT:    shlxl %eax, %ecx, %eax
; FALLBACK22-NEXT:    shrl %ecx
; FALLBACK22-NEXT:    shrxl %ebx, %ecx, %ecx
; FALLBACK22-NEXT:    orl %esi, %ecx
; FALLBACK22-NEXT:    movl {{[0-9]+}}(%esp), %esi
; FALLBACK22-NEXT:    movl %eax, (%esi)
; FALLBACK22-NEXT:    movl %ecx, 4(%esi)
; FALLBACK22-NEXT:    movl %ebp, 8(%esi)
; FALLBACK22-NEXT:    movl %edx, 12(%esi)
; FALLBACK22-NEXT:    addl $44, %esp
; FALLBACK22-NEXT:    popl %esi
; FALLBACK22-NEXT:    popl %edi
; FALLBACK22-NEXT:    popl %ebx
; FALLBACK22-NEXT:    popl %ebp
; FALLBACK22-NEXT:    retl
;
; FALLBACK23-LABEL: shl_16bytes:
; FALLBACK23:       # %bb.0:
; FALLBACK23-NEXT:    pushl %ebp
; FALLBACK23-NEXT:    pushl %ebx
; FALLBACK23-NEXT:    pushl %edi
; FALLBACK23-NEXT:    pushl %esi
; FALLBACK23-NEXT:    subl $44, %esp
; FALLBACK23-NEXT:    movl {{[0-9]+}}(%esp), %eax
; FALLBACK23-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; FALLBACK23-NEXT:    movl {{[0-9]+}}(%esp), %edx
; FALLBACK23-NEXT:    movups (%edx), %xmm0
; FALLBACK23-NEXT:    movzbl (%ecx), %edx
; FALLBACK23-NEXT:    movl %edx, %ecx
; FALLBACK23-NEXT:    shlb $3, %cl
; FALLBACK23-NEXT:    xorps %xmm1, %xmm1
; FALLBACK23-NEXT:    movaps %xmm1, (%esp)
; FALLBACK23-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
; FALLBACK23-NEXT:    andb $12, %dl
; FALLBACK23-NEXT:    negb %dl
; FALLBACK23-NEXT:    movsbl %dl, %edi
; FALLBACK23-NEXT:    movl 24(%esp,%edi), %esi
; FALLBACK23-NEXT:    movl 28(%esp,%edi), %edx
; FALLBACK23-NEXT:    shldl %cl, %esi, %edx
; FALLBACK23-NEXT:    movl 16(%esp,%edi), %ebx
; FALLBACK23-NEXT:    movl 20(%esp,%edi), %edi
; FALLBACK23-NEXT:    shldl %cl, %edi, %esi
; FALLBACK23-NEXT:    shlxl %ecx, %ebx, %ebp
; FALLBACK23-NEXT:    # kill: def $cl killed $cl killed $ecx
; FALLBACK23-NEXT:    shldl %cl, %ebx, %edi
; FALLBACK23-NEXT:    movl %edi, 4(%eax)
; FALLBACK23-NEXT:    movl %esi, 8(%eax)
; FALLBACK23-NEXT:    movl %edx, 12(%eax)
; FALLBACK23-NEXT:    movl %ebp, (%eax)
; FALLBACK23-NEXT:    addl $44, %esp
; FALLBACK23-NEXT:    popl %esi
; FALLBACK23-NEXT:    popl %edi
; FALLBACK23-NEXT:    popl %ebx
; FALLBACK23-NEXT:    popl %ebp
; FALLBACK23-NEXT:    retl
;
; FALLBACK24-LABEL: shl_16bytes:
; FALLBACK24:       # %bb.0:
; FALLBACK24-NEXT:    pushl %ebp
; FALLBACK24-NEXT:    pushl %ebx
; FALLBACK24-NEXT:    pushl %edi
; FALLBACK24-NEXT:    pushl %esi
; FALLBACK24-NEXT:    subl $60, %esp
; FALLBACK24-NEXT:    movl {{[0-9]+}}(%esp), %eax
; FALLBACK24-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; FALLBACK24-NEXT:    vmovups (%ecx), %xmm0
; FALLBACK24-NEXT:    movzbl (%eax), %ecx
; FALLBACK24-NEXT:    movl %ecx, %eax
; FALLBACK24-NEXT:    shlb $3, %al
; FALLBACK24-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; FALLBACK24-NEXT:    vmovaps %xmm1, {{[0-9]+}}(%esp)
; FALLBACK24-NEXT:    vmovaps %xmm0, {{[0-9]+}}(%esp)
; FALLBACK24-NEXT:    andb $12, %cl
; FALLBACK24-NEXT:    negb %cl
; FALLBACK24-NEXT:    movsbl %cl, %edi
; FALLBACK24-NEXT:    movl 44(%esp,%edi), %ebx
; FALLBACK24-NEXT:    movl %eax, %ecx
; FALLBACK24-NEXT:    shll %cl, %ebx
; FALLBACK24-NEXT:    movl %eax, %edx
; FALLBACK24-NEXT:    notb %dl
; FALLBACK24-NEXT:    movl 40(%esp,%edi), %ebp
; FALLBACK24-NEXT:    movl %ebp, %esi
; FALLBACK24-NEXT:    shrl %esi
; FALLBACK24-NEXT:    movl %edx, %ecx
; FALLBACK24-NEXT:    shrl %cl, %esi
; FALLBACK24-NEXT:    orl %ebx, %esi
; FALLBACK24-NEXT:    movl %eax, %ecx
; FALLBACK24-NEXT:    shll %cl, %ebp
; FALLBACK24-NEXT:    movl 32(%esp,%edi), %ecx
; FALLBACK24-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK24-NEXT:    movl 36(%esp,%edi), %ebx
; FALLBACK24-NEXT:    movl %ebx, %edi
; FALLBACK24-NEXT:    shrl %edi
; FALLBACK24-NEXT:    movl %edx, %ecx
; FALLBACK24-NEXT:    shrl %cl, %edi
; FALLBACK24-NEXT:    orl %ebp, %edi
; FALLBACK24-NEXT:    movl %eax, %ecx
; FALLBACK24-NEXT:    shll %cl, %ebx
; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
; FALLBACK24-NEXT:    shrl %ebp
; FALLBACK24-NEXT:    movl %edx, %ecx
; FALLBACK24-NEXT:    shrl %cl, %ebp
; FALLBACK24-NEXT:    orl %ebx, %ebp
; FALLBACK24-NEXT:    movl {{[0-9]+}}(%esp), %edx
; FALLBACK24-NEXT:    movl %eax, %ecx
; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK24-NEXT:    shll %cl, %eax
; FALLBACK24-NEXT:    movl %eax, (%edx)
; FALLBACK24-NEXT:    movl %ebp, 4(%edx)
; FALLBACK24-NEXT:    movl %edi, 8(%edx)
; FALLBACK24-NEXT:    movl %esi, 12(%edx)
; FALLBACK24-NEXT:    addl $60, %esp
; FALLBACK24-NEXT:    popl %esi
; FALLBACK24-NEXT:    popl %edi
; FALLBACK24-NEXT:    popl %ebx
; FALLBACK24-NEXT:    popl %ebp
; FALLBACK24-NEXT:    retl
;
; FALLBACK25-LABEL: shl_16bytes:
; FALLBACK25:       # %bb.0:
; FALLBACK25-NEXT:    pushl %ebp
; FALLBACK25-NEXT:    pushl %ebx
; FALLBACK25-NEXT:    pushl %edi
; FALLBACK25-NEXT:    pushl %esi
; FALLBACK25-NEXT:    subl $44, %esp
; FALLBACK25-NEXT:    movl {{[0-9]+}}(%esp), %eax
; FALLBACK25-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; FALLBACK25-NEXT:    movl {{[0-9]+}}(%esp), %edx
; FALLBACK25-NEXT:    vmovups (%edx), %xmm0
; FALLBACK25-NEXT:    movzbl (%ecx), %edx
; FALLBACK25-NEXT:    movl %edx, %ecx
; FALLBACK25-NEXT:    shlb $3, %cl
; FALLBACK25-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; FALLBACK25-NEXT:    vmovaps %xmm1, (%esp)
; FALLBACK25-NEXT:    vmovaps %xmm0, {{[0-9]+}}(%esp)
; FALLBACK25-NEXT:    andb $12, %dl
; FALLBACK25-NEXT:    negb %dl
; FALLBACK25-NEXT:    movsbl %dl, %edi
; FALLBACK25-NEXT:    movl 24(%esp,%edi), %esi
; FALLBACK25-NEXT:    movl 28(%esp,%edi), %edx
; FALLBACK25-NEXT:    shldl %cl, %esi, %edx
; FALLBACK25-NEXT:    movl 16(%esp,%edi), %ebx
; FALLBACK25-NEXT:    movl 20(%esp,%edi), %edi
; FALLBACK25-NEXT:    shldl %cl, %edi, %esi
; FALLBACK25-NEXT:    movl %ebx, %ebp
; FALLBACK25-NEXT:    shll %cl, %ebp
; FALLBACK25-NEXT:    shldl %cl, %ebx, %edi
; FALLBACK25-NEXT:    movl %edi, 4(%eax)
; FALLBACK25-NEXT:    movl %esi, 8(%eax)
; FALLBACK25-NEXT:    movl %edx, 12(%eax)
; FALLBACK25-NEXT:    movl %ebp, (%eax)
; FALLBACK25-NEXT:    addl $44, %esp
; FALLBACK25-NEXT:    popl %esi
; FALLBACK25-NEXT:    popl %edi
; FALLBACK25-NEXT:    popl %ebx
; FALLBACK25-NEXT:    popl %ebp
; FALLBACK25-NEXT:    retl
;
; FALLBACK26-LABEL: shl_16bytes:
; FALLBACK26:       # %bb.0:
; FALLBACK26-NEXT:    pushl %ebp
; FALLBACK26-NEXT:    pushl %ebx
; FALLBACK26-NEXT:    pushl %edi
; FALLBACK26-NEXT:    pushl %esi
; FALLBACK26-NEXT:    subl $44, %esp
; FALLBACK26-NEXT:    movl {{[0-9]+}}(%esp), %eax
; FALLBACK26-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; FALLBACK26-NEXT:    vmovups (%ecx), %xmm0
; FALLBACK26-NEXT:    movzbl (%eax), %ecx
; FALLBACK26-NEXT:    movl %ecx, %eax
; FALLBACK26-NEXT:    shlb $3, %al
; FALLBACK26-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; FALLBACK26-NEXT:    vmovaps %xmm1, (%esp)
; FALLBACK26-NEXT:    vmovaps %xmm0, {{[0-9]+}}(%esp)
; FALLBACK26-NEXT:    andb $12, %cl
; FALLBACK26-NEXT:    negb %cl
; FALLBACK26-NEXT:    movsbl %cl, %ecx
; FALLBACK26-NEXT:    shlxl %eax, 28(%esp,%ecx), %esi
; FALLBACK26-NEXT:    movl 24(%esp,%ecx), %edx
; FALLBACK26-NEXT:    shlxl %eax, %edx, %edi
; FALLBACK26-NEXT:    movl %eax, %ebx
; FALLBACK26-NEXT:    notb %bl
; FALLBACK26-NEXT:    shrl %edx
; FALLBACK26-NEXT:    shrxl %ebx, %edx, %edx
; FALLBACK26-NEXT:    orl %esi, %edx
; FALLBACK26-NEXT:    movl 20(%esp,%ecx), %esi
; FALLBACK26-NEXT:    movl %esi, %ebp
; FALLBACK26-NEXT:    shrl %ebp
; FALLBACK26-NEXT:    shrxl %ebx, %ebp, %ebp
; FALLBACK26-NEXT:    orl %edi, %ebp
; FALLBACK26-NEXT:    shlxl %eax, %esi, %esi
; FALLBACK26-NEXT:    movl 16(%esp,%ecx), %ecx
; FALLBACK26-NEXT:    shlxl %eax, %ecx, %eax
; FALLBACK26-NEXT:    shrl %ecx
; FALLBACK26-NEXT:    shrxl %ebx, %ecx, %ecx
; FALLBACK26-NEXT:    orl %esi, %ecx
; FALLBACK26-NEXT:    movl {{[0-9]+}}(%esp), %esi
; FALLBACK26-NEXT:    movl %eax, (%esi)
; FALLBACK26-NEXT:    movl %ecx, 4(%esi)
; FALLBACK26-NEXT:    movl %ebp, 8(%esi)
; FALLBACK26-NEXT:    movl %edx, 12(%esi)
; FALLBACK26-NEXT:    addl $44, %esp
; FALLBACK26-NEXT:    popl %esi
; FALLBACK26-NEXT:    popl %edi
; FALLBACK26-NEXT:    popl %ebx
; FALLBACK26-NEXT:    popl %ebp
; FALLBACK26-NEXT:    retl
;
; FALLBACK27-LABEL: shl_16bytes:
; FALLBACK27:       # %bb.0:
; FALLBACK27-NEXT:    pushl %ebp
; FALLBACK27-NEXT:    pushl %ebx
; FALLBACK27-NEXT:    pushl %edi
; FALLBACK27-NEXT:    pushl %esi
; FALLBACK27-NEXT:    subl $44, %esp
; FALLBACK27-NEXT:    movl {{[0-9]+}}(%esp), %eax
; FALLBACK27-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; FALLBACK27-NEXT:    movl {{[0-9]+}}(%esp), %edx
; FALLBACK27-NEXT:    vmovups (%edx), %xmm0
; FALLBACK27-NEXT:    movzbl (%ecx), %edx
; FALLBACK27-NEXT:    movl %edx, %ecx
; FALLBACK27-NEXT:    shlb $3, %cl
; FALLBACK27-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; FALLBACK27-NEXT:    vmovaps %xmm1, (%esp)
; FALLBACK27-NEXT:    vmovaps %xmm0, {{[0-9]+}}(%esp)
; FALLBACK27-NEXT:    andb $12, %dl
; FALLBACK27-NEXT:    negb %dl
; FALLBACK27-NEXT:    movsbl %dl, %edi
; FALLBACK27-NEXT:    movl 24(%esp,%edi), %esi
; FALLBACK27-NEXT:    movl 28(%esp,%edi), %edx
; FALLBACK27-NEXT:    shldl %cl, %esi, %edx
; FALLBACK27-NEXT:    movl 16(%esp,%edi), %ebx
; FALLBACK27-NEXT:    movl 20(%esp,%edi), %edi
; FALLBACK27-NEXT:    shldl %cl, %edi, %esi
; FALLBACK27-NEXT:    shlxl %ecx, %ebx, %ebp
; FALLBACK27-NEXT:    # kill: def $cl killed $cl killed $ecx
; FALLBACK27-NEXT:    shldl %cl, %ebx, %edi
; FALLBACK27-NEXT:    movl %edi, 4(%eax)
; FALLBACK27-NEXT:    movl %esi, 8(%eax)
; FALLBACK27-NEXT:    movl %edx, 12(%eax)
; FALLBACK27-NEXT:    movl %ebp, (%eax)
; FALLBACK27-NEXT:    addl $44, %esp
; FALLBACK27-NEXT:    popl %esi
; FALLBACK27-NEXT:    popl %edi
; FALLBACK27-NEXT:    popl %ebx
; FALLBACK27-NEXT:    popl %ebp
; FALLBACK27-NEXT:    retl
;
; FALLBACK28-LABEL: shl_16bytes:
; FALLBACK28:       # %bb.0:
; FALLBACK28-NEXT:    pushl %ebp
; FALLBACK28-NEXT:    pushl %ebx
; FALLBACK28-NEXT:    pushl %edi
; FALLBACK28-NEXT:    pushl %esi
; FALLBACK28-NEXT:    subl $60, %esp
; FALLBACK28-NEXT:    movl {{[0-9]+}}(%esp), %eax
; FALLBACK28-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; FALLBACK28-NEXT:    vmovups (%ecx), %xmm0
; FALLBACK28-NEXT:    movzbl (%eax), %ecx
; FALLBACK28-NEXT:    movl %ecx, %eax
; FALLBACK28-NEXT:    shlb $3, %al
; FALLBACK28-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; FALLBACK28-NEXT:    vmovaps %xmm1, {{[0-9]+}}(%esp)
; FALLBACK28-NEXT:    vmovaps %xmm0, {{[0-9]+}}(%esp)
; FALLBACK28-NEXT:    andb $12, %cl
; FALLBACK28-NEXT:    negb %cl
; FALLBACK28-NEXT:    movsbl %cl, %edi
; FALLBACK28-NEXT:    movl 44(%esp,%edi), %ebx
; FALLBACK28-NEXT:    movl %eax, %ecx
; FALLBACK28-NEXT:    shll %cl, %ebx
; FALLBACK28-NEXT:    movl %eax, %edx
; FALLBACK28-NEXT:    notb %dl
; FALLBACK28-NEXT:    movl 40(%esp,%edi), %ebp
; FALLBACK28-NEXT:    movl %ebp, %esi
; FALLBACK28-NEXT:    shrl %esi
; FALLBACK28-NEXT:    movl %edx, %ecx
; FALLBACK28-NEXT:    shrl %cl, %esi
; FALLBACK28-NEXT:    orl %ebx, %esi
; FALLBACK28-NEXT:    movl %eax, %ecx
; FALLBACK28-NEXT:    shll %cl, %ebp
; FALLBACK28-NEXT:    movl 32(%esp,%edi), %ecx
; FALLBACK28-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK28-NEXT:    movl 36(%esp,%edi), %ebx
; FALLBACK28-NEXT:    movl %ebx, %edi
; FALLBACK28-NEXT:    shrl %edi
; FALLBACK28-NEXT:    movl %edx, %ecx
; FALLBACK28-NEXT:    shrl %cl, %edi
; FALLBACK28-NEXT:    orl %ebp, %edi
; FALLBACK28-NEXT:    movl %eax, %ecx
; FALLBACK28-NEXT:    shll %cl, %ebx
; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
; FALLBACK28-NEXT:    shrl %ebp
; FALLBACK28-NEXT:    movl %edx, %ecx
; FALLBACK28-NEXT:    shrl %cl, %ebp
; FALLBACK28-NEXT:    orl %ebx, %ebp
; FALLBACK28-NEXT:    movl {{[0-9]+}}(%esp), %edx
; FALLBACK28-NEXT:    movl %eax, %ecx
; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK28-NEXT:    shll %cl, %eax
; FALLBACK28-NEXT:    movl %eax, (%edx)
; FALLBACK28-NEXT:    movl %ebp, 4(%edx)
; FALLBACK28-NEXT:    movl %edi, 8(%edx)
; FALLBACK28-NEXT:    movl %esi, 12(%edx)
; FALLBACK28-NEXT:    addl $60, %esp
; FALLBACK28-NEXT:    popl %esi
; FALLBACK28-NEXT:    popl %edi
; FALLBACK28-NEXT:    popl %ebx
; FALLBACK28-NEXT:    popl %ebp
; FALLBACK28-NEXT:    retl
;
; FALLBACK29-LABEL: shl_16bytes:
; FALLBACK29:       # %bb.0:
; FALLBACK29-NEXT:    pushl %ebp
; FALLBACK29-NEXT:    pushl %ebx
; FALLBACK29-NEXT:    pushl %edi
; FALLBACK29-NEXT:    pushl %esi
; FALLBACK29-NEXT:    subl $44, %esp
; FALLBACK29-NEXT:    movl {{[0-9]+}}(%esp), %eax
; FALLBACK29-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; FALLBACK29-NEXT:    movl {{[0-9]+}}(%esp), %edx
; FALLBACK29-NEXT:    vmovups (%edx), %xmm0
; FALLBACK29-NEXT:    movzbl (%ecx), %edx
; FALLBACK29-NEXT:    movl %edx, %ecx
; FALLBACK29-NEXT:    shlb $3, %cl
; FALLBACK29-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; FALLBACK29-NEXT:    vmovaps %xmm1, (%esp)
; FALLBACK29-NEXT:    vmovaps %xmm0, {{[0-9]+}}(%esp)
; FALLBACK29-NEXT:    andb $12, %dl
; FALLBACK29-NEXT:    negb %dl
; FALLBACK29-NEXT:    movsbl %dl, %edi
; FALLBACK29-NEXT:    movl 24(%esp,%edi), %esi
; FALLBACK29-NEXT:    movl 28(%esp,%edi), %edx
; FALLBACK29-NEXT:    shldl %cl, %esi, %edx
; FALLBACK29-NEXT:    movl 16(%esp,%edi), %ebx
; FALLBACK29-NEXT:    movl 20(%esp,%edi), %edi
; FALLBACK29-NEXT:    shldl %cl, %edi, %esi
; FALLBACK29-NEXT:    movl %ebx, %ebp
; FALLBACK29-NEXT:    shll %cl, %ebp
; FALLBACK29-NEXT:    shldl %cl, %ebx, %edi
; FALLBACK29-NEXT:    movl %edi, 4(%eax)
; FALLBACK29-NEXT:    movl %esi, 8(%eax)
; FALLBACK29-NEXT:    movl %edx, 12(%eax)
; FALLBACK29-NEXT:    movl %ebp, (%eax)
; FALLBACK29-NEXT:    addl $44, %esp
; FALLBACK29-NEXT:    popl %esi
; FALLBACK29-NEXT:    popl %edi
; FALLBACK29-NEXT:    popl %ebx
; FALLBACK29-NEXT:    popl %ebp
; FALLBACK29-NEXT:    retl
;
; FALLBACK30-LABEL: shl_16bytes:
; FALLBACK30:       # %bb.0:
; FALLBACK30-NEXT:    pushl %ebp
; FALLBACK30-NEXT:    pushl %ebx
; FALLBACK30-NEXT:    pushl %edi
; FALLBACK30-NEXT:    pushl %esi
; FALLBACK30-NEXT:    subl $44, %esp
; FALLBACK30-NEXT:    movl {{[0-9]+}}(%esp), %eax
; FALLBACK30-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; FALLBACK30-NEXT:    vmovups (%ecx), %xmm0
; FALLBACK30-NEXT:    movzbl (%eax), %ecx
; FALLBACK30-NEXT:    movl %ecx, %eax
; FALLBACK30-NEXT:    shlb $3, %al
; FALLBACK30-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; FALLBACK30-NEXT:    vmovaps %xmm1, (%esp)
; FALLBACK30-NEXT:    vmovaps %xmm0, {{[0-9]+}}(%esp)
; FALLBACK30-NEXT:    andb $12, %cl
; FALLBACK30-NEXT:    negb %cl
; FALLBACK30-NEXT:    movsbl %cl, %ecx
; FALLBACK30-NEXT:    shlxl %eax, 28(%esp,%ecx), %esi
; FALLBACK30-NEXT:    movl 24(%esp,%ecx), %edx
; FALLBACK30-NEXT:    shlxl %eax, %edx, %edi
; FALLBACK30-NEXT:    movl %eax, %ebx
; FALLBACK30-NEXT:    notb %bl
; FALLBACK30-NEXT:    shrl %edx
; FALLBACK30-NEXT:    shrxl %ebx, %edx, %edx
; FALLBACK30-NEXT:    orl %esi, %edx
; FALLBACK30-NEXT:    movl 20(%esp,%ecx), %esi
; FALLBACK30-NEXT:    movl %esi, %ebp
; FALLBACK30-NEXT:    shrl %ebp
; FALLBACK30-NEXT:    shrxl %ebx, %ebp, %ebp
; FALLBACK30-NEXT:    orl %edi, %ebp
; FALLBACK30-NEXT:    shlxl %eax, %esi, %esi
; FALLBACK30-NEXT:    movl 16(%esp,%ecx), %ecx
; FALLBACK30-NEXT:    shlxl %eax, %ecx, %eax
; FALLBACK30-NEXT:    shrl %ecx
; FALLBACK30-NEXT:    shrxl %ebx, %ecx, %ecx
; FALLBACK30-NEXT:    orl %esi, %ecx
; FALLBACK30-NEXT:    movl {{[0-9]+}}(%esp), %esi
; FALLBACK30-NEXT:    movl %eax, (%esi)
; FALLBACK30-NEXT:    movl %ecx, 4(%esi)
; FALLBACK30-NEXT:    movl %ebp, 8(%esi)
; FALLBACK30-NEXT:    movl %edx, 12(%esi)
; FALLBACK30-NEXT:    addl $44, %esp
; FALLBACK30-NEXT:    popl %esi
; FALLBACK30-NEXT:    popl %edi
; FALLBACK30-NEXT:    popl %ebx
; FALLBACK30-NEXT:    popl %ebp
; FALLBACK30-NEXT:    retl
;
; FALLBACK31-LABEL: shl_16bytes:
; FALLBACK31:       # %bb.0:
; FALLBACK31-NEXT:    pushl %ebp
; FALLBACK31-NEXT:    pushl %ebx
; FALLBACK31-NEXT:    pushl %edi
; FALLBACK31-NEXT:    pushl %esi
; FALLBACK31-NEXT:    subl $44, %esp
; FALLBACK31-NEXT:    movl {{[0-9]+}}(%esp), %eax
; FALLBACK31-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; FALLBACK31-NEXT:    movl {{[0-9]+}}(%esp), %edx
; FALLBACK31-NEXT:    vmovups (%edx), %xmm0
; FALLBACK31-NEXT:    movzbl (%ecx), %edx
; FALLBACK31-NEXT:    movl %edx, %ecx
; FALLBACK31-NEXT:    shlb $3, %cl
; FALLBACK31-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; FALLBACK31-NEXT:    vmovaps %xmm1, (%esp)
; FALLBACK31-NEXT:    vmovaps %xmm0, {{[0-9]+}}(%esp)
; FALLBACK31-NEXT:    andb $12, %dl
; FALLBACK31-NEXT:    negb %dl
; FALLBACK31-NEXT:    movsbl %dl, %edi
; FALLBACK31-NEXT:    movl 24(%esp,%edi), %esi
; FALLBACK31-NEXT:    movl 28(%esp,%edi), %edx
; FALLBACK31-NEXT:    shldl %cl, %esi, %edx
; FALLBACK31-NEXT:    movl 16(%esp,%edi), %ebx
; FALLBACK31-NEXT:    movl 20(%esp,%edi), %edi
; FALLBACK31-NEXT:    shldl %cl, %edi, %esi
; FALLBACK31-NEXT:    shlxl %ecx, %ebx, %ebp
; FALLBACK31-NEXT:    # kill: def $cl killed $cl killed $ecx
; FALLBACK31-NEXT:    shldl %cl, %ebx, %edi
; FALLBACK31-NEXT:    movl %edi, 4(%eax)
; FALLBACK31-NEXT:    movl %esi, 8(%eax)
; FALLBACK31-NEXT:    movl %edx, 12(%eax)
; FALLBACK31-NEXT:    movl %ebp, (%eax)
; FALLBACK31-NEXT:    addl $44, %esp
; FALLBACK31-NEXT:    popl %esi
; FALLBACK31-NEXT:    popl %edi
; FALLBACK31-NEXT:    popl %ebx
; FALLBACK31-NEXT:    popl %ebp
; FALLBACK31-NEXT:    retl
  %src = load i128, ptr %src.ptr, align 1
  %byteOff = load i128, ptr %byteOff.ptr, align 1
  %bitOff = shl i128 %byteOff, 3
  %res = shl i128 %src, %bitOff
  store i128 %res, ptr %dst, align 1
  ret void
}

; Left-shift a 16-byte (i128) value by a whole number of dwords.
; The value at %src.ptr and the i128 offset at %dwordOff.ptr are both loaded
; with align 1; the offset is scaled to bits (<< 5, i.e. x32) and used as the
; shl amount; the result is stored, align 1, to %dst.
; The expectations below are produced by utils/update_llc_test_checks.py (see
; the NOTE at the top of this file); regenerate them instead of editing by hand.
; In the 32-bit SSE/AVX expectations no shift instruction remains: the value is
; placed on the stack next to a zeroed 16-byte slot and the result is read back
; at a negated, 4-byte-granular offset (shlb $2 / andb $12 / negb).
define void @shl_16bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nounwind {
; X64-NO-SHLD-NO-BMI2-LABEL: shl_16bytes_dwordOff:
; X64-NO-SHLD-NO-BMI2:       # %bb.0:
; X64-NO-SHLD-NO-BMI2-NEXT:    movq (%rdi), %r8
; X64-NO-SHLD-NO-BMI2-NEXT:    movq 8(%rdi), %rdi
; X64-NO-SHLD-NO-BMI2-NEXT:    movzbl (%rsi), %eax
; X64-NO-SHLD-NO-BMI2-NEXT:    shlb $5, %al
; X64-NO-SHLD-NO-BMI2-NEXT:    movl %eax, %ecx
; X64-NO-SHLD-NO-BMI2-NEXT:    shlq %cl, %rdi
; X64-NO-SHLD-NO-BMI2-NEXT:    movq %r8, %rsi
; X64-NO-SHLD-NO-BMI2-NEXT:    shrq %rsi
; X64-NO-SHLD-NO-BMI2-NEXT:    notb %cl
; X64-NO-SHLD-NO-BMI2-NEXT:    shrq %cl, %rsi
; X64-NO-SHLD-NO-BMI2-NEXT:    orq %rdi, %rsi
; X64-NO-SHLD-NO-BMI2-NEXT:    movl %eax, %ecx
; X64-NO-SHLD-NO-BMI2-NEXT:    shlq %cl, %r8
; X64-NO-SHLD-NO-BMI2-NEXT:    xorl %ecx, %ecx
; X64-NO-SHLD-NO-BMI2-NEXT:    testb $64, %al
; X64-NO-SHLD-NO-BMI2-NEXT:    cmovneq %r8, %rsi
; X64-NO-SHLD-NO-BMI2-NEXT:    cmoveq %r8, %rcx
; X64-NO-SHLD-NO-BMI2-NEXT:    movq %rcx, (%rdx)
; X64-NO-SHLD-NO-BMI2-NEXT:    movq %rsi, 8(%rdx)
; X64-NO-SHLD-NO-BMI2-NEXT:    retq
;
; X64-HAVE-SHLD-NO-BMI2-LABEL: shl_16bytes_dwordOff:
; X64-HAVE-SHLD-NO-BMI2:       # %bb.0:
; X64-HAVE-SHLD-NO-BMI2-NEXT:    movq (%rdi), %rax
; X64-HAVE-SHLD-NO-BMI2-NEXT:    movq 8(%rdi), %rdi
; X64-HAVE-SHLD-NO-BMI2-NEXT:    movzbl (%rsi), %ecx
; X64-HAVE-SHLD-NO-BMI2-NEXT:    shlb $5, %cl
; X64-HAVE-SHLD-NO-BMI2-NEXT:    movq %rax, %rsi
; X64-HAVE-SHLD-NO-BMI2-NEXT:    shlq %cl, %rsi
; X64-HAVE-SHLD-NO-BMI2-NEXT:    shldq %cl, %rax, %rdi
; X64-HAVE-SHLD-NO-BMI2-NEXT:    xorl %eax, %eax
; X64-HAVE-SHLD-NO-BMI2-NEXT:    testb $64, %cl
; X64-HAVE-SHLD-NO-BMI2-NEXT:    cmovneq %rsi, %rdi
; X64-HAVE-SHLD-NO-BMI2-NEXT:    cmoveq %rsi, %rax
; X64-HAVE-SHLD-NO-BMI2-NEXT:    movq %rdi, 8(%rdx)
; X64-HAVE-SHLD-NO-BMI2-NEXT:    movq %rax, (%rdx)
; X64-HAVE-SHLD-NO-BMI2-NEXT:    retq
;
; X64-NO-SHLD-HAVE-BMI2-LABEL: shl_16bytes_dwordOff:
; X64-NO-SHLD-HAVE-BMI2:       # %bb.0:
; X64-NO-SHLD-HAVE-BMI2-NEXT:    movq (%rdi), %rax
; X64-NO-SHLD-HAVE-BMI2-NEXT:    movzbl (%rsi), %ecx
; X64-NO-SHLD-HAVE-BMI2-NEXT:    shlb $5, %cl
; X64-NO-SHLD-HAVE-BMI2-NEXT:    shlxq %rcx, 8(%rdi), %rsi
; X64-NO-SHLD-HAVE-BMI2-NEXT:    movl %ecx, %edi
; X64-NO-SHLD-HAVE-BMI2-NEXT:    notb %dil
; X64-NO-SHLD-HAVE-BMI2-NEXT:    shlxq %rcx, %rax, %r8
; X64-NO-SHLD-HAVE-BMI2-NEXT:    shrq %rax
; X64-NO-SHLD-HAVE-BMI2-NEXT:    shrxq %rdi, %rax, %rax
; X64-NO-SHLD-HAVE-BMI2-NEXT:    orq %rsi, %rax
; X64-NO-SHLD-HAVE-BMI2-NEXT:    xorl %esi, %esi
; X64-NO-SHLD-HAVE-BMI2-NEXT:    testb $64, %cl
; X64-NO-SHLD-HAVE-BMI2-NEXT:    cmovneq %r8, %rax
; X64-NO-SHLD-HAVE-BMI2-NEXT:    cmoveq %r8, %rsi
; X64-NO-SHLD-HAVE-BMI2-NEXT:    movq %rsi, (%rdx)
; X64-NO-SHLD-HAVE-BMI2-NEXT:    movq %rax, 8(%rdx)
; X64-NO-SHLD-HAVE-BMI2-NEXT:    retq
;
; X64-HAVE-SHLD-HAVE-BMI2-LABEL: shl_16bytes_dwordOff:
; X64-HAVE-SHLD-HAVE-BMI2:       # %bb.0:
; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    movq (%rdi), %rax
; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    movq 8(%rdi), %rdi
; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    movzbl (%rsi), %ecx
; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    shlb $5, %cl
; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    shldq %cl, %rax, %rdi
; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    shlxq %rcx, %rax, %rax
; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    xorl %esi, %esi
; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    testb $64, %cl
; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    cmovneq %rax, %rdi
; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    cmoveq %rax, %rsi
; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    movq %rdi, 8(%rdx)
; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    movq %rsi, (%rdx)
; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    retq
;
; X86-SSE2-LABEL: shl_16bytes_dwordOff:
; X86-SSE2:       # %bb.0:
; X86-SSE2-NEXT:    pushl %ebx
; X86-SSE2-NEXT:    pushl %edi
; X86-SSE2-NEXT:    pushl %esi
; X86-SSE2-NEXT:    subl $32, %esp
; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-SSE2-NEXT:    movl (%edx), %esi
; X86-SSE2-NEXT:    movl 4(%edx), %edi
; X86-SSE2-NEXT:    movl 8(%edx), %ebx
; X86-SSE2-NEXT:    movl 12(%edx), %edx
; X86-SSE2-NEXT:    movzbl (%ecx), %ecx
; X86-SSE2-NEXT:    xorps %xmm0, %xmm0
; X86-SSE2-NEXT:    movaps %xmm0, (%esp)
; X86-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT:    movl %edi, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT:    movl %esi, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT:    shlb $2, %cl
; X86-SSE2-NEXT:    andb $12, %cl
; X86-SSE2-NEXT:    negb %cl
; X86-SSE2-NEXT:    movsbl %cl, %ecx
; X86-SSE2-NEXT:    movl 16(%esp,%ecx), %edx
; X86-SSE2-NEXT:    movl 20(%esp,%ecx), %esi
; X86-SSE2-NEXT:    movl 28(%esp,%ecx), %edi
; X86-SSE2-NEXT:    movl 24(%esp,%ecx), %ecx
; X86-SSE2-NEXT:    movl %ecx, 8(%eax)
; X86-SSE2-NEXT:    movl %edi, 12(%eax)
; X86-SSE2-NEXT:    movl %edx, (%eax)
; X86-SSE2-NEXT:    movl %esi, 4(%eax)
; X86-SSE2-NEXT:    addl $32, %esp
; X86-SSE2-NEXT:    popl %esi
; X86-SSE2-NEXT:    popl %edi
; X86-SSE2-NEXT:    popl %ebx
; X86-SSE2-NEXT:    retl
;
; X86-SSE42-LABEL: shl_16bytes_dwordOff:
; X86-SSE42:       # %bb.0:
; X86-SSE42-NEXT:    subl $44, %esp
; X86-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-SSE42-NEXT:    movups (%edx), %xmm0
; X86-SSE42-NEXT:    movzbl (%ecx), %ecx
; X86-SSE42-NEXT:    xorps %xmm1, %xmm1
; X86-SSE42-NEXT:    movaps %xmm1, (%esp)
; X86-SSE42-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
; X86-SSE42-NEXT:    shlb $2, %cl
; X86-SSE42-NEXT:    andb $12, %cl
; X86-SSE42-NEXT:    negb %cl
; X86-SSE42-NEXT:    movsbl %cl, %ecx
; X86-SSE42-NEXT:    movups 16(%esp,%ecx), %xmm0
; X86-SSE42-NEXT:    movups %xmm0, (%eax)
; X86-SSE42-NEXT:    addl $44, %esp
; X86-SSE42-NEXT:    retl
;
; X86-AVX-LABEL: shl_16bytes_dwordOff:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    subl $44, %esp
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-AVX-NEXT:    vmovups (%edx), %xmm0
; X86-AVX-NEXT:    movzbl (%ecx), %ecx
; X86-AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X86-AVX-NEXT:    vmovaps %xmm1, (%esp)
; X86-AVX-NEXT:    vmovaps %xmm0, {{[0-9]+}}(%esp)
; X86-AVX-NEXT:    shlb $2, %cl
; X86-AVX-NEXT:    andb $12, %cl
; X86-AVX-NEXT:    negb %cl
; X86-AVX-NEXT:    movsbl %cl, %ecx
; X86-AVX-NEXT:    vmovups 16(%esp,%ecx), %xmm0
; X86-AVX-NEXT:    vmovups %xmm0, (%eax)
; X86-AVX-NEXT:    addl $44, %esp
; X86-AVX-NEXT:    retl
  ; Both operands are read as full i128 values with no alignment guarantee.
  %src = load i128, ptr %src.ptr, align 1
  %dwordOff = load i128, ptr %dwordOff.ptr, align 1
  ; Convert the dword count into a bit count (multiply by 32).
  %bitOff = shl i128 %dwordOff, 5
  ; The shift under test; its amount is always a multiple of 32 bits.
  %res = shl i128 %src, %bitOff
  store i128 %res, ptr %dst, align 1
  ret void
}

; Arithmetic right-shift of a 16-byte (i128) value by a whole number of bytes.
; The value at %src.ptr and the i128 offset at %byteOff.ptr are both loaded
; with align 1; the offset is scaled to bits (<< 3, i.e. x8) and used as the
; ashr amount; the result is stored, align 1, to %dst.
; The expectations below are produced by utils/update_llc_test_checks.py (see
; the NOTE at the top of this file); regenerate them instead of editing by hand.
; In the 32-bit expectations the value is spilled to the stack together with
; four copies of its sign word (sarl $31); the result is then assembled from
; dwords loaded at a 4-byte-granular offset (andb $12) plus a residual bit
; shift, with sarl/sarxl producing the topmost dword.
define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-NO-SHLD-NO-BMI2-LABEL: ashr_16bytes:
; X64-NO-SHLD-NO-BMI2:       # %bb.0:
; X64-NO-SHLD-NO-BMI2-NEXT:    movq (%rdi), %r8
; X64-NO-SHLD-NO-BMI2-NEXT:    movq 8(%rdi), %rdi
; X64-NO-SHLD-NO-BMI2-NEXT:    movzbl (%rsi), %eax
; X64-NO-SHLD-NO-BMI2-NEXT:    shlb $3, %al
; X64-NO-SHLD-NO-BMI2-NEXT:    movl %eax, %ecx
; X64-NO-SHLD-NO-BMI2-NEXT:    shrq %cl, %r8
; X64-NO-SHLD-NO-BMI2-NEXT:    leaq (%rdi,%rdi), %rsi
; X64-NO-SHLD-NO-BMI2-NEXT:    notb %cl
; X64-NO-SHLD-NO-BMI2-NEXT:    shlq %cl, %rsi
; X64-NO-SHLD-NO-BMI2-NEXT:    orq %r8, %rsi
; X64-NO-SHLD-NO-BMI2-NEXT:    movq %rdi, %r8
; X64-NO-SHLD-NO-BMI2-NEXT:    movl %eax, %ecx
; X64-NO-SHLD-NO-BMI2-NEXT:    sarq %cl, %r8
; X64-NO-SHLD-NO-BMI2-NEXT:    sarq $63, %rdi
; X64-NO-SHLD-NO-BMI2-NEXT:    testb $64, %al
; X64-NO-SHLD-NO-BMI2-NEXT:    cmovneq %r8, %rsi
; X64-NO-SHLD-NO-BMI2-NEXT:    cmoveq %r8, %rdi
; X64-NO-SHLD-NO-BMI2-NEXT:    movq %rdi, 8(%rdx)
; X64-NO-SHLD-NO-BMI2-NEXT:    movq %rsi, (%rdx)
; X64-NO-SHLD-NO-BMI2-NEXT:    retq
;
; X64-HAVE-SHLD-NO-BMI2-LABEL: ashr_16bytes:
; X64-HAVE-SHLD-NO-BMI2:       # %bb.0:
; X64-HAVE-SHLD-NO-BMI2-NEXT:    movq (%rdi), %rax
; X64-HAVE-SHLD-NO-BMI2-NEXT:    movq 8(%rdi), %rdi
; X64-HAVE-SHLD-NO-BMI2-NEXT:    movzbl (%rsi), %ecx
; X64-HAVE-SHLD-NO-BMI2-NEXT:    shlb $3, %cl
; X64-HAVE-SHLD-NO-BMI2-NEXT:    movq %rdi, %rsi
; X64-HAVE-SHLD-NO-BMI2-NEXT:    sarq %cl, %rsi
; X64-HAVE-SHLD-NO-BMI2-NEXT:    shrdq %cl, %rdi, %rax
; X64-HAVE-SHLD-NO-BMI2-NEXT:    sarq $63, %rdi
; X64-HAVE-SHLD-NO-BMI2-NEXT:    testb $64, %cl
; X64-HAVE-SHLD-NO-BMI2-NEXT:    cmovneq %rsi, %rax
; X64-HAVE-SHLD-NO-BMI2-NEXT:    cmoveq %rsi, %rdi
; X64-HAVE-SHLD-NO-BMI2-NEXT:    movq %rdi, 8(%rdx)
; X64-HAVE-SHLD-NO-BMI2-NEXT:    movq %rax, (%rdx)
; X64-HAVE-SHLD-NO-BMI2-NEXT:    retq
;
; X64-NO-SHLD-HAVE-BMI2-LABEL: ashr_16bytes:
; X64-NO-SHLD-HAVE-BMI2:       # %bb.0:
; X64-NO-SHLD-HAVE-BMI2-NEXT:    movq 8(%rdi), %rax
; X64-NO-SHLD-HAVE-BMI2-NEXT:    movzbl (%rsi), %ecx
; X64-NO-SHLD-HAVE-BMI2-NEXT:    shlb $3, %cl
; X64-NO-SHLD-HAVE-BMI2-NEXT:    shrxq %rcx, (%rdi), %rsi
; X64-NO-SHLD-HAVE-BMI2-NEXT:    movl %ecx, %edi
; X64-NO-SHLD-HAVE-BMI2-NEXT:    notb %dil
; X64-NO-SHLD-HAVE-BMI2-NEXT:    leaq (%rax,%rax), %r8
; X64-NO-SHLD-HAVE-BMI2-NEXT:    shlxq %rdi, %r8, %rdi
; X64-NO-SHLD-HAVE-BMI2-NEXT:    orq %rsi, %rdi
; X64-NO-SHLD-HAVE-BMI2-NEXT:    sarxq %rcx, %rax, %rsi
; X64-NO-SHLD-HAVE-BMI2-NEXT:    sarq $63, %rax
; X64-NO-SHLD-HAVE-BMI2-NEXT:    testb $64, %cl
; X64-NO-SHLD-HAVE-BMI2-NEXT:    cmovneq %rsi, %rdi
; X64-NO-SHLD-HAVE-BMI2-NEXT:    cmoveq %rsi, %rax
; X64-NO-SHLD-HAVE-BMI2-NEXT:    movq %rax, 8(%rdx)
; X64-NO-SHLD-HAVE-BMI2-NEXT:    movq %rdi, (%rdx)
; X64-NO-SHLD-HAVE-BMI2-NEXT:    retq
;
; X64-HAVE-SHLD-HAVE-BMI2-LABEL: ashr_16bytes:
; X64-HAVE-SHLD-HAVE-BMI2:       # %bb.0:
; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    movq (%rdi), %rax
; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    movq 8(%rdi), %rdi
; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    movzbl (%rsi), %ecx
; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    shlb $3, %cl
; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    shrdq %cl, %rdi, %rax
; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    sarxq %rcx, %rdi, %rsi
; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    sarq $63, %rdi
; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    testb $64, %cl
; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    cmovneq %rsi, %rax
; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    cmoveq %rsi, %rdi
; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    movq %rdi, 8(%rdx)
; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    movq %rax, (%rdx)
; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    retq
;
; X86-NO-SHLD-NO-BMI2-LABEL: ashr_16bytes:
; X86-NO-SHLD-NO-BMI2:       # %bb.0:
; X86-NO-SHLD-NO-BMI2-NEXT:    pushl %ebp
; X86-NO-SHLD-NO-BMI2-NEXT:    pushl %ebx
; X86-NO-SHLD-NO-BMI2-NEXT:    pushl %edi
; X86-NO-SHLD-NO-BMI2-NEXT:    pushl %esi
; X86-NO-SHLD-NO-BMI2-NEXT:    subl $60, %esp
; X86-NO-SHLD-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NO-SHLD-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NO-SHLD-NO-BMI2-NEXT:    movl (%ecx), %edx
; X86-NO-SHLD-NO-BMI2-NEXT:    movl 4(%ecx), %esi
; X86-NO-SHLD-NO-BMI2-NEXT:    movl 8(%ecx), %edi
; X86-NO-SHLD-NO-BMI2-NEXT:    movl 12(%ecx), %ecx
; X86-NO-SHLD-NO-BMI2-NEXT:    movb (%eax), %ah
; X86-NO-SHLD-NO-BMI2-NEXT:    movb %ah, %al
; X86-NO-SHLD-NO-BMI2-NEXT:    shlb $3, %al
; X86-NO-SHLD-NO-BMI2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; X86-NO-SHLD-NO-BMI2-NEXT:    movl %edi, {{[0-9]+}}(%esp)
; X86-NO-SHLD-NO-BMI2-NEXT:    movl %esi, {{[0-9]+}}(%esp)
; X86-NO-SHLD-NO-BMI2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; X86-NO-SHLD-NO-BMI2-NEXT:    sarl $31, %ecx
; X86-NO-SHLD-NO-BMI2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; X86-NO-SHLD-NO-BMI2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; X86-NO-SHLD-NO-BMI2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; X86-NO-SHLD-NO-BMI2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; X86-NO-SHLD-NO-BMI2-NEXT:    andb $12, %ah
; X86-NO-SHLD-NO-BMI2-NEXT:    movzbl %ah, %ebp
; X86-NO-SHLD-NO-BMI2-NEXT:    movl 20(%esp,%ebp), %esi
; X86-NO-SHLD-NO-BMI2-NEXT:    movl %esi, %ebx
; X86-NO-SHLD-NO-BMI2-NEXT:    movl %eax, %ecx
; X86-NO-SHLD-NO-BMI2-NEXT:    shrl %cl, %ebx
; X86-NO-SHLD-NO-BMI2-NEXT:    movl %eax, %edx
; X86-NO-SHLD-NO-BMI2-NEXT:    notb %dl
; X86-NO-SHLD-NO-BMI2-NEXT:    movl 24(%esp,%ebp), %ecx
; X86-NO-SHLD-NO-BMI2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NO-SHLD-NO-BMI2-NEXT:    leal (%ecx,%ecx), %edi
; X86-NO-SHLD-NO-BMI2-NEXT:    movl %edx, %ecx
; X86-NO-SHLD-NO-BMI2-NEXT:    shll %cl, %edi
; X86-NO-SHLD-NO-BMI2-NEXT:    orl %ebx, %edi
; X86-NO-SHLD-NO-BMI2-NEXT:    movl 16(%esp,%ebp), %ebx
; X86-NO-SHLD-NO-BMI2-NEXT:    movl %eax, %ecx
; X86-NO-SHLD-NO-BMI2-NEXT:    shrl %cl, %ebx
; X86-NO-SHLD-NO-BMI2-NEXT:    addl %esi, %esi
; X86-NO-SHLD-NO-BMI2-NEXT:    movl %edx, %ecx
; X86-NO-SHLD-NO-BMI2-NEXT:    shll %cl, %esi
; X86-NO-SHLD-NO-BMI2-NEXT:    orl %ebx, %esi
; X86-NO-SHLD-NO-BMI2-NEXT:    movl %eax, %ecx
; X86-NO-SHLD-NO-BMI2-NEXT:    shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X86-NO-SHLD-NO-BMI2-NEXT:    movl 28(%esp,%ebp), %ebx
; X86-NO-SHLD-NO-BMI2-NEXT:    leal (%ebx,%ebx), %ebp
; X86-NO-SHLD-NO-BMI2-NEXT:    movl %edx, %ecx
; X86-NO-SHLD-NO-BMI2-NEXT:    shll %cl, %ebp
; X86-NO-SHLD-NO-BMI2-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
; X86-NO-SHLD-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NO-SHLD-NO-BMI2-NEXT:    movl %eax, %ecx
; X86-NO-SHLD-NO-BMI2-NEXT:    sarl %cl, %ebx
; X86-NO-SHLD-NO-BMI2-NEXT:    movl %ebx, 12(%edx)
; X86-NO-SHLD-NO-BMI2-NEXT:    movl %ebp, 8(%edx)
; X86-NO-SHLD-NO-BMI2-NEXT:    movl %esi, (%edx)
; X86-NO-SHLD-NO-BMI2-NEXT:    movl %edi, 4(%edx)
; X86-NO-SHLD-NO-BMI2-NEXT:    addl $60, %esp
; X86-NO-SHLD-NO-BMI2-NEXT:    popl %esi
; X86-NO-SHLD-NO-BMI2-NEXT:    popl %edi
; X86-NO-SHLD-NO-BMI2-NEXT:    popl %ebx
; X86-NO-SHLD-NO-BMI2-NEXT:    popl %ebp
; X86-NO-SHLD-NO-BMI2-NEXT:    retl
;
; X86-HAVE-SHLD-NO-BMI2-LABEL: ashr_16bytes:
; X86-HAVE-SHLD-NO-BMI2:       # %bb.0:
; X86-HAVE-SHLD-NO-BMI2-NEXT:    pushl %ebp
; X86-HAVE-SHLD-NO-BMI2-NEXT:    pushl %ebx
; X86-HAVE-SHLD-NO-BMI2-NEXT:    pushl %edi
; X86-HAVE-SHLD-NO-BMI2-NEXT:    pushl %esi
; X86-HAVE-SHLD-NO-BMI2-NEXT:    subl $44, %esp
; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl (%edx), %esi
; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl 4(%edx), %edi
; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl 8(%edx), %ebx
; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl 12(%edx), %edx
; X86-HAVE-SHLD-NO-BMI2-NEXT:    movb (%ecx), %ch
; X86-HAVE-SHLD-NO-BMI2-NEXT:    movb %ch, %cl
; X86-HAVE-SHLD-NO-BMI2-NEXT:    shlb $3, %cl
; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl %edi, {{[0-9]+}}(%esp)
; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl %esi, (%esp)
; X86-HAVE-SHLD-NO-BMI2-NEXT:    sarl $31, %edx
; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; X86-HAVE-SHLD-NO-BMI2-NEXT:    andb $12, %ch
; X86-HAVE-SHLD-NO-BMI2-NEXT:    movzbl %ch, %ebx
; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl 8(%esp,%ebx), %esi
; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl (%esp,%ebx), %edx
; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl 4(%esp,%ebx), %ebp
; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl %ebp, %edi
; X86-HAVE-SHLD-NO-BMI2-NEXT:    shrdl %cl, %esi, %edi
; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl 12(%esp,%ebx), %ebx
; X86-HAVE-SHLD-NO-BMI2-NEXT:    shrdl %cl, %ebx, %esi
; X86-HAVE-SHLD-NO-BMI2-NEXT:    shrdl %cl, %ebp, %edx
; X86-HAVE-SHLD-NO-BMI2-NEXT:    sarl %cl, %ebx
; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl %esi, 8(%eax)
; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl %ebx, 12(%eax)
; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl %edx, (%eax)
; X86-HAVE-SHLD-NO-BMI2-NEXT:    movl %edi, 4(%eax)
; X86-HAVE-SHLD-NO-BMI2-NEXT:    addl $44, %esp
; X86-HAVE-SHLD-NO-BMI2-NEXT:    popl %esi
; X86-HAVE-SHLD-NO-BMI2-NEXT:    popl %edi
; X86-HAVE-SHLD-NO-BMI2-NEXT:    popl %ebx
; X86-HAVE-SHLD-NO-BMI2-NEXT:    popl %ebp
; X86-HAVE-SHLD-NO-BMI2-NEXT:    retl
;
; X86-NO-SHLD-HAVE-BMI2-LABEL: ashr_16bytes:
; X86-NO-SHLD-HAVE-BMI2:       # %bb.0:
; X86-NO-SHLD-HAVE-BMI2-NEXT:    pushl %ebp
; X86-NO-SHLD-HAVE-BMI2-NEXT:    pushl %ebx
; X86-NO-SHLD-HAVE-BMI2-NEXT:    pushl %edi
; X86-NO-SHLD-HAVE-BMI2-NEXT:    pushl %esi
; X86-NO-SHLD-HAVE-BMI2-NEXT:    subl $44, %esp
; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl (%ecx), %edx
; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl 4(%ecx), %esi
; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl 8(%ecx), %edi
; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl 12(%ecx), %ecx
; X86-NO-SHLD-HAVE-BMI2-NEXT:    movzbl (%eax), %ebx
; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl %ebx, %eax
; X86-NO-SHLD-HAVE-BMI2-NEXT:    shlb $3, %al
; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl %edi, {{[0-9]+}}(%esp)
; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl %esi, {{[0-9]+}}(%esp)
; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl %edx, (%esp)
; X86-NO-SHLD-HAVE-BMI2-NEXT:    sarl $31, %ecx
; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; X86-NO-SHLD-HAVE-BMI2-NEXT:    andb $12, %bl
; X86-NO-SHLD-HAVE-BMI2-NEXT:    movzbl %bl, %esi
; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl 4(%esp,%esi), %edi
; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl 8(%esp,%esi), %ebx
; X86-NO-SHLD-HAVE-BMI2-NEXT:    shrxl %eax, %edi, %ebp
; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl %eax, %edx
; X86-NO-SHLD-HAVE-BMI2-NEXT:    notb %dl
; X86-NO-SHLD-HAVE-BMI2-NEXT:    leal (%ebx,%ebx), %ecx
; X86-NO-SHLD-HAVE-BMI2-NEXT:    shlxl %edx, %ecx, %ecx
; X86-NO-SHLD-HAVE-BMI2-NEXT:    orl %ebp, %ecx
; X86-NO-SHLD-HAVE-BMI2-NEXT:    shrxl %eax, (%esp,%esi), %ebp
; X86-NO-SHLD-HAVE-BMI2-NEXT:    addl %edi, %edi
; X86-NO-SHLD-HAVE-BMI2-NEXT:    shlxl %edx, %edi, %edi
; X86-NO-SHLD-HAVE-BMI2-NEXT:    orl %ebp, %edi
; X86-NO-SHLD-HAVE-BMI2-NEXT:    shrxl %eax, %ebx, %ebx
; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl 12(%esp,%esi), %esi
; X86-NO-SHLD-HAVE-BMI2-NEXT:    sarxl %eax, %esi, %eax
; X86-NO-SHLD-HAVE-BMI2-NEXT:    addl %esi, %esi
; X86-NO-SHLD-HAVE-BMI2-NEXT:    shlxl %edx, %esi, %edx
; X86-NO-SHLD-HAVE-BMI2-NEXT:    orl %ebx, %edx
; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %esi
; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl %eax, 12(%esi)
; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl %edx, 8(%esi)
; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl %edi, (%esi)
; X86-NO-SHLD-HAVE-BMI2-NEXT:    movl %ecx, 4(%esi)
; X86-NO-SHLD-HAVE-BMI2-NEXT:    addl $44, %esp
; X86-NO-SHLD-HAVE-BMI2-NEXT:    popl %esi
; X86-NO-SHLD-HAVE-BMI2-NEXT:    popl %edi
; X86-NO-SHLD-HAVE-BMI2-NEXT:    popl %ebx
; X86-NO-SHLD-HAVE-BMI2-NEXT:    popl %ebp
; X86-NO-SHLD-HAVE-BMI2-NEXT:    retl
;
; X86-HAVE-SHLD-HAVE-BMI2-LABEL: ashr_16bytes:
; X86-HAVE-SHLD-HAVE-BMI2:       # %bb.0:
; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    pushl %ebp
; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    pushl %ebx
; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    pushl %edi
; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    pushl %esi
; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    subl $44, %esp
; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ebp
; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl (%edx), %esi
; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl 4(%edx), %edi
; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl 8(%edx), %ebx
; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl 12(%edx), %edx
; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movzbl (%ecx), %eax
; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl %eax, %ecx
; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    shlb $3, %cl
; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl %edi, {{[0-9]+}}(%esp)
; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl %esi, (%esp)
; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    sarl $31, %edx
; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    andb $12, %al
; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movzbl %al, %eax
; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl 8(%esp,%eax), %ebx
; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl (%esp,%eax), %edx
; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl 4(%esp,%eax), %esi
; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl %esi, %edi
; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    shrdl %cl, %ebx, %edi
; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl 12(%esp,%eax), %eax
; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    shrdl %cl, %eax, %ebx
; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl %ebx, 8(%ebp)
; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    sarxl %ecx, %eax, %eax
; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl %eax, 12(%ebp)
; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    # kill: def $cl killed $cl killed $ecx
; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    shrdl %cl, %esi, %edx
; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl %edx, (%ebp)
; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    movl %edi, 4(%ebp)
; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    addl $44, %esp
; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    popl %esi
; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    popl %edi
; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    popl %ebx
; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    popl %ebp
; X86-HAVE-SHLD-HAVE-BMI2-NEXT:    retl
  ; Both operands are read as full i128 values with no alignment guarantee.
  %src = load i128, ptr %src.ptr, align 1
  %byteOff = load i128, ptr %byteOff.ptr, align 1
  ; Convert the byte count into a bit count (multiply by 8).
  %bitOff = shl i128 %byteOff, 3
  ; The shift under test; its amount is always a multiple of 8 bits.
  %res = ashr i128 %src, %bitOff
  store i128 %res, ptr %dst, align 1
  ret void
}

; Arithmetic right shift of a 16-byte (i128) value, with the shift amount
; supplied in units of 32-bit dwords.
;   %src.ptr      - address the i128 source value is loaded from (align 1).
;   %dwordOff.ptr - address the i128 dword offset is loaded from (align 1).
;   %dst          - address the i128 result is stored to (align 1).
; The offset is scaled to a bit count with a left shift by 5 (x32) and then
; used as the ashr amount.
; In the expected code below only the low byte of the offset is read (movzbl).
; The 64-bit variants test bit 6 of the scaled amount (testb $64) to choose
; between the shifted halves and the sign fill. The 32-bit variants write the
; value followed by four copies of its sign word to a stack slot and read the
; result back at dword index (offset & 3).
; The check lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
define void @ashr_16bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nounwind {
; X64-NO-SHLD-NO-BMI2-LABEL: ashr_16bytes_dwordOff:
; X64-NO-SHLD-NO-BMI2:       # %bb.0:
; X64-NO-SHLD-NO-BMI2-NEXT:    movq (%rdi), %r8
; X64-NO-SHLD-NO-BMI2-NEXT:    movq 8(%rdi), %rdi
; X64-NO-SHLD-NO-BMI2-NEXT:    movzbl (%rsi), %eax
; X64-NO-SHLD-NO-BMI2-NEXT:    shlb $5, %al
; X64-NO-SHLD-NO-BMI2-NEXT:    movl %eax, %ecx
; X64-NO-SHLD-NO-BMI2-NEXT:    shrq %cl, %r8
; X64-NO-SHLD-NO-BMI2-NEXT:    leaq (%rdi,%rdi), %rsi
; X64-NO-SHLD-NO-BMI2-NEXT:    notb %cl
; X64-NO-SHLD-NO-BMI2-NEXT:    shlq %cl, %rsi
; X64-NO-SHLD-NO-BMI2-NEXT:    orq %r8, %rsi
; X64-NO-SHLD-NO-BMI2-NEXT:    movq %rdi, %r8
; X64-NO-SHLD-NO-BMI2-NEXT:    movl %eax, %ecx
; X64-NO-SHLD-NO-BMI2-NEXT:    sarq %cl, %r8
; X64-NO-SHLD-NO-BMI2-NEXT:    sarq $63, %rdi
; X64-NO-SHLD-NO-BMI2-NEXT:    testb $64, %al
; X64-NO-SHLD-NO-BMI2-NEXT:    cmovneq %r8, %rsi
; X64-NO-SHLD-NO-BMI2-NEXT:    cmoveq %r8, %rdi
; X64-NO-SHLD-NO-BMI2-NEXT:    movq %rdi, 8(%rdx)
; X64-NO-SHLD-NO-BMI2-NEXT:    movq %rsi, (%rdx)
; X64-NO-SHLD-NO-BMI2-NEXT:    retq
;
; X64-HAVE-SHLD-NO-BMI2-LABEL: ashr_16bytes_dwordOff:
; X64-HAVE-SHLD-NO-BMI2:       # %bb.0:
; X64-HAVE-SHLD-NO-BMI2-NEXT:    movq (%rdi), %rax
; X64-HAVE-SHLD-NO-BMI2-NEXT:    movq 8(%rdi), %rdi
; X64-HAVE-SHLD-NO-BMI2-NEXT:    movzbl (%rsi), %ecx
; X64-HAVE-SHLD-NO-BMI2-NEXT:    shlb $5, %cl
; X64-HAVE-SHLD-NO-BMI2-NEXT:    movq %rdi, %rsi
; X64-HAVE-SHLD-NO-BMI2-NEXT:    sarq %cl, %rsi
; X64-HAVE-SHLD-NO-BMI2-NEXT:    shrdq %cl, %rdi, %rax
; X64-HAVE-SHLD-NO-BMI2-NEXT:    sarq $63, %rdi
; X64-HAVE-SHLD-NO-BMI2-NEXT:    testb $64, %cl
; X64-HAVE-SHLD-NO-BMI2-NEXT:    cmovneq %rsi, %rax
; X64-HAVE-SHLD-NO-BMI2-NEXT:    cmoveq %rsi, %rdi
; X64-HAVE-SHLD-NO-BMI2-NEXT:    movq %rdi, 8(%rdx)
; X64-HAVE-SHLD-NO-BMI2-NEXT:    movq %rax, (%rdx)
; X64-HAVE-SHLD-NO-BMI2-NEXT:    retq
;
; X64-NO-SHLD-HAVE-BMI2-LABEL: ashr_16bytes_dwordOff:
; X64-NO-SHLD-HAVE-BMI2:       # %bb.0:
; X64-NO-SHLD-HAVE-BMI2-NEXT:    movq 8(%rdi), %rax
; X64-NO-SHLD-HAVE-BMI2-NEXT:    movzbl (%rsi), %ecx
; X64-NO-SHLD-HAVE-BMI2-NEXT:    shlb $5, %cl
; X64-NO-SHLD-HAVE-BMI2-NEXT:    shrxq %rcx, (%rdi), %rsi
; X64-NO-SHLD-HAVE-BMI2-NEXT:    movl %ecx, %edi
; X64-NO-SHLD-HAVE-BMI2-NEXT:    notb %dil
; X64-NO-SHLD-HAVE-BMI2-NEXT:    leaq (%rax,%rax), %r8
; X64-NO-SHLD-HAVE-BMI2-NEXT:    shlxq %rdi, %r8, %rdi
; X64-NO-SHLD-HAVE-BMI2-NEXT:    orq %rsi, %rdi
; X64-NO-SHLD-HAVE-BMI2-NEXT:    sarxq %rcx, %rax, %rsi
; X64-NO-SHLD-HAVE-BMI2-NEXT:    sarq $63, %rax
; X64-NO-SHLD-HAVE-BMI2-NEXT:    testb $64, %cl
; X64-NO-SHLD-HAVE-BMI2-NEXT:    cmovneq %rsi, %rdi
; X64-NO-SHLD-HAVE-BMI2-NEXT:    cmoveq %rsi, %rax
; X64-NO-SHLD-HAVE-BMI2-NEXT:    movq %rax, 8(%rdx)
; X64-NO-SHLD-HAVE-BMI2-NEXT:    movq %rdi, (%rdx)
; X64-NO-SHLD-HAVE-BMI2-NEXT:    retq
;
; X64-HAVE-SHLD-HAVE-BMI2-LABEL: ashr_16bytes_dwordOff:
; X64-HAVE-SHLD-HAVE-BMI2:       # %bb.0:
; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    movq (%rdi), %rax
; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    movq 8(%rdi), %rdi
; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    movzbl (%rsi), %ecx
; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    shlb $5, %cl
; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    shrdq %cl, %rdi, %rax
; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    sarxq %rcx, %rdi, %rsi
; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    sarq $63, %rdi
; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    testb $64, %cl
; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    cmovneq %rsi, %rax
; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    cmoveq %rsi, %rdi
; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    movq %rdi, 8(%rdx)
; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    movq %rax, (%rdx)
; X64-HAVE-SHLD-HAVE-BMI2-NEXT:    retq
;
; X86-SSE2-LABEL: ashr_16bytes_dwordOff:
; X86-SSE2:       # %bb.0:
; X86-SSE2-NEXT:    pushl %ebx
; X86-SSE2-NEXT:    pushl %edi
; X86-SSE2-NEXT:    pushl %esi
; X86-SSE2-NEXT:    subl $32, %esp
; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-SSE2-NEXT:    movl (%edx), %esi
; X86-SSE2-NEXT:    movl 4(%edx), %edi
; X86-SSE2-NEXT:    movl 8(%edx), %ebx
; X86-SSE2-NEXT:    movl 12(%edx), %edx
; X86-SSE2-NEXT:    movzbl (%ecx), %ecx
; X86-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT:    movl %edi, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT:    movl %esi, (%esp)
; X86-SSE2-NEXT:    sarl $31, %edx
; X86-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT:    andl $3, %ecx
; X86-SSE2-NEXT:    movl (%esp,%ecx,4), %edx
; X86-SSE2-NEXT:    movl 4(%esp,%ecx,4), %esi
; X86-SSE2-NEXT:    movl 12(%esp,%ecx,4), %edi
; X86-SSE2-NEXT:    movl 8(%esp,%ecx,4), %ecx
; X86-SSE2-NEXT:    movl %ecx, 8(%eax)
; X86-SSE2-NEXT:    movl %edi, 12(%eax)
; X86-SSE2-NEXT:    movl %edx, (%eax)
; X86-SSE2-NEXT:    movl %esi, 4(%eax)
; X86-SSE2-NEXT:    addl $32, %esp
; X86-SSE2-NEXT:    popl %esi
; X86-SSE2-NEXT:    popl %edi
; X86-SSE2-NEXT:    popl %ebx
; X86-SSE2-NEXT:    retl
;
; X86-SSE42-LABEL: ashr_16bytes_dwordOff:
; X86-SSE42:       # %bb.0:
; X86-SSE42-NEXT:    pushl %ebx
; X86-SSE42-NEXT:    pushl %edi
; X86-SSE42-NEXT:    pushl %esi
; X86-SSE42-NEXT:    subl $32, %esp
; X86-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-SSE42-NEXT:    movl (%edx), %esi
; X86-SSE42-NEXT:    movl 4(%edx), %edi
; X86-SSE42-NEXT:    movl 8(%edx), %ebx
; X86-SSE42-NEXT:    movl 12(%edx), %edx
; X86-SSE42-NEXT:    movzbl (%ecx), %ecx
; X86-SSE42-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; X86-SSE42-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
; X86-SSE42-NEXT:    movl %edi, {{[0-9]+}}(%esp)
; X86-SSE42-NEXT:    movl %esi, (%esp)
; X86-SSE42-NEXT:    sarl $31, %edx
; X86-SSE42-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; X86-SSE42-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; X86-SSE42-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; X86-SSE42-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; X86-SSE42-NEXT:    andl $3, %ecx
; X86-SSE42-NEXT:    movups (%esp,%ecx,4), %xmm0
; X86-SSE42-NEXT:    movups %xmm0, (%eax)
; X86-SSE42-NEXT:    addl $32, %esp
; X86-SSE42-NEXT:    popl %esi
; X86-SSE42-NEXT:    popl %edi
; X86-SSE42-NEXT:    popl %ebx
; X86-SSE42-NEXT:    retl
;
; X86-AVX-LABEL: ashr_16bytes_dwordOff:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    pushl %ebx
; X86-AVX-NEXT:    pushl %edi
; X86-AVX-NEXT:    pushl %esi
; X86-AVX-NEXT:    subl $32, %esp
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-AVX-NEXT:    movl (%edx), %esi
; X86-AVX-NEXT:    movl 4(%edx), %edi
; X86-AVX-NEXT:    movl 8(%edx), %ebx
; X86-AVX-NEXT:    movl 12(%edx), %edx
; X86-AVX-NEXT:    movzbl (%ecx), %ecx
; X86-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; X86-AVX-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
; X86-AVX-NEXT:    movl %edi, {{[0-9]+}}(%esp)
; X86-AVX-NEXT:    movl %esi, (%esp)
; X86-AVX-NEXT:    sarl $31, %edx
; X86-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; X86-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; X86-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; X86-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; X86-AVX-NEXT:    andl $3, %ecx
; X86-AVX-NEXT:    vmovups (%esp,%ecx,4), %xmm0
; X86-AVX-NEXT:    vmovups %xmm0, (%eax)
; X86-AVX-NEXT:    addl $32, %esp
; X86-AVX-NEXT:    popl %esi
; X86-AVX-NEXT:    popl %edi
; X86-AVX-NEXT:    popl %ebx
; X86-AVX-NEXT:    retl
  %src = load i128, ptr %src.ptr, align 1
  %dwordOff = load i128, ptr %dwordOff.ptr, align 1
  ; Scale the dword offset to a bit count (x32).
  %bitOff = shl i128 %dwordOff, 5
  ; Sign-filling shift of the full 128-bit value by that bit count.
  %res = ashr i128 %src, %bitOff
  store i128 %res, ptr %dst, align 1
  ret void
}

define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; FALLBACK0-LABEL: lshr_32bytes:
; FALLBACK0:       # %bb.0:
; FALLBACK0-NEXT:    pushq %rbx
; FALLBACK0-NEXT:    movq (%rdi), %rcx
; FALLBACK0-NEXT:    movq 8(%rdi), %r8
; FALLBACK0-NEXT:    movq 16(%rdi), %r9
; FALLBACK0-NEXT:    movq 24(%rdi), %rdi
; FALLBACK0-NEXT:    movzbl (%rsi), %esi
; FALLBACK0-NEXT:    leal (,%rsi,8), %eax
; FALLBACK0-NEXT:    xorps %xmm0, %xmm0
; FALLBACK0-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; FALLBACK0-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; FALLBACK0-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK0-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
; FALLBACK0-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
; FALLBACK0-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
; FALLBACK0-NEXT:    andb $24, %sil
; FALLBACK0-NEXT:    movzbl %sil, %r9d
; FALLBACK0-NEXT:    movq -64(%rsp,%r9), %r10
; FALLBACK0-NEXT:    movq -56(%rsp,%r9), %rdi
; FALLBACK0-NEXT:    movq %rdi, %r11
; FALLBACK0-NEXT:    movl %eax, %ecx
; FALLBACK0-NEXT:    shrq %cl, %r11
; FALLBACK0-NEXT:    movl %eax, %esi
; FALLBACK0-NEXT:    notb %sil
; FALLBACK0-NEXT:    movq -48(%rsp,%r9), %rbx
; FALLBACK0-NEXT:    leaq (%rbx,%rbx), %r8
; FALLBACK0-NEXT:    movl %esi, %ecx
; FALLBACK0-NEXT:    shlq %cl, %r8
; FALLBACK0-NEXT:    orq %r11, %r8
; FALLBACK0-NEXT:    movl %eax, %ecx
; FALLBACK0-NEXT:    shrq %cl, %r10
; FALLBACK0-NEXT:    addq %rdi, %rdi
; FALLBACK0-NEXT:    movl %esi, %ecx
; FALLBACK0-NEXT:    shlq %cl, %rdi
; FALLBACK0-NEXT:    orq %r10, %rdi
; FALLBACK0-NEXT:    movl %eax, %ecx
; FALLBACK0-NEXT:    shrq %cl, %rbx
; FALLBACK0-NEXT:    movq -40(%rsp,%r9), %r9
; FALLBACK0-NEXT:    leaq (%r9,%r9), %r10
; FALLBACK0-NEXT:    movl %esi, %ecx
; FALLBACK0-NEXT:    shlq %cl, %r10
; FALLBACK0-NEXT:    orq %rbx, %r10
; FALLBACK0-NEXT:    movl %eax, %ecx
; FALLBACK0-NEXT:    shrq %cl, %r9
; FALLBACK0-NEXT:    movq %r9, 24(%rdx)
; FALLBACK0-NEXT:    movq %r10, 16(%rdx)
; FALLBACK0-NEXT:    movq %rdi, (%rdx)
; FALLBACK0-NEXT:    movq %r8, 8(%rdx)
; FALLBACK0-NEXT:    popq %rbx
; FALLBACK0-NEXT:    retq
;
; FALLBACK1-LABEL: lshr_32bytes:
; FALLBACK1:       # %bb.0:
; FALLBACK1-NEXT:    movq (%rdi), %rax
; FALLBACK1-NEXT:    movq 8(%rdi), %r8
; FALLBACK1-NEXT:    movq 16(%rdi), %r9
; FALLBACK1-NEXT:    movq 24(%rdi), %rdi
; FALLBACK1-NEXT:    movzbl (%rsi), %esi
; FALLBACK1-NEXT:    leal (,%rsi,8), %ecx
; FALLBACK1-NEXT:    xorps %xmm0, %xmm0
; FALLBACK1-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; FALLBACK1-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; FALLBACK1-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK1-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
; FALLBACK1-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
; FALLBACK1-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
; FALLBACK1-NEXT:    andb $24, %sil
; FALLBACK1-NEXT:    movzbl %sil, %eax
; FALLBACK1-NEXT:    movq -56(%rsp,%rax), %rsi
; FALLBACK1-NEXT:    movq -72(%rsp,%rax), %rdi
; FALLBACK1-NEXT:    movq -64(%rsp,%rax), %r8
; FALLBACK1-NEXT:    movq %r8, %r9
; FALLBACK1-NEXT:    shrdq %cl, %rsi, %r9
; FALLBACK1-NEXT:    movq -48(%rsp,%rax), %rax
; FALLBACK1-NEXT:    shrdq %cl, %rax, %rsi
; FALLBACK1-NEXT:    shrdq %cl, %r8, %rdi
; FALLBACK1-NEXT:    # kill: def $cl killed $cl killed $ecx
; FALLBACK1-NEXT:    shrq %cl, %rax
; FALLBACK1-NEXT:    movq %rsi, 16(%rdx)
; FALLBACK1-NEXT:    movq %rax, 24(%rdx)
; FALLBACK1-NEXT:    movq %rdi, (%rdx)
; FALLBACK1-NEXT:    movq %r9, 8(%rdx)
; FALLBACK1-NEXT:    retq
;
; FALLBACK2-LABEL: lshr_32bytes:
; FALLBACK2:       # %bb.0:
; FALLBACK2-NEXT:    movq (%rdi), %rcx
; FALLBACK2-NEXT:    movq 8(%rdi), %r8
; FALLBACK2-NEXT:    movq 16(%rdi), %r9
; FALLBACK2-NEXT:    movq 24(%rdi), %rdi
; FALLBACK2-NEXT:    movzbl (%rsi), %esi
; FALLBACK2-NEXT:    leal (,%rsi,8), %eax
; FALLBACK2-NEXT:    xorps %xmm0, %xmm0
; FALLBACK2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; FALLBACK2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; FALLBACK2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK2-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
; FALLBACK2-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
; FALLBACK2-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
; FALLBACK2-NEXT:    andb $24, %sil
; FALLBACK2-NEXT:    movzbl %sil, %ecx
; FALLBACK2-NEXT:    movq -64(%rsp,%rcx), %rsi
; FALLBACK2-NEXT:    movq -56(%rsp,%rcx), %rdi
; FALLBACK2-NEXT:    shrxq %rax, %rsi, %r8
; FALLBACK2-NEXT:    shrxq %rax, -72(%rsp,%rcx), %r9
; FALLBACK2-NEXT:    shrxq %rax, %rdi, %r10
; FALLBACK2-NEXT:    movq -48(%rsp,%rcx), %rcx
; FALLBACK2-NEXT:    shrxq %rax, %rcx, %r11
; FALLBACK2-NEXT:    # kill: def $al killed $al killed $rax def $rax
; FALLBACK2-NEXT:    notb %al
; FALLBACK2-NEXT:    addq %rdi, %rdi
; FALLBACK2-NEXT:    shlxq %rax, %rdi, %rdi
; FALLBACK2-NEXT:    orq %r8, %rdi
; FALLBACK2-NEXT:    addq %rsi, %rsi
; FALLBACK2-NEXT:    shlxq %rax, %rsi, %rsi
; FALLBACK2-NEXT:    orq %r9, %rsi
; FALLBACK2-NEXT:    addq %rcx, %rcx
; FALLBACK2-NEXT:    shlxq %rax, %rcx, %rax
; FALLBACK2-NEXT:    orq %r10, %rax
; FALLBACK2-NEXT:    movq %r11, 24(%rdx)
; FALLBACK2-NEXT:    movq %rax, 16(%rdx)
; FALLBACK2-NEXT:    movq %rsi, (%rdx)
; FALLBACK2-NEXT:    movq %rdi, 8(%rdx)
; FALLBACK2-NEXT:    retq
;
; FALLBACK3-LABEL: lshr_32bytes:
; FALLBACK3:       # %bb.0:
; FALLBACK3-NEXT:    movq (%rdi), %rax
; FALLBACK3-NEXT:    movq 8(%rdi), %r8
; FALLBACK3-NEXT:    movq 16(%rdi), %r9
; FALLBACK3-NEXT:    movq 24(%rdi), %rdi
; FALLBACK3-NEXT:    movzbl (%rsi), %esi
; FALLBACK3-NEXT:    leal (,%rsi,8), %ecx
; FALLBACK3-NEXT:    xorps %xmm0, %xmm0
; FALLBACK3-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; FALLBACK3-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; FALLBACK3-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK3-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
; FALLBACK3-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
; FALLBACK3-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
; FALLBACK3-NEXT:    andb $24, %sil
; FALLBACK3-NEXT:    movzbl %sil, %eax
; FALLBACK3-NEXT:    movq -56(%rsp,%rax), %rsi
; FALLBACK3-NEXT:    movq -72(%rsp,%rax), %rdi
; FALLBACK3-NEXT:    movq -64(%rsp,%rax), %r8
; FALLBACK3-NEXT:    movq %r8, %r9
; FALLBACK3-NEXT:    shrdq %cl, %rsi, %r9
; FALLBACK3-NEXT:    movq -48(%rsp,%rax), %rax
; FALLBACK3-NEXT:    shrdq %cl, %rax, %rsi
; FALLBACK3-NEXT:    shrdq %cl, %r8, %rdi
; FALLBACK3-NEXT:    shrxq %rcx, %rax, %rax
; FALLBACK3-NEXT:    movq %rsi, 16(%rdx)
; FALLBACK3-NEXT:    movq %rax, 24(%rdx)
; FALLBACK3-NEXT:    movq %rdi, (%rdx)
; FALLBACK3-NEXT:    movq %r9, 8(%rdx)
; FALLBACK3-NEXT:    retq
;
; FALLBACK4-LABEL: lshr_32bytes:
; FALLBACK4:       # %bb.0:
; FALLBACK4-NEXT:    pushq %rbx
; FALLBACK4-NEXT:    movups (%rdi), %xmm0
; FALLBACK4-NEXT:    movups 16(%rdi), %xmm1
; FALLBACK4-NEXT:    movzbl (%rsi), %ecx
; FALLBACK4-NEXT:    leal (,%rcx,8), %eax
; FALLBACK4-NEXT:    xorps %xmm2, %xmm2
; FALLBACK4-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
; FALLBACK4-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
; FALLBACK4-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
; FALLBACK4-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; FALLBACK4-NEXT:    andb $24, %cl
; FALLBACK4-NEXT:    movzbl %cl, %r9d
; FALLBACK4-NEXT:    movq -64(%rsp,%r9), %r10
; FALLBACK4-NEXT:    movq -56(%rsp,%r9), %r8
; FALLBACK4-NEXT:    movl %eax, %ecx
; FALLBACK4-NEXT:    shrq %cl, %r10
; FALLBACK4-NEXT:    movl %eax, %esi
; FALLBACK4-NEXT:    notb %sil
; FALLBACK4-NEXT:    leaq (%r8,%r8), %rdi
; FALLBACK4-NEXT:    movl %esi, %ecx
; FALLBACK4-NEXT:    shlq %cl, %rdi
; FALLBACK4-NEXT:    orq %r10, %rdi
; FALLBACK4-NEXT:    movq -48(%rsp,%r9), %r10
; FALLBACK4-NEXT:    movq %r10, %r11
; FALLBACK4-NEXT:    movl %eax, %ecx
; FALLBACK4-NEXT:    shrq %cl, %r11
; FALLBACK4-NEXT:    movq -40(%rsp,%r9), %r9
; FALLBACK4-NEXT:    leaq (%r9,%r9), %rbx
; FALLBACK4-NEXT:    movl %esi, %ecx
; FALLBACK4-NEXT:    shlq %cl, %rbx
; FALLBACK4-NEXT:    orq %r11, %rbx
; FALLBACK4-NEXT:    movl %eax, %ecx
; FALLBACK4-NEXT:    shrq %cl, %r8
; FALLBACK4-NEXT:    addq %r10, %r10
; FALLBACK4-NEXT:    movl %esi, %ecx
; FALLBACK4-NEXT:    shlq %cl, %r10
; FALLBACK4-NEXT:    orq %r8, %r10
; FALLBACK4-NEXT:    movl %eax, %ecx
; FALLBACK4-NEXT:    shrq %cl, %r9
; FALLBACK4-NEXT:    movq %r9, 24(%rdx)
; FALLBACK4-NEXT:    movq %r10, 8(%rdx)
; FALLBACK4-NEXT:    movq %rbx, 16(%rdx)
; FALLBACK4-NEXT:    movq %rdi, (%rdx)
; FALLBACK4-NEXT:    popq %rbx
; FALLBACK4-NEXT:    retq
;
; FALLBACK5-LABEL: lshr_32bytes:
; FALLBACK5:       # %bb.0:
; FALLBACK5-NEXT:    movups (%rdi), %xmm0
; FALLBACK5-NEXT:    movups 16(%rdi), %xmm1
; FALLBACK5-NEXT:    movzbl (%rsi), %eax
; FALLBACK5-NEXT:    leal (,%rax,8), %ecx
; FALLBACK5-NEXT:    xorps %xmm2, %xmm2
; FALLBACK5-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
; FALLBACK5-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
; FALLBACK5-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
; FALLBACK5-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; FALLBACK5-NEXT:    andb $24, %al
; FALLBACK5-NEXT:    movzbl %al, %eax
; FALLBACK5-NEXT:    movq -48(%rsp,%rax), %rsi
; FALLBACK5-NEXT:    movq -56(%rsp,%rax), %rdi
; FALLBACK5-NEXT:    movq %rdi, %r8
; FALLBACK5-NEXT:    shrdq %cl, %rsi, %r8
; FALLBACK5-NEXT:    movq -72(%rsp,%rax), %r9
; FALLBACK5-NEXT:    movq -64(%rsp,%rax), %rax
; FALLBACK5-NEXT:    movq %rax, %r10
; FALLBACK5-NEXT:    shrdq %cl, %rdi, %r10
; FALLBACK5-NEXT:    shrdq %cl, %rax, %r9
; FALLBACK5-NEXT:    # kill: def $cl killed $cl killed $ecx
; FALLBACK5-NEXT:    shrq %cl, %rsi
; FALLBACK5-NEXT:    movq %r10, 8(%rdx)
; FALLBACK5-NEXT:    movq %r8, 16(%rdx)
; FALLBACK5-NEXT:    movq %rsi, 24(%rdx)
; FALLBACK5-NEXT:    movq %r9, (%rdx)
; FALLBACK5-NEXT:    retq
;
; FALLBACK6-LABEL: lshr_32bytes:
; FALLBACK6:       # %bb.0:
; FALLBACK6-NEXT:    movups (%rdi), %xmm0
; FALLBACK6-NEXT:    movups 16(%rdi), %xmm1
; FALLBACK6-NEXT:    movzbl (%rsi), %ecx
; FALLBACK6-NEXT:    leal (,%rcx,8), %eax
; FALLBACK6-NEXT:    xorps %xmm2, %xmm2
; FALLBACK6-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
; FALLBACK6-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
; FALLBACK6-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
; FALLBACK6-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; FALLBACK6-NEXT:    andb $24, %cl
; FALLBACK6-NEXT:    movzbl %cl, %ecx
; FALLBACK6-NEXT:    shrxq %rax, -72(%rsp,%rcx), %rsi
; FALLBACK6-NEXT:    movq -64(%rsp,%rcx), %rdi
; FALLBACK6-NEXT:    movq -56(%rsp,%rcx), %r8
; FALLBACK6-NEXT:    shrxq %rax, %r8, %r9
; FALLBACK6-NEXT:    movq -48(%rsp,%rcx), %rcx
; FALLBACK6-NEXT:    shrxq %rax, %rdi, %r10
; FALLBACK6-NEXT:    shrxq %rax, %rcx, %r11
; FALLBACK6-NEXT:    # kill: def $al killed $al killed $rax def $rax
; FALLBACK6-NEXT:    notb %al
; FALLBACK6-NEXT:    addq %rdi, %rdi
; FALLBACK6-NEXT:    shlxq %rax, %rdi, %rdi
; FALLBACK6-NEXT:    orq %rsi, %rdi
; FALLBACK6-NEXT:    addq %rcx, %rcx
; FALLBACK6-NEXT:    shlxq %rax, %rcx, %rcx
; FALLBACK6-NEXT:    orq %r9, %rcx
; FALLBACK6-NEXT:    addq %r8, %r8
; FALLBACK6-NEXT:    shlxq %rax, %r8, %rax
; FALLBACK6-NEXT:    orq %r10, %rax
; FALLBACK6-NEXT:    movq %r11, 24(%rdx)
; FALLBACK6-NEXT:    movq %rax, 8(%rdx)
; FALLBACK6-NEXT:    movq %rcx, 16(%rdx)
; FALLBACK6-NEXT:    movq %rdi, (%rdx)
; FALLBACK6-NEXT:    retq
;
; FALLBACK7-LABEL: lshr_32bytes:
; FALLBACK7:       # %bb.0:
; FALLBACK7-NEXT:    movups (%rdi), %xmm0
; FALLBACK7-NEXT:    movups 16(%rdi), %xmm1
; FALLBACK7-NEXT:    movzbl (%rsi), %eax
; FALLBACK7-NEXT:    leal (,%rax,8), %ecx
; FALLBACK7-NEXT:    xorps %xmm2, %xmm2
; FALLBACK7-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
; FALLBACK7-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
; FALLBACK7-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
; FALLBACK7-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; FALLBACK7-NEXT:    andb $24, %al
; FALLBACK7-NEXT:    movzbl %al, %eax
; FALLBACK7-NEXT:    movq -48(%rsp,%rax), %rsi
; FALLBACK7-NEXT:    movq -56(%rsp,%rax), %rdi
; FALLBACK7-NEXT:    movq %rdi, %r8
; FALLBACK7-NEXT:    shrdq %cl, %rsi, %r8
; FALLBACK7-NEXT:    movq -72(%rsp,%rax), %r9
; FALLBACK7-NEXT:    movq -64(%rsp,%rax), %rax
; FALLBACK7-NEXT:    movq %rax, %r10
; FALLBACK7-NEXT:    shrdq %cl, %rdi, %r10
; FALLBACK7-NEXT:    shrdq %cl, %rax, %r9
; FALLBACK7-NEXT:    shrxq %rcx, %rsi, %rax
; FALLBACK7-NEXT:    movq %r10, 8(%rdx)
; FALLBACK7-NEXT:    movq %r8, 16(%rdx)
; FALLBACK7-NEXT:    movq %rax, 24(%rdx)
; FALLBACK7-NEXT:    movq %r9, (%rdx)
; FALLBACK7-NEXT:    retq
;
; FALLBACK8-LABEL: lshr_32bytes:
; FALLBACK8:       # %bb.0:
; FALLBACK8-NEXT:    pushq %rbx
; FALLBACK8-NEXT:    vmovups (%rdi), %ymm0
; FALLBACK8-NEXT:    movzbl (%rsi), %ecx
; FALLBACK8-NEXT:    leal (,%rcx,8), %eax
; FALLBACK8-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; FALLBACK8-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
; FALLBACK8-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
; FALLBACK8-NEXT:    andb $24, %cl
; FALLBACK8-NEXT:    movzbl %cl, %r9d
; FALLBACK8-NEXT:    movq -64(%rsp,%r9), %r10
; FALLBACK8-NEXT:    movq -56(%rsp,%r9), %r8
; FALLBACK8-NEXT:    movl %eax, %ecx
; FALLBACK8-NEXT:    shrq %cl, %r10
; FALLBACK8-NEXT:    movl %eax, %esi
; FALLBACK8-NEXT:    notb %sil
; FALLBACK8-NEXT:    leaq (%r8,%r8), %rdi
; FALLBACK8-NEXT:    movl %esi, %ecx
; FALLBACK8-NEXT:    shlq %cl, %rdi
; FALLBACK8-NEXT:    orq %r10, %rdi
; FALLBACK8-NEXT:    movq -48(%rsp,%r9), %r10
; FALLBACK8-NEXT:    movq %r10, %r11
; FALLBACK8-NEXT:    movl %eax, %ecx
; FALLBACK8-NEXT:    shrq %cl, %r11
; FALLBACK8-NEXT:    movq -40(%rsp,%r9), %r9
; FALLBACK8-NEXT:    leaq (%r9,%r9), %rbx
; FALLBACK8-NEXT:    movl %esi, %ecx
; FALLBACK8-NEXT:    shlq %cl, %rbx
; FALLBACK8-NEXT:    orq %r11, %rbx
; FALLBACK8-NEXT:    movl %eax, %ecx
; FALLBACK8-NEXT:    shrq %cl, %r8
; FALLBACK8-NEXT:    addq %r10, %r10
; FALLBACK8-NEXT:    movl %esi, %ecx
; FALLBACK8-NEXT:    shlq %cl, %r10
; FALLBACK8-NEXT:    orq %r8, %r10
; FALLBACK8-NEXT:    movl %eax, %ecx
; FALLBACK8-NEXT:    shrq %cl, %r9
; FALLBACK8-NEXT:    movq %r9, 24(%rdx)
; FALLBACK8-NEXT:    movq %r10, 8(%rdx)
; FALLBACK8-NEXT:    movq %rbx, 16(%rdx)
; FALLBACK8-NEXT:    movq %rdi, (%rdx)
; FALLBACK8-NEXT:    popq %rbx
; FALLBACK8-NEXT:    vzeroupper
; FALLBACK8-NEXT:    retq
;
; FALLBACK9-LABEL: lshr_32bytes:
; FALLBACK9:       # %bb.0:
; FALLBACK9-NEXT:    vmovups (%rdi), %ymm0
; FALLBACK9-NEXT:    movzbl (%rsi), %eax
; FALLBACK9-NEXT:    leal (,%rax,8), %ecx
; FALLBACK9-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; FALLBACK9-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
; FALLBACK9-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
; FALLBACK9-NEXT:    andb $24, %al
; FALLBACK9-NEXT:    movzbl %al, %eax
; FALLBACK9-NEXT:    movq -48(%rsp,%rax), %rsi
; FALLBACK9-NEXT:    movq -56(%rsp,%rax), %rdi
; FALLBACK9-NEXT:    movq %rdi, %r8
; FALLBACK9-NEXT:    shrdq %cl, %rsi, %r8
; FALLBACK9-NEXT:    movq -72(%rsp,%rax), %r9
; FALLBACK9-NEXT:    movq -64(%rsp,%rax), %rax
; FALLBACK9-NEXT:    movq %rax, %r10
; FALLBACK9-NEXT:    shrdq %cl, %rdi, %r10
; FALLBACK9-NEXT:    shrdq %cl, %rax, %r9
; FALLBACK9-NEXT:    # kill: def $cl killed $cl killed $ecx
; FALLBACK9-NEXT:    shrq %cl, %rsi
; FALLBACK9-NEXT:    movq %r10, 8(%rdx)
; FALLBACK9-NEXT:    movq %r8, 16(%rdx)
; FALLBACK9-NEXT:    movq %rsi, 24(%rdx)
; FALLBACK9-NEXT:    movq %r9, (%rdx)
; FALLBACK9-NEXT:    vzeroupper
; FALLBACK9-NEXT:    retq
;
; FALLBACK10-LABEL: lshr_32bytes:
; FALLBACK10:       # %bb.0:
; FALLBACK10-NEXT:    vmovups (%rdi), %ymm0
; FALLBACK10-NEXT:    movzbl (%rsi), %ecx
; FALLBACK10-NEXT:    leal (,%rcx,8), %eax
; FALLBACK10-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; FALLBACK10-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
; FALLBACK10-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
; FALLBACK10-NEXT:    andb $24, %cl
; FALLBACK10-NEXT:    movzbl %cl, %ecx
; FALLBACK10-NEXT:    shrxq %rax, -72(%rsp,%rcx), %rsi
; FALLBACK10-NEXT:    movq -64(%rsp,%rcx), %rdi
; FALLBACK10-NEXT:    movq -56(%rsp,%rcx), %r8
; FALLBACK10-NEXT:    shrxq %rax, %r8, %r9
; FALLBACK10-NEXT:    movq -48(%rsp,%rcx), %rcx
; FALLBACK10-NEXT:    shrxq %rax, %rdi, %r10
; FALLBACK10-NEXT:    shrxq %rax, %rcx, %r11
; FALLBACK10-NEXT:    # kill: def $al killed $al killed $rax def $rax
; FALLBACK10-NEXT:    notb %al
; FALLBACK10-NEXT:    addq %rdi, %rdi
; FALLBACK10-NEXT:    shlxq %rax, %rdi, %rdi
; FALLBACK10-NEXT:    orq %rsi, %rdi
; FALLBACK10-NEXT:    addq %rcx, %rcx
; FALLBACK10-NEXT:    shlxq %rax, %rcx, %rcx
; FALLBACK10-NEXT:    orq %r9, %rcx
; FALLBACK10-NEXT:    addq %r8, %r8
; FALLBACK10-NEXT:    shlxq %rax, %r8, %rax
; FALLBACK10-NEXT:    orq %r10, %rax
; FALLBACK10-NEXT:    movq %r11, 24(%rdx)
; FALLBACK10-NEXT:    movq %rax, 8(%rdx)
; FALLBACK10-NEXT:    movq %rcx, 16(%rdx)
; FALLBACK10-NEXT:    movq %rdi, (%rdx)
; FALLBACK10-NEXT:    vzeroupper
; FALLBACK10-NEXT:    retq
;
; FALLBACK11-LABEL: lshr_32bytes:
; FALLBACK11:       # %bb.0:
; FALLBACK11-NEXT:    vmovups (%rdi), %ymm0
; FALLBACK11-NEXT:    movzbl (%rsi), %eax
; FALLBACK11-NEXT:    leal (,%rax,8), %ecx
; FALLBACK11-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; FALLBACK11-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
; FALLBACK11-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
; FALLBACK11-NEXT:    andb $24, %al
; FALLBACK11-NEXT:    movzbl %al, %eax
; FALLBACK11-NEXT:    movq -48(%rsp,%rax), %rsi
; FALLBACK11-NEXT:    movq -56(%rsp,%rax), %rdi
; FALLBACK11-NEXT:    movq %rdi, %r8
; FALLBACK11-NEXT:    shrdq %cl, %rsi, %r8
; FALLBACK11-NEXT:    movq -72(%rsp,%rax), %r9
; FALLBACK11-NEXT:    movq -64(%rsp,%rax), %rax
; FALLBACK11-NEXT:    movq %rax, %r10
; FALLBACK11-NEXT:    shrdq %cl, %rdi, %r10
; FALLBACK11-NEXT:    shrdq %cl, %rax, %r9
; FALLBACK11-NEXT:    shrxq %rcx, %rsi, %rax
; FALLBACK11-NEXT:    movq %r10, 8(%rdx)
; FALLBACK11-NEXT:    movq %r8, 16(%rdx)
; FALLBACK11-NEXT:    movq %rax, 24(%rdx)
; FALLBACK11-NEXT:    movq %r9, (%rdx)
; FALLBACK11-NEXT:    vzeroupper
; FALLBACK11-NEXT:    retq
;
; FALLBACK12-LABEL: lshr_32bytes:
; FALLBACK12:       # %bb.0:
; FALLBACK12-NEXT:    pushq %rbx
; FALLBACK12-NEXT:    vmovups (%rdi), %ymm0
; FALLBACK12-NEXT:    movzbl (%rsi), %ecx
; FALLBACK12-NEXT:    leal (,%rcx,8), %eax
; FALLBACK12-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; FALLBACK12-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
; FALLBACK12-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
; FALLBACK12-NEXT:    andb $24, %cl
; FALLBACK12-NEXT:    movzbl %cl, %r9d
; FALLBACK12-NEXT:    movq -64(%rsp,%r9), %r10
; FALLBACK12-NEXT:    movq -56(%rsp,%r9), %r8
; FALLBACK12-NEXT:    movl %eax, %ecx
; FALLBACK12-NEXT:    shrq %cl, %r10
; FALLBACK12-NEXT:    movl %eax, %esi
; FALLBACK12-NEXT:    notb %sil
; FALLBACK12-NEXT:    leaq (%r8,%r8), %rdi
; FALLBACK12-NEXT:    movl %esi, %ecx
; FALLBACK12-NEXT:    shlq %cl, %rdi
; FALLBACK12-NEXT:    orq %r10, %rdi
; FALLBACK12-NEXT:    movq -48(%rsp,%r9), %r10
; FALLBACK12-NEXT:    movq %r10, %r11
; FALLBACK12-NEXT:    movl %eax, %ecx
; FALLBACK12-NEXT:    shrq %cl, %r11
; FALLBACK12-NEXT:    movq -40(%rsp,%r9), %r9
; FALLBACK12-NEXT:    leaq (%r9,%r9), %rbx
; FALLBACK12-NEXT:    movl %esi, %ecx
; FALLBACK12-NEXT:    shlq %cl, %rbx
; FALLBACK12-NEXT:    orq %r11, %rbx
; FALLBACK12-NEXT:    movl %eax, %ecx
; FALLBACK12-NEXT:    shrq %cl, %r8
; FALLBACK12-NEXT:    addq %r10, %r10
; FALLBACK12-NEXT:    movl %esi, %ecx
; FALLBACK12-NEXT:    shlq %cl, %r10
; FALLBACK12-NEXT:    orq %r8, %r10
; FALLBACK12-NEXT:    movl %eax, %ecx
; FALLBACK12-NEXT:    shrq %cl, %r9
; FALLBACK12-NEXT:    movq %r9, 24(%rdx)
; FALLBACK12-NEXT:    movq %r10, 8(%rdx)
; FALLBACK12-NEXT:    movq %rbx, 16(%rdx)
; FALLBACK12-NEXT:    movq %rdi, (%rdx)
; FALLBACK12-NEXT:    popq %rbx
; FALLBACK12-NEXT:    vzeroupper
; FALLBACK12-NEXT:    retq
;
; FALLBACK13-LABEL: lshr_32bytes:
; FALLBACK13:       # %bb.0:
; FALLBACK13-NEXT:    vmovups (%rdi), %ymm0
; FALLBACK13-NEXT:    movzbl (%rsi), %eax
; FALLBACK13-NEXT:    leal (,%rax,8), %ecx
; FALLBACK13-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; FALLBACK13-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
; FALLBACK13-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
; FALLBACK13-NEXT:    andb $24, %al
; FALLBACK13-NEXT:    movzbl %al, %eax
; FALLBACK13-NEXT:    movq -48(%rsp,%rax), %rsi
; FALLBACK13-NEXT:    movq -56(%rsp,%rax), %rdi
; FALLBACK13-NEXT:    movq %rdi, %r8
; FALLBACK13-NEXT:    shrdq %cl, %rsi, %r8
; FALLBACK13-NEXT:    movq -72(%rsp,%rax), %r9
; FALLBACK13-NEXT:    movq -64(%rsp,%rax), %rax
; FALLBACK13-NEXT:    movq %rax, %r10
; FALLBACK13-NEXT:    shrdq %cl, %rdi, %r10
; FALLBACK13-NEXT:    shrdq %cl, %rax, %r9
; FALLBACK13-NEXT:    # kill: def $cl killed $cl killed $ecx
; FALLBACK13-NEXT:    shrq %cl, %rsi
; FALLBACK13-NEXT:    movq %r10, 8(%rdx)
; FALLBACK13-NEXT:    movq %r8, 16(%rdx)
; FALLBACK13-NEXT:    movq %rsi, 24(%rdx)
; FALLBACK13-NEXT:    movq %r9, (%rdx)
; FALLBACK13-NEXT:    vzeroupper
; FALLBACK13-NEXT:    retq
;
; FALLBACK14-LABEL: lshr_32bytes:
; FALLBACK14:       # %bb.0:
; FALLBACK14-NEXT:    vmovups (%rdi), %ymm0
; FALLBACK14-NEXT:    movzbl (%rsi), %ecx
; FALLBACK14-NEXT:    leal (,%rcx,8), %eax
; FALLBACK14-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; FALLBACK14-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
; FALLBACK14-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
; FALLBACK14-NEXT:    andb $24, %cl
; FALLBACK14-NEXT:    movzbl %cl, %ecx
; FALLBACK14-NEXT:    shrxq %rax, -72(%rsp,%rcx), %rsi
; FALLBACK14-NEXT:    movq -64(%rsp,%rcx), %rdi
; FALLBACK14-NEXT:    movq -56(%rsp,%rcx), %r8
; FALLBACK14-NEXT:    shrxq %rax, %r8, %r9
; FALLBACK14-NEXT:    movq -48(%rsp,%rcx), %rcx
; FALLBACK14-NEXT:    shrxq %rax, %rdi, %r10
; FALLBACK14-NEXT:    shrxq %rax, %rcx, %r11
; FALLBACK14-NEXT:    # kill: def $al killed $al killed $rax def $rax
; FALLBACK14-NEXT:    notb %al
; FALLBACK14-NEXT:    addq %rdi, %rdi
; FALLBACK14-NEXT:    shlxq %rax, %rdi, %rdi
; FALLBACK14-NEXT:    orq %rsi, %rdi
; FALLBACK14-NEXT:    addq %rcx, %rcx
; FALLBACK14-NEXT:    shlxq %rax, %rcx, %rcx
; FALLBACK14-NEXT:    orq %r9, %rcx
; FALLBACK14-NEXT:    addq %r8, %r8
; FALLBACK14-NEXT:    shlxq %rax, %r8, %rax
; FALLBACK14-NEXT:    orq %r10, %rax
; FALLBACK14-NEXT:    movq %r11, 24(%rdx)
; FALLBACK14-NEXT:    movq %rax, 8(%rdx)
; FALLBACK14-NEXT:    movq %rcx, 16(%rdx)
; FALLBACK14-NEXT:    movq %rdi, (%rdx)
; FALLBACK14-NEXT:    vzeroupper
; FALLBACK14-NEXT:    retq
;
; FALLBACK15-LABEL: lshr_32bytes:
; FALLBACK15:       # %bb.0:
; FALLBACK15-NEXT:    vmovups (%rdi), %ymm0
; FALLBACK15-NEXT:    movzbl (%rsi), %eax
; FALLBACK15-NEXT:    leal (,%rax,8), %ecx
; FALLBACK15-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; FALLBACK15-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
; FALLBACK15-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
; FALLBACK15-NEXT:    andb $24, %al
; FALLBACK15-NEXT:    movzbl %al, %eax
; FALLBACK15-NEXT:    movq -48(%rsp,%rax), %rsi
; FALLBACK15-NEXT:    movq -56(%rsp,%rax), %rdi
; FALLBACK15-NEXT:    movq %rdi, %r8
; FALLBACK15-NEXT:    shrdq %cl, %rsi, %r8
; FALLBACK15-NEXT:    movq -72(%rsp,%rax), %r9
; FALLBACK15-NEXT:    movq -64(%rsp,%rax), %rax
; FALLBACK15-NEXT:    movq %rax, %r10
; FALLBACK15-NEXT:    shrdq %cl, %rdi, %r10
; FALLBACK15-NEXT:    shrdq %cl, %rax, %r9
; FALLBACK15-NEXT:    shrxq %rcx, %rsi, %rax
; FALLBACK15-NEXT:    movq %r10, 8(%rdx)
; FALLBACK15-NEXT:    movq %r8, 16(%rdx)
; FALLBACK15-NEXT:    movq %rax, 24(%rdx)
; FALLBACK15-NEXT:    movq %r9, (%rdx)
; FALLBACK15-NEXT:    vzeroupper
; FALLBACK15-NEXT:    retq
;
; FALLBACK16-LABEL: lshr_32bytes:
; FALLBACK16:       # %bb.0:
; FALLBACK16-NEXT:    pushl %ebp
; FALLBACK16-NEXT:    pushl %ebx
; FALLBACK16-NEXT:    pushl %edi
; FALLBACK16-NEXT:    pushl %esi
; FALLBACK16-NEXT:    subl $108, %esp
; FALLBACK16-NEXT:    movl {{[0-9]+}}(%esp), %eax
; FALLBACK16-NEXT:    movl {{[0-9]+}}(%esp), %ebp
; FALLBACK16-NEXT:    movl (%ebp), %ecx
; FALLBACK16-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK16-NEXT:    movl 4(%ebp), %ecx
; FALLBACK16-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK16-NEXT:    movl 8(%ebp), %ecx
; FALLBACK16-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK16-NEXT:    movl 12(%ebp), %edi
; FALLBACK16-NEXT:    movl 16(%ebp), %ebx
; FALLBACK16-NEXT:    movb (%eax), %ah
; FALLBACK16-NEXT:    movl 20(%ebp), %esi
; FALLBACK16-NEXT:    movl 24(%ebp), %ecx
; FALLBACK16-NEXT:    movl 28(%ebp), %ebp
; FALLBACK16-NEXT:    xorps %xmm0, %xmm0
; FALLBACK16-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
; FALLBACK16-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK16-NEXT:    movb %ah, %dh
; FALLBACK16-NEXT:    shlb $3, %dh
; FALLBACK16-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
; FALLBACK16-NEXT:    movl %esi, {{[0-9]+}}(%esp)
; FALLBACK16-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
; FALLBACK16-NEXT:    movl %edi, {{[0-9]+}}(%esp)
; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK16-NEXT:    andb $28, %ah
; FALLBACK16-NEXT:    movzbl %ah, %edi
; FALLBACK16-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK16-NEXT:    movl 32(%esp,%edi), %esi
; FALLBACK16-NEXT:    movl 36(%esp,%edi), %eax
; FALLBACK16-NEXT:    movl %eax, %ebx
; FALLBACK16-NEXT:    movb %dh, %cl
; FALLBACK16-NEXT:    shrl %cl, %ebx
; FALLBACK16-NEXT:    movb %dh, %dl
; FALLBACK16-NEXT:    notb %dl
; FALLBACK16-NEXT:    movl 40(%esp,%edi), %edi
; FALLBACK16-NEXT:    leal (%edi,%edi), %ebp
; FALLBACK16-NEXT:    movl %edx, %ecx
; FALLBACK16-NEXT:    shll %cl, %ebp
; FALLBACK16-NEXT:    orl %ebx, %ebp
; FALLBACK16-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK16-NEXT:    movb %dh, %cl
; FALLBACK16-NEXT:    shrl %cl, %esi
; FALLBACK16-NEXT:    movl %eax, %ebx
; FALLBACK16-NEXT:    addl %eax, %ebx
; FALLBACK16-NEXT:    movl %edx, %ecx
; FALLBACK16-NEXT:    shll %cl, %ebx
; FALLBACK16-NEXT:    orl %esi, %ebx
; FALLBACK16-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK16-NEXT:    movl 44(%esp,%eax), %ebp
; FALLBACK16-NEXT:    movl %ebp, %esi
; FALLBACK16-NEXT:    movb %dh, %cl
; FALLBACK16-NEXT:    shrl %cl, %esi
; FALLBACK16-NEXT:    movl 48(%esp,%eax), %eax
; FALLBACK16-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK16-NEXT:    leal (%eax,%eax), %ebx
; FALLBACK16-NEXT:    movl %edx, %ecx
; FALLBACK16-NEXT:    shll %cl, %ebx
; FALLBACK16-NEXT:    orl %esi, %ebx
; FALLBACK16-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK16-NEXT:    movb %dh, %cl
; FALLBACK16-NEXT:    shrl %cl, %edi
; FALLBACK16-NEXT:    addl %ebp, %ebp
; FALLBACK16-NEXT:    movl %edx, %ecx
; FALLBACK16-NEXT:    shll %cl, %ebp
; FALLBACK16-NEXT:    orl %edi, %ebp
; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK16-NEXT:    movl 52(%esp,%eax), %edi
; FALLBACK16-NEXT:    movl %edi, %ebx
; FALLBACK16-NEXT:    movb %dh, %cl
; FALLBACK16-NEXT:    shrl %cl, %ebx
; FALLBACK16-NEXT:    movl 56(%esp,%eax), %esi
; FALLBACK16-NEXT:    leal (%esi,%esi), %eax
; FALLBACK16-NEXT:    movl %edx, %ecx
; FALLBACK16-NEXT:    shll %cl, %eax
; FALLBACK16-NEXT:    orl %ebx, %eax
; FALLBACK16-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK16-NEXT:    movb %dh, %cl
; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
; FALLBACK16-NEXT:    shrl %cl, %ebx
; FALLBACK16-NEXT:    addl %edi, %edi
; FALLBACK16-NEXT:    movl %edx, %ecx
; FALLBACK16-NEXT:    shll %cl, %edi
; FALLBACK16-NEXT:    orl %ebx, %edi
; FALLBACK16-NEXT:    movb %dh, %cl
; FALLBACK16-NEXT:    movl %esi, %eax
; FALLBACK16-NEXT:    shrl %cl, %eax
; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK16-NEXT:    movl 60(%esp,%ecx), %ebx
; FALLBACK16-NEXT:    leal (%ebx,%ebx), %esi
; FALLBACK16-NEXT:    movl %edx, %ecx
; FALLBACK16-NEXT:    shll %cl, %esi
; FALLBACK16-NEXT:    orl %eax, %esi
; FALLBACK16-NEXT:    movb %dh, %cl
; FALLBACK16-NEXT:    shrl %cl, %ebx
; FALLBACK16-NEXT:    movl {{[0-9]+}}(%esp), %eax
; FALLBACK16-NEXT:    movl %ebx, 28(%eax)
; FALLBACK16-NEXT:    movl %esi, 24(%eax)
; FALLBACK16-NEXT:    movl %edi, 16(%eax)
; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK16-NEXT:    movl %ecx, 20(%eax)
; FALLBACK16-NEXT:    movl %ebp, 8(%eax)
; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK16-NEXT:    movl %ecx, 12(%eax)
; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK16-NEXT:    movl %ecx, (%eax)
; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK16-NEXT:    movl %ecx, 4(%eax)
; FALLBACK16-NEXT:    addl $108, %esp
; FALLBACK16-NEXT:    popl %esi
; FALLBACK16-NEXT:    popl %edi
; FALLBACK16-NEXT:    popl %ebx
; FALLBACK16-NEXT:    popl %ebp
; FALLBACK16-NEXT:    retl
;
; FALLBACK17-LABEL: lshr_32bytes:
; FALLBACK17:       # %bb.0:
; FALLBACK17-NEXT:    pushl %ebp
; FALLBACK17-NEXT:    pushl %ebx
; FALLBACK17-NEXT:    pushl %edi
; FALLBACK17-NEXT:    pushl %esi
; FALLBACK17-NEXT:    subl $92, %esp
; FALLBACK17-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; FALLBACK17-NEXT:    movl {{[0-9]+}}(%esp), %ebp
; FALLBACK17-NEXT:    movl (%ebp), %eax
; FALLBACK17-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK17-NEXT:    movl 4(%ebp), %eax
; FALLBACK17-NEXT:    movl %eax, (%esp) # 4-byte Spill
; FALLBACK17-NEXT:    movl 8(%ebp), %esi
; FALLBACK17-NEXT:    movl 12(%ebp), %edi
; FALLBACK17-NEXT:    movl 16(%ebp), %ebx
; FALLBACK17-NEXT:    movb (%ecx), %ch
; FALLBACK17-NEXT:    movl 20(%ebp), %edx
; FALLBACK17-NEXT:    movl 24(%ebp), %eax
; FALLBACK17-NEXT:    movl 28(%ebp), %ebp
; FALLBACK17-NEXT:    xorps %xmm0, %xmm0
; FALLBACK17-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
; FALLBACK17-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
; FALLBACK17-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK17-NEXT:    movb %ch, %cl
; FALLBACK17-NEXT:    shlb $3, %cl
; FALLBACK17-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
; FALLBACK17-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK17-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
; FALLBACK17-NEXT:    movl %edi, {{[0-9]+}}(%esp)
; FALLBACK17-NEXT:    movl %esi, {{[0-9]+}}(%esp)
; FALLBACK17-NEXT:    movl (%esp), %eax # 4-byte Reload
; FALLBACK17-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK17-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK17-NEXT:    andb $28, %ch
; FALLBACK17-NEXT:    movzbl %ch, %ebp
; FALLBACK17-NEXT:    movl 24(%esp,%ebp), %edx
; FALLBACK17-NEXT:    movl 20(%esp,%ebp), %eax
; FALLBACK17-NEXT:    movl %eax, (%esp) # 4-byte Spill
; FALLBACK17-NEXT:    shrdl %cl, %edx, %eax
; FALLBACK17-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK17-NEXT:    movl 32(%esp,%ebp), %ebx
; FALLBACK17-NEXT:    movl 28(%esp,%ebp), %eax
; FALLBACK17-NEXT:    movl %eax, %esi
; FALLBACK17-NEXT:    shrdl %cl, %ebx, %esi
; FALLBACK17-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK17-NEXT:    shrdl %cl, %eax, %edx
; FALLBACK17-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK17-NEXT:    movl 40(%esp,%ebp), %edx
; FALLBACK17-NEXT:    movl 36(%esp,%ebp), %eax
; FALLBACK17-NEXT:    movl %eax, %edi
; FALLBACK17-NEXT:    shrdl %cl, %edx, %edi
; FALLBACK17-NEXT:    shrdl %cl, %eax, %ebx
; FALLBACK17-NEXT:    movl 16(%esp,%ebp), %esi
; FALLBACK17-NEXT:    movl 44(%esp,%ebp), %eax
; FALLBACK17-NEXT:    shrdl %cl, %eax, %edx
; FALLBACK17-NEXT:    movl {{[0-9]+}}(%esp), %ebp
; FALLBACK17-NEXT:    movl %edx, 24(%ebp)
; FALLBACK17-NEXT:    movl (%esp), %edx # 4-byte Reload
; FALLBACK17-NEXT:    shrdl %cl, %edx, %esi
; FALLBACK17-NEXT:    shrl %cl, %eax
; FALLBACK17-NEXT:    movl %eax, 28(%ebp)
; FALLBACK17-NEXT:    movl %ebx, 16(%ebp)
; FALLBACK17-NEXT:    movl %edi, 20(%ebp)
; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK17-NEXT:    movl %eax, 8(%ebp)
; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK17-NEXT:    movl %eax, 12(%ebp)
; FALLBACK17-NEXT:    movl %esi, (%ebp)
; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK17-NEXT:    movl %eax, 4(%ebp)
; FALLBACK17-NEXT:    addl $92, %esp
; FALLBACK17-NEXT:    popl %esi
; FALLBACK17-NEXT:    popl %edi
; FALLBACK17-NEXT:    popl %ebx
; FALLBACK17-NEXT:    popl %ebp
; FALLBACK17-NEXT:    retl
;
; FALLBACK18-LABEL: lshr_32bytes:
; FALLBACK18:       # %bb.0:
; FALLBACK18-NEXT:    pushl %ebp
; FALLBACK18-NEXT:    pushl %ebx
; FALLBACK18-NEXT:    pushl %edi
; FALLBACK18-NEXT:    pushl %esi
; FALLBACK18-NEXT:    subl $108, %esp
; FALLBACK18-NEXT:    movl {{[0-9]+}}(%esp), %ebx
; FALLBACK18-NEXT:    movl {{[0-9]+}}(%esp), %eax
; FALLBACK18-NEXT:    movl (%eax), %ecx
; FALLBACK18-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK18-NEXT:    movl 4(%eax), %ecx
; FALLBACK18-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK18-NEXT:    movl 8(%eax), %esi
; FALLBACK18-NEXT:    movl 12(%eax), %edi
; FALLBACK18-NEXT:    movl 16(%eax), %ebp
; FALLBACK18-NEXT:    movzbl (%ebx), %ebx
; FALLBACK18-NEXT:    movl 20(%eax), %edx
; FALLBACK18-NEXT:    movl 24(%eax), %ecx
; FALLBACK18-NEXT:    movl 28(%eax), %eax
; FALLBACK18-NEXT:    xorps %xmm0, %xmm0
; FALLBACK18-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
; FALLBACK18-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK18-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK18-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK18-NEXT:    movl %ebx, %eax
; FALLBACK18-NEXT:    shlb $3, %al
; FALLBACK18-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
; FALLBACK18-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
; FALLBACK18-NEXT:    movl %edi, {{[0-9]+}}(%esp)
; FALLBACK18-NEXT:    movl %esi, {{[0-9]+}}(%esp)
; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK18-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK18-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK18-NEXT:    andb $28, %bl
; FALLBACK18-NEXT:    movzbl %bl, %edi
; FALLBACK18-NEXT:    movl 36(%esp,%edi), %esi
; FALLBACK18-NEXT:    movl 40(%esp,%edi), %ecx
; FALLBACK18-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK18-NEXT:    shrxl %eax, %esi, %edx
; FALLBACK18-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK18-NEXT:    movl %eax, %edx
; FALLBACK18-NEXT:    movl %eax, %ebx
; FALLBACK18-NEXT:    notb %dl
; FALLBACK18-NEXT:    leal (%ecx,%ecx), %ebp
; FALLBACK18-NEXT:    shlxl %edx, %ebp, %eax
; FALLBACK18-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK18-NEXT:    movl %ebx, %ecx
; FALLBACK18-NEXT:    shrxl %ebx, 32(%esp,%edi), %ebx
; FALLBACK18-NEXT:    addl %esi, %esi
; FALLBACK18-NEXT:    shlxl %edx, %esi, %eax
; FALLBACK18-NEXT:    orl %ebx, %eax
; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK18-NEXT:    movl 48(%esp,%edi), %eax
; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK18-NEXT:    leal (%eax,%eax), %ebx
; FALLBACK18-NEXT:    shlxl %edx, %ebx, %esi
; FALLBACK18-NEXT:    movl 44(%esp,%edi), %ebp
; FALLBACK18-NEXT:    movl %ecx, %eax
; FALLBACK18-NEXT:    shrxl %ecx, %ebp, %ebx
; FALLBACK18-NEXT:    orl %ebx, %esi
; FALLBACK18-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK18-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
; FALLBACK18-NEXT:    movl %eax, %ebx
; FALLBACK18-NEXT:    addl %ebp, %ebp
; FALLBACK18-NEXT:    shlxl %edx, %ebp, %eax
; FALLBACK18-NEXT:    orl %ecx, %eax
; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK18-NEXT:    movl 56(%esp,%edi), %ebp
; FALLBACK18-NEXT:    leal (%ebp,%ebp), %ecx
; FALLBACK18-NEXT:    shlxl %edx, %ecx, %ecx
; FALLBACK18-NEXT:    movl 52(%esp,%edi), %eax
; FALLBACK18-NEXT:    shrxl %ebx, %eax, %esi
; FALLBACK18-NEXT:    orl %esi, %ecx
; FALLBACK18-NEXT:    shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
; FALLBACK18-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK18-NEXT:    addl %eax, %eax
; FALLBACK18-NEXT:    shlxl %edx, %eax, %esi
; FALLBACK18-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
; FALLBACK18-NEXT:    shrxl %ebx, %ebp, %eax
; FALLBACK18-NEXT:    movl 60(%esp,%edi), %edi
; FALLBACK18-NEXT:    shrxl %ebx, %edi, %ebx
; FALLBACK18-NEXT:    addl %edi, %edi
; FALLBACK18-NEXT:    shlxl %edx, %edi, %edi
; FALLBACK18-NEXT:    orl %eax, %edi
; FALLBACK18-NEXT:    movl {{[0-9]+}}(%esp), %eax
; FALLBACK18-NEXT:    movl %ebx, 28(%eax)
; FALLBACK18-NEXT:    movl %edi, 24(%eax)
; FALLBACK18-NEXT:    movl %esi, 16(%eax)
; FALLBACK18-NEXT:    movl %ecx, 20(%eax)
; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK18-NEXT:    movl %ecx, 8(%eax)
; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK18-NEXT:    movl %ecx, 12(%eax)
; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK18-NEXT:    movl %ecx, (%eax)
; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK18-NEXT:    movl %ecx, 4(%eax)
; FALLBACK18-NEXT:    addl $108, %esp
; FALLBACK18-NEXT:    popl %esi
; FALLBACK18-NEXT:    popl %edi
; FALLBACK18-NEXT:    popl %ebx
; FALLBACK18-NEXT:    popl %ebp
; FALLBACK18-NEXT:    retl
;
; FALLBACK19-LABEL: lshr_32bytes:
; FALLBACK19:       # %bb.0:
; FALLBACK19-NEXT:    pushl %ebp
; FALLBACK19-NEXT:    pushl %ebx
; FALLBACK19-NEXT:    pushl %edi
; FALLBACK19-NEXT:    pushl %esi
; FALLBACK19-NEXT:    subl $92, %esp
; FALLBACK19-NEXT:    movl {{[0-9]+}}(%esp), %ebx
; FALLBACK19-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; FALLBACK19-NEXT:    movl (%ecx), %eax
; FALLBACK19-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK19-NEXT:    movl 4(%ecx), %eax
; FALLBACK19-NEXT:    movl %eax, (%esp) # 4-byte Spill
; FALLBACK19-NEXT:    movl 8(%ecx), %esi
; FALLBACK19-NEXT:    movl 12(%ecx), %edi
; FALLBACK19-NEXT:    movl 16(%ecx), %ebp
; FALLBACK19-NEXT:    movzbl (%ebx), %ebx
; FALLBACK19-NEXT:    movl 20(%ecx), %edx
; FALLBACK19-NEXT:    movl 24(%ecx), %eax
; FALLBACK19-NEXT:    movl 28(%ecx), %ecx
; FALLBACK19-NEXT:    xorps %xmm0, %xmm0
; FALLBACK19-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
; FALLBACK19-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK19-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK19-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK19-NEXT:    movl %ebx, %ecx
; FALLBACK19-NEXT:    shlb $3, %cl
; FALLBACK19-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
; FALLBACK19-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
; FALLBACK19-NEXT:    movl %edi, {{[0-9]+}}(%esp)
; FALLBACK19-NEXT:    movl %esi, {{[0-9]+}}(%esp)
; FALLBACK19-NEXT:    movl (%esp), %eax # 4-byte Reload
; FALLBACK19-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK19-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK19-NEXT:    andb $28, %bl
; FALLBACK19-NEXT:    movzbl %bl, %ebp
; FALLBACK19-NEXT:    movl 24(%esp,%ebp), %esi
; FALLBACK19-NEXT:    movl 20(%esp,%ebp), %eax
; FALLBACK19-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK19-NEXT:    shrdl %cl, %esi, %eax
; FALLBACK19-NEXT:    movl %eax, (%esp) # 4-byte Spill
; FALLBACK19-NEXT:    movl 32(%esp,%ebp), %ebx
; FALLBACK19-NEXT:    movl 28(%esp,%ebp), %eax
; FALLBACK19-NEXT:    movl %eax, %edx
; FALLBACK19-NEXT:    shrdl %cl, %ebx, %edx
; FALLBACK19-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK19-NEXT:    shrdl %cl, %eax, %esi
; FALLBACK19-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK19-NEXT:    movl 40(%esp,%ebp), %eax
; FALLBACK19-NEXT:    movl 36(%esp,%ebp), %edx
; FALLBACK19-NEXT:    movl %edx, %esi
; FALLBACK19-NEXT:    shrdl %cl, %eax, %esi
; FALLBACK19-NEXT:    shrdl %cl, %edx, %ebx
; FALLBACK19-NEXT:    movl 16(%esp,%ebp), %edx
; FALLBACK19-NEXT:    movl 44(%esp,%ebp), %edi
; FALLBACK19-NEXT:    shrdl %cl, %edi, %eax
; FALLBACK19-NEXT:    movl {{[0-9]+}}(%esp), %ebp
; FALLBACK19-NEXT:    movl %eax, 24(%ebp)
; FALLBACK19-NEXT:    shrxl %ecx, %edi, %eax
; FALLBACK19-NEXT:    movl %eax, 28(%ebp)
; FALLBACK19-NEXT:    movl %ebx, 16(%ebp)
; FALLBACK19-NEXT:    movl %esi, 20(%ebp)
; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK19-NEXT:    movl %eax, 8(%ebp)
; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK19-NEXT:    movl %eax, 12(%ebp)
; FALLBACK19-NEXT:    # kill: def $cl killed $cl killed $ecx
; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK19-NEXT:    shrdl %cl, %eax, %edx
; FALLBACK19-NEXT:    movl %edx, (%ebp)
; FALLBACK19-NEXT:    movl (%esp), %eax # 4-byte Reload
; FALLBACK19-NEXT:    movl %eax, 4(%ebp)
; FALLBACK19-NEXT:    addl $92, %esp
; FALLBACK19-NEXT:    popl %esi
; FALLBACK19-NEXT:    popl %edi
; FALLBACK19-NEXT:    popl %ebx
; FALLBACK19-NEXT:    popl %ebp
; FALLBACK19-NEXT:    retl
;
; FALLBACK20-LABEL: lshr_32bytes:
; FALLBACK20:       # %bb.0:
; FALLBACK20-NEXT:    pushl %ebp
; FALLBACK20-NEXT:    pushl %ebx
; FALLBACK20-NEXT:    pushl %edi
; FALLBACK20-NEXT:    pushl %esi
; FALLBACK20-NEXT:    subl $108, %esp
; FALLBACK20-NEXT:    movl {{[0-9]+}}(%esp), %eax
; FALLBACK20-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; FALLBACK20-NEXT:    movups (%ecx), %xmm0
; FALLBACK20-NEXT:    movups 16(%ecx), %xmm1
; FALLBACK20-NEXT:    movzbl (%eax), %ecx
; FALLBACK20-NEXT:    movl %ecx, %eax
; FALLBACK20-NEXT:    shlb $3, %al
; FALLBACK20-NEXT:    xorps %xmm2, %xmm2
; FALLBACK20-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
; FALLBACK20-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
; FALLBACK20-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
; FALLBACK20-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
; FALLBACK20-NEXT:    andb $28, %cl
; FALLBACK20-NEXT:    movzbl %cl, %ecx
; FALLBACK20-NEXT:    movl 32(%esp,%ecx), %esi
; FALLBACK20-NEXT:    movl 36(%esp,%ecx), %ebx
; FALLBACK20-NEXT:    movl %ecx, %edi
; FALLBACK20-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK20-NEXT:    movl %eax, %ecx
; FALLBACK20-NEXT:    shrl %cl, %esi
; FALLBACK20-NEXT:    movl %eax, %edx
; FALLBACK20-NEXT:    notb %dl
; FALLBACK20-NEXT:    addl %ebx, %ebx
; FALLBACK20-NEXT:    movl %edx, %ecx
; FALLBACK20-NEXT:    shll %cl, %ebx
; FALLBACK20-NEXT:    orl %esi, %ebx
; FALLBACK20-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK20-NEXT:    movl 44(%esp,%edi), %ebp
; FALLBACK20-NEXT:    movl %ebp, %esi
; FALLBACK20-NEXT:    movl %eax, %ecx
; FALLBACK20-NEXT:    shrl %cl, %esi
; FALLBACK20-NEXT:    movl 48(%esp,%edi), %ecx
; FALLBACK20-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK20-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK20-NEXT:    leal (%ecx,%ecx), %ebx
; FALLBACK20-NEXT:    movl %edx, %ecx
; FALLBACK20-NEXT:    shll %cl, %ebx
; FALLBACK20-NEXT:    orl %esi, %ebx
; FALLBACK20-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK20-NEXT:    movl 40(%esp,%edi), %esi
; FALLBACK20-NEXT:    movl %esi, %ebx
; FALLBACK20-NEXT:    movl %eax, %ecx
; FALLBACK20-NEXT:    shrl %cl, %ebx
; FALLBACK20-NEXT:    addl %ebp, %ebp
; FALLBACK20-NEXT:    movl %edx, %ecx
; FALLBACK20-NEXT:    shll %cl, %ebp
; FALLBACK20-NEXT:    orl %ebx, %ebp
; FALLBACK20-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK20-NEXT:    movl 52(%esp,%edi), %ebp
; FALLBACK20-NEXT:    movl %ebp, %ebx
; FALLBACK20-NEXT:    movl %eax, %ecx
; FALLBACK20-NEXT:    shrl %cl, %ebx
; FALLBACK20-NEXT:    movl 56(%esp,%edi), %ecx
; FALLBACK20-NEXT:    movl %ecx, (%esp) # 4-byte Spill
; FALLBACK20-NEXT:    leal (%ecx,%ecx), %edi
; FALLBACK20-NEXT:    movl %edx, %ecx
; FALLBACK20-NEXT:    shll %cl, %edi
; FALLBACK20-NEXT:    orl %ebx, %edi
; FALLBACK20-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK20-NEXT:    movl %eax, %ecx
; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
; FALLBACK20-NEXT:    shrl %cl, %edi
; FALLBACK20-NEXT:    addl %ebp, %ebp
; FALLBACK20-NEXT:    movl %edx, %ecx
; FALLBACK20-NEXT:    shll %cl, %ebp
; FALLBACK20-NEXT:    orl %edi, %ebp
; FALLBACK20-NEXT:    movl %eax, %ecx
; FALLBACK20-NEXT:    shrl %cl, (%esp) # 4-byte Folded Spill
; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK20-NEXT:    movl 60(%esp,%ecx), %ebx
; FALLBACK20-NEXT:    leal (%ebx,%ebx), %edi
; FALLBACK20-NEXT:    movl %edx, %ecx
; FALLBACK20-NEXT:    shll %cl, %edi
; FALLBACK20-NEXT:    orl (%esp), %edi # 4-byte Folded Reload
; FALLBACK20-NEXT:    movl %eax, %ecx
; FALLBACK20-NEXT:    shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; FALLBACK20-NEXT:    addl %esi, %esi
; FALLBACK20-NEXT:    movl %edx, %ecx
; FALLBACK20-NEXT:    shll %cl, %esi
; FALLBACK20-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
; FALLBACK20-NEXT:    movl %eax, %ecx
; FALLBACK20-NEXT:    shrl %cl, %ebx
; FALLBACK20-NEXT:    movl {{[0-9]+}}(%esp), %eax
; FALLBACK20-NEXT:    movl %ebx, 28(%eax)
; FALLBACK20-NEXT:    movl %esi, 4(%eax)
; FALLBACK20-NEXT:    movl %edi, 24(%eax)
; FALLBACK20-NEXT:    movl %ebp, 16(%eax)
; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK20-NEXT:    movl %ecx, 20(%eax)
; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK20-NEXT:    movl %ecx, 8(%eax)
; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK20-NEXT:    movl %ecx, 12(%eax)
; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK20-NEXT:    movl %ecx, (%eax)
; FALLBACK20-NEXT:    addl $108, %esp
; FALLBACK20-NEXT:    popl %esi
; FALLBACK20-NEXT:    popl %edi
; FALLBACK20-NEXT:    popl %ebx
; FALLBACK20-NEXT:    popl %ebp
; FALLBACK20-NEXT:    retl
;
; FALLBACK21-LABEL: lshr_32bytes:
; FALLBACK21:       # %bb.0:
; FALLBACK21-NEXT:    pushl %ebp
; FALLBACK21-NEXT:    pushl %ebx
; FALLBACK21-NEXT:    pushl %edi
; FALLBACK21-NEXT:    pushl %esi
; FALLBACK21-NEXT:    subl $108, %esp
; FALLBACK21-NEXT:    movl {{[0-9]+}}(%esp), %eax
; FALLBACK21-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; FALLBACK21-NEXT:    movups (%ecx), %xmm0
; FALLBACK21-NEXT:    movups 16(%ecx), %xmm1
; FALLBACK21-NEXT:    movzbl (%eax), %eax
; FALLBACK21-NEXT:    movl %eax, %ecx
; FALLBACK21-NEXT:    shlb $3, %cl
; FALLBACK21-NEXT:    xorps %xmm2, %xmm2
; FALLBACK21-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
; FALLBACK21-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
; FALLBACK21-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
; FALLBACK21-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
; FALLBACK21-NEXT:    andb $28, %al
; FALLBACK21-NEXT:    movzbl %al, %ebp
; FALLBACK21-NEXT:    movl 48(%esp,%ebp), %esi
; FALLBACK21-NEXT:    movl 44(%esp,%ebp), %eax
; FALLBACK21-NEXT:    movl %eax, %edx
; FALLBACK21-NEXT:    shrdl %cl, %esi, %edx
; FALLBACK21-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK21-NEXT:    movl 40(%esp,%ebp), %edx
; FALLBACK21-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK21-NEXT:    shrdl %cl, %eax, %edx
; FALLBACK21-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK21-NEXT:    movl 56(%esp,%ebp), %ebx
; FALLBACK21-NEXT:    movl 52(%esp,%ebp), %eax
; FALLBACK21-NEXT:    movl %eax, %edx
; FALLBACK21-NEXT:    shrdl %cl, %ebx, %edx
; FALLBACK21-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK21-NEXT:    shrdl %cl, %eax, %esi
; FALLBACK21-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK21-NEXT:    movl 60(%esp,%ebp), %eax
; FALLBACK21-NEXT:    shrdl %cl, %eax, %ebx
; FALLBACK21-NEXT:    movl 32(%esp,%ebp), %edx
; FALLBACK21-NEXT:    movl 36(%esp,%ebp), %edi
; FALLBACK21-NEXT:    movl %edi, %esi
; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
; FALLBACK21-NEXT:    shrdl %cl, %ebp, %esi
; FALLBACK21-NEXT:    movl {{[0-9]+}}(%esp), %ebp
; FALLBACK21-NEXT:    movl %esi, 4(%ebp)
; FALLBACK21-NEXT:    movl %ebx, 24(%ebp)
; FALLBACK21-NEXT:    shrdl %cl, %edi, %edx
; FALLBACK21-NEXT:    shrl %cl, %eax
; FALLBACK21-NEXT:    movl %eax, 28(%ebp)
; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK21-NEXT:    movl %eax, 16(%ebp)
; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK21-NEXT:    movl %eax, 20(%ebp)
; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK21-NEXT:    movl %eax, 8(%ebp)
; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK21-NEXT:    movl %eax, 12(%ebp)
; FALLBACK21-NEXT:    movl %edx, (%ebp)
; FALLBACK21-NEXT:    addl $108, %esp
; FALLBACK21-NEXT:    popl %esi
; FALLBACK21-NEXT:    popl %edi
; FALLBACK21-NEXT:    popl %ebx
; FALLBACK21-NEXT:    popl %ebp
; FALLBACK21-NEXT:    retl
;
; FALLBACK22-LABEL: lshr_32bytes:
; FALLBACK22:       # %bb.0:
; FALLBACK22-NEXT:    pushl %ebp
; FALLBACK22-NEXT:    pushl %ebx
; FALLBACK22-NEXT:    pushl %edi
; FALLBACK22-NEXT:    pushl %esi
; FALLBACK22-NEXT:    subl $108, %esp
; FALLBACK22-NEXT:    movl {{[0-9]+}}(%esp), %eax
; FALLBACK22-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; FALLBACK22-NEXT:    movups (%ecx), %xmm0
; FALLBACK22-NEXT:    movups 16(%ecx), %xmm1
; FALLBACK22-NEXT:    movzbl (%eax), %ecx
; FALLBACK22-NEXT:    movl %ecx, %edx
; FALLBACK22-NEXT:    shlb $3, %dl
; FALLBACK22-NEXT:    xorps %xmm2, %xmm2
; FALLBACK22-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
; FALLBACK22-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
; FALLBACK22-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
; FALLBACK22-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
; FALLBACK22-NEXT:    andb $28, %cl
; FALLBACK22-NEXT:    movzbl %cl, %edi
; FALLBACK22-NEXT:    shrxl %edx, 32(%esp,%edi), %ecx
; FALLBACK22-NEXT:    movl %edx, %eax
; FALLBACK22-NEXT:    notb %al
; FALLBACK22-NEXT:    movl 36(%esp,%edi), %esi
; FALLBACK22-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK22-NEXT:    addl %esi, %esi
; FALLBACK22-NEXT:    shlxl %eax, %esi, %esi
; FALLBACK22-NEXT:    orl %ecx, %esi
; FALLBACK22-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK22-NEXT:    movl 48(%esp,%edi), %ecx
; FALLBACK22-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK22-NEXT:    addl %ecx, %ecx
; FALLBACK22-NEXT:    shlxl %eax, %ecx, %esi
; FALLBACK22-NEXT:    movl %eax, %ebp
; FALLBACK22-NEXT:    movl 44(%esp,%edi), %ecx
; FALLBACK22-NEXT:    shrxl %edx, %ecx, %ebx
; FALLBACK22-NEXT:    orl %ebx, %esi
; FALLBACK22-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK22-NEXT:    addl %ecx, %ecx
; FALLBACK22-NEXT:    shlxl %eax, %ecx, %esi
; FALLBACK22-NEXT:    movl 40(%esp,%edi), %eax
; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK22-NEXT:    shrxl %edx, %eax, %ebx
; FALLBACK22-NEXT:    orl %ebx, %esi
; FALLBACK22-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK22-NEXT:    movl 56(%esp,%edi), %esi
; FALLBACK22-NEXT:    leal (%esi,%esi), %ebx
; FALLBACK22-NEXT:    shlxl %ebp, %ebx, %eax
; FALLBACK22-NEXT:    movl %ebp, %ecx
; FALLBACK22-NEXT:    movl 52(%esp,%edi), %ebx
; FALLBACK22-NEXT:    shrxl %edx, %ebx, %ebp
; FALLBACK22-NEXT:    orl %ebp, %eax
; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK22-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
; FALLBACK22-NEXT:    addl %ebx, %ebx
; FALLBACK22-NEXT:    shlxl %ecx, %ebx, %ebx
; FALLBACK22-NEXT:    orl %ebp, %ebx
; FALLBACK22-NEXT:    shrxl %edx, %esi, %ebp
; FALLBACK22-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
; FALLBACK22-NEXT:    movl 60(%esp,%edi), %edi
; FALLBACK22-NEXT:    shrxl %edx, %edi, %eax
; FALLBACK22-NEXT:    addl %edi, %edi
; FALLBACK22-NEXT:    movl %ecx, %edx
; FALLBACK22-NEXT:    shlxl %ecx, %edi, %edi
; FALLBACK22-NEXT:    orl %ebp, %edi
; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK22-NEXT:    addl %ecx, %ecx
; FALLBACK22-NEXT:    shlxl %edx, %ecx, %ecx
; FALLBACK22-NEXT:    orl %esi, %ecx
; FALLBACK22-NEXT:    movl {{[0-9]+}}(%esp), %edx
; FALLBACK22-NEXT:    movl %eax, 28(%edx)
; FALLBACK22-NEXT:    movl %ecx, 4(%edx)
; FALLBACK22-NEXT:    movl %edi, 24(%edx)
; FALLBACK22-NEXT:    movl %ebx, 16(%edx)
; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK22-NEXT:    movl %eax, 20(%edx)
; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK22-NEXT:    movl %eax, 8(%edx)
; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK22-NEXT:    movl %eax, 12(%edx)
; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK22-NEXT:    movl %eax, (%edx)
; FALLBACK22-NEXT:    addl $108, %esp
; FALLBACK22-NEXT:    popl %esi
; FALLBACK22-NEXT:    popl %edi
; FALLBACK22-NEXT:    popl %ebx
; FALLBACK22-NEXT:    popl %ebp
; FALLBACK22-NEXT:    retl
;
; FALLBACK23-LABEL: lshr_32bytes:
; FALLBACK23:       # %bb.0:
; FALLBACK23-NEXT:    pushl %ebp
; FALLBACK23-NEXT:    pushl %ebx
; FALLBACK23-NEXT:    pushl %edi
; FALLBACK23-NEXT:    pushl %esi
; FALLBACK23-NEXT:    subl $108, %esp
; FALLBACK23-NEXT:    movl {{[0-9]+}}(%esp), %eax
; FALLBACK23-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; FALLBACK23-NEXT:    movups (%ecx), %xmm0
; FALLBACK23-NEXT:    movups 16(%ecx), %xmm1
; FALLBACK23-NEXT:    movzbl (%eax), %eax
; FALLBACK23-NEXT:    movl %eax, %ecx
; FALLBACK23-NEXT:    shlb $3, %cl
; FALLBACK23-NEXT:    xorps %xmm2, %xmm2
; FALLBACK23-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
; FALLBACK23-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
; FALLBACK23-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
; FALLBACK23-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
; FALLBACK23-NEXT:    andb $28, %al
; FALLBACK23-NEXT:    movzbl %al, %ebx
; FALLBACK23-NEXT:    movl 48(%esp,%ebx), %esi
; FALLBACK23-NEXT:    movl 44(%esp,%ebx), %eax
; FALLBACK23-NEXT:    movl %eax, %edx
; FALLBACK23-NEXT:    shrdl %cl, %esi, %edx
; FALLBACK23-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK23-NEXT:    movl 40(%esp,%ebx), %edx
; FALLBACK23-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK23-NEXT:    shrdl %cl, %eax, %edx
; FALLBACK23-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK23-NEXT:    movl 56(%esp,%ebx), %ebp
; FALLBACK23-NEXT:    movl 52(%esp,%ebx), %eax
; FALLBACK23-NEXT:    movl %eax, %edi
; FALLBACK23-NEXT:    shrdl %cl, %ebp, %edi
; FALLBACK23-NEXT:    shrdl %cl, %eax, %esi
; FALLBACK23-NEXT:    movl 60(%esp,%ebx), %eax
; FALLBACK23-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK23-NEXT:    shrdl %cl, %eax, %ebp
; FALLBACK23-NEXT:    movl 32(%esp,%ebx), %edx
; FALLBACK23-NEXT:    movl 36(%esp,%ebx), %ebx
; FALLBACK23-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK23-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK23-NEXT:    shrdl %cl, %eax, %ebx
; FALLBACK23-NEXT:    movl {{[0-9]+}}(%esp), %eax
; FALLBACK23-NEXT:    movl %ebx, 4(%eax)
; FALLBACK23-NEXT:    movl %ebp, 24(%eax)
; FALLBACK23-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
; FALLBACK23-NEXT:    movl %ebx, 28(%eax)
; FALLBACK23-NEXT:    movl %esi, 16(%eax)
; FALLBACK23-NEXT:    movl %edi, 20(%eax)
; FALLBACK23-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; FALLBACK23-NEXT:    movl %esi, 8(%eax)
; FALLBACK23-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; FALLBACK23-NEXT:    movl %esi, 12(%eax)
; FALLBACK23-NEXT:    # kill: def $cl killed $cl killed $ecx
; FALLBACK23-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; FALLBACK23-NEXT:    shrdl %cl, %esi, %edx
; FALLBACK23-NEXT:    movl %edx, (%eax)
; FALLBACK23-NEXT:    addl $108, %esp
; FALLBACK23-NEXT:    popl %esi
; FALLBACK23-NEXT:    popl %edi
; FALLBACK23-NEXT:    popl %ebx
; FALLBACK23-NEXT:    popl %ebp
; FALLBACK23-NEXT:    retl
;
; FALLBACK24-LABEL: lshr_32bytes:
; FALLBACK24:       # %bb.0:
; FALLBACK24-NEXT:    pushl %ebp
; FALLBACK24-NEXT:    pushl %ebx
; FALLBACK24-NEXT:    pushl %edi
; FALLBACK24-NEXT:    pushl %esi
; FALLBACK24-NEXT:    subl $108, %esp
; FALLBACK24-NEXT:    movl {{[0-9]+}}(%esp), %eax
; FALLBACK24-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; FALLBACK24-NEXT:    vmovups (%ecx), %ymm0
; FALLBACK24-NEXT:    movzbl (%eax), %ecx
; FALLBACK24-NEXT:    movl %ecx, %eax
; FALLBACK24-NEXT:    shlb $3, %al
; FALLBACK24-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; FALLBACK24-NEXT:    vmovups %ymm1, {{[0-9]+}}(%esp)
; FALLBACK24-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
; FALLBACK24-NEXT:    andb $28, %cl
; FALLBACK24-NEXT:    movzbl %cl, %ecx
; FALLBACK24-NEXT:    movl 32(%esp,%ecx), %esi
; FALLBACK24-NEXT:    movl 36(%esp,%ecx), %ebx
; FALLBACK24-NEXT:    movl %ecx, %edi
; FALLBACK24-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK24-NEXT:    movl %eax, %ecx
; FALLBACK24-NEXT:    shrl %cl, %esi
; FALLBACK24-NEXT:    movl %eax, %edx
; FALLBACK24-NEXT:    notb %dl
; FALLBACK24-NEXT:    addl %ebx, %ebx
; FALLBACK24-NEXT:    movl %edx, %ecx
; FALLBACK24-NEXT:    shll %cl, %ebx
; FALLBACK24-NEXT:    orl %esi, %ebx
; FALLBACK24-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK24-NEXT:    movl 44(%esp,%edi), %ebp
; FALLBACK24-NEXT:    movl %ebp, %esi
; FALLBACK24-NEXT:    movl %eax, %ecx
; FALLBACK24-NEXT:    shrl %cl, %esi
; FALLBACK24-NEXT:    movl 48(%esp,%edi), %ecx
; FALLBACK24-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK24-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK24-NEXT:    leal (%ecx,%ecx), %ebx
; FALLBACK24-NEXT:    movl %edx, %ecx
; FALLBACK24-NEXT:    shll %cl, %ebx
; FALLBACK24-NEXT:    orl %esi, %ebx
; FALLBACK24-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK24-NEXT:    movl 40(%esp,%edi), %esi
; FALLBACK24-NEXT:    movl %esi, %ebx
; FALLBACK24-NEXT:    movl %eax, %ecx
; FALLBACK24-NEXT:    shrl %cl, %ebx
; FALLBACK24-NEXT:    addl %ebp, %ebp
; FALLBACK24-NEXT:    movl %edx, %ecx
; FALLBACK24-NEXT:    shll %cl, %ebp
; FALLBACK24-NEXT:    orl %ebx, %ebp
; FALLBACK24-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK24-NEXT:    movl 52(%esp,%edi), %ebp
; FALLBACK24-NEXT:    movl %ebp, %ebx
; FALLBACK24-NEXT:    movl %eax, %ecx
; FALLBACK24-NEXT:    shrl %cl, %ebx
; FALLBACK24-NEXT:    movl 56(%esp,%edi), %ecx
; FALLBACK24-NEXT:    movl %ecx, (%esp) # 4-byte Spill
; FALLBACK24-NEXT:    leal (%ecx,%ecx), %edi
; FALLBACK24-NEXT:    movl %edx, %ecx
; FALLBACK24-NEXT:    shll %cl, %edi
; FALLBACK24-NEXT:    orl %ebx, %edi
; FALLBACK24-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK24-NEXT:    movl %eax, %ecx
; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
; FALLBACK24-NEXT:    shrl %cl, %edi
; FALLBACK24-NEXT:    addl %ebp, %ebp
; FALLBACK24-NEXT:    movl %edx, %ecx
; FALLBACK24-NEXT:    shll %cl, %ebp
; FALLBACK24-NEXT:    orl %edi, %ebp
; FALLBACK24-NEXT:    movl %eax, %ecx
; FALLBACK24-NEXT:    shrl %cl, (%esp) # 4-byte Folded Spill
; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK24-NEXT:    movl 60(%esp,%ecx), %ebx
; FALLBACK24-NEXT:    leal (%ebx,%ebx), %edi
; FALLBACK24-NEXT:    movl %edx, %ecx
; FALLBACK24-NEXT:    shll %cl, %edi
; FALLBACK24-NEXT:    orl (%esp), %edi # 4-byte Folded Reload
; FALLBACK24-NEXT:    movl %eax, %ecx
; FALLBACK24-NEXT:    shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; FALLBACK24-NEXT:    addl %esi, %esi
; FALLBACK24-NEXT:    movl %edx, %ecx
; FALLBACK24-NEXT:    shll %cl, %esi
; FALLBACK24-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
; FALLBACK24-NEXT:    movl %eax, %ecx
; FALLBACK24-NEXT:    shrl %cl, %ebx
; FALLBACK24-NEXT:    movl {{[0-9]+}}(%esp), %eax
; FALLBACK24-NEXT:    movl %ebx, 28(%eax)
; FALLBACK24-NEXT:    movl %esi, 4(%eax)
; FALLBACK24-NEXT:    movl %edi, 24(%eax)
; FALLBACK24-NEXT:    movl %ebp, 16(%eax)
; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK24-NEXT:    movl %ecx, 20(%eax)
; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK24-NEXT:    movl %ecx, 8(%eax)
; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK24-NEXT:    movl %ecx, 12(%eax)
; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK24-NEXT:    movl %ecx, (%eax)
; FALLBACK24-NEXT:    addl $108, %esp
; FALLBACK24-NEXT:    popl %esi
; FALLBACK24-NEXT:    popl %edi
; FALLBACK24-NEXT:    popl %ebx
; FALLBACK24-NEXT:    popl %ebp
; FALLBACK24-NEXT:    vzeroupper
; FALLBACK24-NEXT:    retl
;
; FALLBACK25-LABEL: lshr_32bytes:
; FALLBACK25:       # %bb.0:
; FALLBACK25-NEXT:    pushl %ebp
; FALLBACK25-NEXT:    pushl %ebx
; FALLBACK25-NEXT:    pushl %edi
; FALLBACK25-NEXT:    pushl %esi
; FALLBACK25-NEXT:    subl $108, %esp
; FALLBACK25-NEXT:    movl {{[0-9]+}}(%esp), %eax
; FALLBACK25-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; FALLBACK25-NEXT:    vmovups (%ecx), %ymm0
; FALLBACK25-NEXT:    movzbl (%eax), %eax
; FALLBACK25-NEXT:    movl %eax, %ecx
; FALLBACK25-NEXT:    shlb $3, %cl
; FALLBACK25-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; FALLBACK25-NEXT:    vmovups %ymm1, {{[0-9]+}}(%esp)
; FALLBACK25-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
; FALLBACK25-NEXT:    andb $28, %al
; FALLBACK25-NEXT:    movzbl %al, %ebp
; FALLBACK25-NEXT:    movl 48(%esp,%ebp), %esi
; FALLBACK25-NEXT:    movl 44(%esp,%ebp), %eax
; FALLBACK25-NEXT:    movl %eax, %edx
; FALLBACK25-NEXT:    shrdl %cl, %esi, %edx
; FALLBACK25-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK25-NEXT:    movl 40(%esp,%ebp), %edx
; FALLBACK25-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK25-NEXT:    shrdl %cl, %eax, %edx
; FALLBACK25-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK25-NEXT:    movl 56(%esp,%ebp), %ebx
; FALLBACK25-NEXT:    movl 52(%esp,%ebp), %eax
; FALLBACK25-NEXT:    movl %eax, %edx
; FALLBACK25-NEXT:    shrdl %cl, %ebx, %edx
; FALLBACK25-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK25-NEXT:    shrdl %cl, %eax, %esi
; FALLBACK25-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK25-NEXT:    movl 60(%esp,%ebp), %eax
; FALLBACK25-NEXT:    shrdl %cl, %eax, %ebx
; FALLBACK25-NEXT:    movl 32(%esp,%ebp), %edx
; FALLBACK25-NEXT:    movl 36(%esp,%ebp), %edi
; FALLBACK25-NEXT:    movl %edi, %esi
; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
; FALLBACK25-NEXT:    shrdl %cl, %ebp, %esi
; FALLBACK25-NEXT:    movl {{[0-9]+}}(%esp), %ebp
; FALLBACK25-NEXT:    movl %esi, 4(%ebp)
; FALLBACK25-NEXT:    movl %ebx, 24(%ebp)
; FALLBACK25-NEXT:    shrdl %cl, %edi, %edx
; FALLBACK25-NEXT:    shrl %cl, %eax
; FALLBACK25-NEXT:    movl %eax, 28(%ebp)
; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK25-NEXT:    movl %eax, 16(%ebp)
; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK25-NEXT:    movl %eax, 20(%ebp)
; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK25-NEXT:    movl %eax, 8(%ebp)
; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK25-NEXT:    movl %eax, 12(%ebp)
; FALLBACK25-NEXT:    movl %edx, (%ebp)
; FALLBACK25-NEXT:    addl $108, %esp
; FALLBACK25-NEXT:    popl %esi
; FALLBACK25-NEXT:    popl %edi
; FALLBACK25-NEXT:    popl %ebx
; FALLBACK25-NEXT:    popl %ebp
; FALLBACK25-NEXT:    vzeroupper
; FALLBACK25-NEXT:    retl
;
; FALLBACK26-LABEL: lshr_32bytes:
; FALLBACK26:       # %bb.0:
; FALLBACK26-NEXT:    pushl %ebp
; FALLBACK26-NEXT:    pushl %ebx
; FALLBACK26-NEXT:    pushl %edi
; FALLBACK26-NEXT:    pushl %esi
; FALLBACK26-NEXT:    subl $108, %esp
; FALLBACK26-NEXT:    movl {{[0-9]+}}(%esp), %eax
; FALLBACK26-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; FALLBACK26-NEXT:    vmovups (%ecx), %ymm0
; FALLBACK26-NEXT:    movzbl (%eax), %ecx
; FALLBACK26-NEXT:    movl %ecx, %edx
; FALLBACK26-NEXT:    shlb $3, %dl
; FALLBACK26-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; FALLBACK26-NEXT:    vmovups %ymm1, {{[0-9]+}}(%esp)
; FALLBACK26-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
; FALLBACK26-NEXT:    andb $28, %cl
; FALLBACK26-NEXT:    movzbl %cl, %edi
; FALLBACK26-NEXT:    shrxl %edx, 32(%esp,%edi), %ecx
; FALLBACK26-NEXT:    movl %edx, %eax
; FALLBACK26-NEXT:    notb %al
; FALLBACK26-NEXT:    movl 36(%esp,%edi), %esi
; FALLBACK26-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK26-NEXT:    addl %esi, %esi
; FALLBACK26-NEXT:    shlxl %eax, %esi, %esi
; FALLBACK26-NEXT:    orl %ecx, %esi
; FALLBACK26-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK26-NEXT:    movl 48(%esp,%edi), %ecx
; FALLBACK26-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK26-NEXT:    addl %ecx, %ecx
; FALLBACK26-NEXT:    shlxl %eax, %ecx, %esi
; FALLBACK26-NEXT:    movl %eax, %ebp
; FALLBACK26-NEXT:    movl 44(%esp,%edi), %ecx
; FALLBACK26-NEXT:    shrxl %edx, %ecx, %ebx
; FALLBACK26-NEXT:    orl %ebx, %esi
; FALLBACK26-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK26-NEXT:    addl %ecx, %ecx
; FALLBACK26-NEXT:    shlxl %eax, %ecx, %esi
; FALLBACK26-NEXT:    movl 40(%esp,%edi), %eax
; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK26-NEXT:    shrxl %edx, %eax, %ebx
; FALLBACK26-NEXT:    orl %ebx, %esi
; FALLBACK26-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK26-NEXT:    movl 56(%esp,%edi), %esi
; FALLBACK26-NEXT:    leal (%esi,%esi), %ebx
; FALLBACK26-NEXT:    shlxl %ebp, %ebx, %eax
; FALLBACK26-NEXT:    movl %ebp, %ecx
; FALLBACK26-NEXT:    movl 52(%esp,%edi), %ebx
; FALLBACK26-NEXT:    shrxl %edx, %ebx, %ebp
; FALLBACK26-NEXT:    orl %ebp, %eax
; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK26-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
; FALLBACK26-NEXT:    addl %ebx, %ebx
; FALLBACK26-NEXT:    shlxl %ecx, %ebx, %ebx
; FALLBACK26-NEXT:    orl %ebp, %ebx
; FALLBACK26-NEXT:    shrxl %edx, %esi, %ebp
; FALLBACK26-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
; FALLBACK26-NEXT:    movl 60(%esp,%edi), %edi
; FALLBACK26-NEXT:    shrxl %edx, %edi, %eax
; FALLBACK26-NEXT:    addl %edi, %edi
; FALLBACK26-NEXT:    movl %ecx, %edx
; FALLBACK26-NEXT:    shlxl %ecx, %edi, %edi
; FALLBACK26-NEXT:    orl %ebp, %edi
; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK26-NEXT:    addl %ecx, %ecx
; FALLBACK26-NEXT:    shlxl %edx, %ecx, %ecx
; FALLBACK26-NEXT:    orl %esi, %ecx
; FALLBACK26-NEXT:    movl {{[0-9]+}}(%esp), %edx
; FALLBACK26-NEXT:    movl %eax, 28(%edx)
; FALLBACK26-NEXT:    movl %ecx, 4(%edx)
; FALLBACK26-NEXT:    movl %edi, 24(%edx)
; FALLBACK26-NEXT:    movl %ebx, 16(%edx)
; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK26-NEXT:    movl %eax, 20(%edx)
; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK26-NEXT:    movl %eax, 8(%edx)
; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK26-NEXT:    movl %eax, 12(%edx)
; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK26-NEXT:    movl %eax, (%edx)
; FALLBACK26-NEXT:    addl $108, %esp
; FALLBACK26-NEXT:    popl %esi
; FALLBACK26-NEXT:    popl %edi
; FALLBACK26-NEXT:    popl %ebx
; FALLBACK26-NEXT:    popl %ebp
; FALLBACK26-NEXT:    vzeroupper
; FALLBACK26-NEXT:    retl
;
; FALLBACK27-LABEL: lshr_32bytes:
; FALLBACK27:       # %bb.0:
; FALLBACK27-NEXT:    pushl %ebp
; FALLBACK27-NEXT:    pushl %ebx
; FALLBACK27-NEXT:    pushl %edi
; FALLBACK27-NEXT:    pushl %esi
; FALLBACK27-NEXT:    subl $108, %esp
; FALLBACK27-NEXT:    movl {{[0-9]+}}(%esp), %eax
; FALLBACK27-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; FALLBACK27-NEXT:    vmovups (%ecx), %ymm0
; FALLBACK27-NEXT:    movzbl (%eax), %eax
; FALLBACK27-NEXT:    movl %eax, %ecx
; FALLBACK27-NEXT:    shlb $3, %cl
; FALLBACK27-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; FALLBACK27-NEXT:    vmovups %ymm1, {{[0-9]+}}(%esp)
; FALLBACK27-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
; FALLBACK27-NEXT:    andb $28, %al
; FALLBACK27-NEXT:    movzbl %al, %ebx
; FALLBACK27-NEXT:    movl 48(%esp,%ebx), %esi
; FALLBACK27-NEXT:    movl 44(%esp,%ebx), %eax
; FALLBACK27-NEXT:    movl %eax, %edx
; FALLBACK27-NEXT:    shrdl %cl, %esi, %edx
; FALLBACK27-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK27-NEXT:    movl 40(%esp,%ebx), %edx
; FALLBACK27-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK27-NEXT:    shrdl %cl, %eax, %edx
; FALLBACK27-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK27-NEXT:    movl 56(%esp,%ebx), %ebp
; FALLBACK27-NEXT:    movl 52(%esp,%ebx), %eax
; FALLBACK27-NEXT:    movl %eax, %edi
; FALLBACK27-NEXT:    shrdl %cl, %ebp, %edi
; FALLBACK27-NEXT:    shrdl %cl, %eax, %esi
; FALLBACK27-NEXT:    movl 60(%esp,%ebx), %eax
; FALLBACK27-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK27-NEXT:    shrdl %cl, %eax, %ebp
; FALLBACK27-NEXT:    movl 32(%esp,%ebx), %edx
; FALLBACK27-NEXT:    movl 36(%esp,%ebx), %ebx
; FALLBACK27-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK27-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK27-NEXT:    shrdl %cl, %eax, %ebx
; FALLBACK27-NEXT:    movl {{[0-9]+}}(%esp), %eax
; FALLBACK27-NEXT:    movl %ebx, 4(%eax)
; FALLBACK27-NEXT:    movl %ebp, 24(%eax)
; FALLBACK27-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
; FALLBACK27-NEXT:    movl %ebx, 28(%eax)
; FALLBACK27-NEXT:    movl %esi, 16(%eax)
; FALLBACK27-NEXT:    movl %edi, 20(%eax)
; FALLBACK27-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; FALLBACK27-NEXT:    movl %esi, 8(%eax)
; FALLBACK27-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; FALLBACK27-NEXT:    movl %esi, 12(%eax)
; FALLBACK27-NEXT:    # kill: def $cl killed $cl killed $ecx
; FALLBACK27-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; FALLBACK27-NEXT:    shrdl %cl, %esi, %edx
; FALLBACK27-NEXT:    movl %edx, (%eax)
; FALLBACK27-NEXT:    addl $108, %esp
; FALLBACK27-NEXT:    popl %esi
; FALLBACK27-NEXT:    popl %edi
; FALLBACK27-NEXT:    popl %ebx
; FALLBACK27-NEXT:    popl %ebp
; FALLBACK27-NEXT:    vzeroupper
; FALLBACK27-NEXT:    retl
;
; FALLBACK28-LABEL: lshr_32bytes:
; FALLBACK28:       # %bb.0:
; FALLBACK28-NEXT:    pushl %ebp
; FALLBACK28-NEXT:    pushl %ebx
; FALLBACK28-NEXT:    pushl %edi
; FALLBACK28-NEXT:    pushl %esi
; FALLBACK28-NEXT:    subl $108, %esp
; FALLBACK28-NEXT:    movl {{[0-9]+}}(%esp), %eax
; FALLBACK28-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; FALLBACK28-NEXT:    vmovups (%ecx), %ymm0
; FALLBACK28-NEXT:    movzbl (%eax), %ecx
; FALLBACK28-NEXT:    movl %ecx, %eax
; FALLBACK28-NEXT:    shlb $3, %al
; FALLBACK28-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; FALLBACK28-NEXT:    vmovups %ymm1, {{[0-9]+}}(%esp)
; FALLBACK28-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
; FALLBACK28-NEXT:    andb $28, %cl
; FALLBACK28-NEXT:    movzbl %cl, %ecx
; FALLBACK28-NEXT:    movl 32(%esp,%ecx), %esi
; FALLBACK28-NEXT:    movl 36(%esp,%ecx), %ebx
; FALLBACK28-NEXT:    movl %ecx, %edi
; FALLBACK28-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK28-NEXT:    movl %eax, %ecx
; FALLBACK28-NEXT:    shrl %cl, %esi
; FALLBACK28-NEXT:    movl %eax, %edx
; FALLBACK28-NEXT:    notb %dl
; FALLBACK28-NEXT:    addl %ebx, %ebx
; FALLBACK28-NEXT:    movl %edx, %ecx
; FALLBACK28-NEXT:    shll %cl, %ebx
; FALLBACK28-NEXT:    orl %esi, %ebx
; FALLBACK28-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK28-NEXT:    movl 44(%esp,%edi), %ebp
; FALLBACK28-NEXT:    movl %ebp, %esi
; FALLBACK28-NEXT:    movl %eax, %ecx
; FALLBACK28-NEXT:    shrl %cl, %esi
; FALLBACK28-NEXT:    movl 48(%esp,%edi), %ecx
; FALLBACK28-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK28-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK28-NEXT:    leal (%ecx,%ecx), %ebx
; FALLBACK28-NEXT:    movl %edx, %ecx
; FALLBACK28-NEXT:    shll %cl, %ebx
; FALLBACK28-NEXT:    orl %esi, %ebx
; FALLBACK28-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK28-NEXT:    movl 40(%esp,%edi), %esi
; FALLBACK28-NEXT:    movl %esi, %ebx
; FALLBACK28-NEXT:    movl %eax, %ecx
; FALLBACK28-NEXT:    shrl %cl, %ebx
; FALLBACK28-NEXT:    addl %ebp, %ebp
; FALLBACK28-NEXT:    movl %edx, %ecx
; FALLBACK28-NEXT:    shll %cl, %ebp
; FALLBACK28-NEXT:    orl %ebx, %ebp
; FALLBACK28-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK28-NEXT:    movl 52(%esp,%edi), %ebp
; FALLBACK28-NEXT:    movl %ebp, %ebx
; FALLBACK28-NEXT:    movl %eax, %ecx
; FALLBACK28-NEXT:    shrl %cl, %ebx
; FALLBACK28-NEXT:    movl 56(%esp,%edi), %ecx
; FALLBACK28-NEXT:    movl %ecx, (%esp) # 4-byte Spill
; FALLBACK28-NEXT:    leal (%ecx,%ecx), %edi
; FALLBACK28-NEXT:    movl %edx, %ecx
; FALLBACK28-NEXT:    shll %cl, %edi
; FALLBACK28-NEXT:    orl %ebx, %edi
; FALLBACK28-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK28-NEXT:    movl %eax, %ecx
; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
; FALLBACK28-NEXT:    shrl %cl, %edi
; FALLBACK28-NEXT:    addl %ebp, %ebp
; FALLBACK28-NEXT:    movl %edx, %ecx
; FALLBACK28-NEXT:    shll %cl, %ebp
; FALLBACK28-NEXT:    orl %edi, %ebp
; FALLBACK28-NEXT:    movl %eax, %ecx
; FALLBACK28-NEXT:    shrl %cl, (%esp) # 4-byte Folded Spill
; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK28-NEXT:    movl 60(%esp,%ecx), %ebx
; FALLBACK28-NEXT:    leal (%ebx,%ebx), %edi
; FALLBACK28-NEXT:    movl %edx, %ecx
; FALLBACK28-NEXT:    shll %cl, %edi
; FALLBACK28-NEXT:    orl (%esp), %edi # 4-byte Folded Reload
; FALLBACK28-NEXT:    movl %eax, %ecx
; FALLBACK28-NEXT:    shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; FALLBACK28-NEXT:    addl %esi, %esi
; FALLBACK28-NEXT:    movl %edx, %ecx
; FALLBACK28-NEXT:    shll %cl, %esi
; FALLBACK28-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
; FALLBACK28-NEXT:    movl %eax, %ecx
; FALLBACK28-NEXT:    shrl %cl, %ebx
; FALLBACK28-NEXT:    movl {{[0-9]+}}(%esp), %eax
; FALLBACK28-NEXT:    movl %ebx, 28(%eax)
; FALLBACK28-NEXT:    movl %esi, 4(%eax)
; FALLBACK28-NEXT:    movl %edi, 24(%eax)
; FALLBACK28-NEXT:    movl %ebp, 16(%eax)
; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK28-NEXT:    movl %ecx, 20(%eax)
; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK28-NEXT:    movl %ecx, 8(%eax)
; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK28-NEXT:    movl %ecx, 12(%eax)
; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK28-NEXT:    movl %ecx, (%eax)
; FALLBACK28-NEXT:    addl $108, %esp
; FALLBACK28-NEXT:    popl %esi
; FALLBACK28-NEXT:    popl %edi
; FALLBACK28-NEXT:    popl %ebx
; FALLBACK28-NEXT:    popl %ebp
; FALLBACK28-NEXT:    vzeroupper
; FALLBACK28-NEXT:    retl
;
; FALLBACK29-LABEL: lshr_32bytes:
; FALLBACK29:       # %bb.0:
; FALLBACK29-NEXT:    pushl %ebp
; FALLBACK29-NEXT:    pushl %ebx
; FALLBACK29-NEXT:    pushl %edi
; FALLBACK29-NEXT:    pushl %esi
; FALLBACK29-NEXT:    subl $108, %esp
; FALLBACK29-NEXT:    movl {{[0-9]+}}(%esp), %eax
; FALLBACK29-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; FALLBACK29-NEXT:    vmovups (%ecx), %ymm0
; FALLBACK29-NEXT:    movzbl (%eax), %eax
; FALLBACK29-NEXT:    movl %eax, %ecx
; FALLBACK29-NEXT:    shlb $3, %cl
; FALLBACK29-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; FALLBACK29-NEXT:    vmovups %ymm1, {{[0-9]+}}(%esp)
; FALLBACK29-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
; FALLBACK29-NEXT:    andb $28, %al
; FALLBACK29-NEXT:    movzbl %al, %ebp
; FALLBACK29-NEXT:    movl 48(%esp,%ebp), %esi
; FALLBACK29-NEXT:    movl 44(%esp,%ebp), %eax
; FALLBACK29-NEXT:    movl %eax, %edx
; FALLBACK29-NEXT:    shrdl %cl, %esi, %edx
; FALLBACK29-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK29-NEXT:    movl 40(%esp,%ebp), %edx
; FALLBACK29-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK29-NEXT:    shrdl %cl, %eax, %edx
; FALLBACK29-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK29-NEXT:    movl 56(%esp,%ebp), %ebx
; FALLBACK29-NEXT:    movl 52(%esp,%ebp), %eax
; FALLBACK29-NEXT:    movl %eax, %edx
; FALLBACK29-NEXT:    shrdl %cl, %ebx, %edx
; FALLBACK29-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK29-NEXT:    shrdl %cl, %eax, %esi
; FALLBACK29-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK29-NEXT:    movl 60(%esp,%ebp), %eax
; FALLBACK29-NEXT:    shrdl %cl, %eax, %ebx
; FALLBACK29-NEXT:    movl 32(%esp,%ebp), %edx
; FALLBACK29-NEXT:    movl 36(%esp,%ebp), %edi
; FALLBACK29-NEXT:    movl %edi, %esi
; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
; FALLBACK29-NEXT:    shrdl %cl, %ebp, %esi
; FALLBACK29-NEXT:    movl {{[0-9]+}}(%esp), %ebp
; FALLBACK29-NEXT:    movl %esi, 4(%ebp)
; FALLBACK29-NEXT:    movl %ebx, 24(%ebp)
; FALLBACK29-NEXT:    shrdl %cl, %edi, %edx
; FALLBACK29-NEXT:    shrl %cl, %eax
; FALLBACK29-NEXT:    movl %eax, 28(%ebp)
; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK29-NEXT:    movl %eax, 16(%ebp)
; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK29-NEXT:    movl %eax, 20(%ebp)
; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK29-NEXT:    movl %eax, 8(%ebp)
; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK29-NEXT:    movl %eax, 12(%ebp)
; FALLBACK29-NEXT:    movl %edx, (%ebp)
; FALLBACK29-NEXT:    addl $108, %esp
; FALLBACK29-NEXT:    popl %esi
; FALLBACK29-NEXT:    popl %edi
; FALLBACK29-NEXT:    popl %ebx
; FALLBACK29-NEXT:    popl %ebp
; FALLBACK29-NEXT:    vzeroupper
; FALLBACK29-NEXT:    retl
;
; FALLBACK30-LABEL: lshr_32bytes:
; FALLBACK30:       # %bb.0:
; FALLBACK30-NEXT:    pushl %ebp
; FALLBACK30-NEXT:    pushl %ebx
; FALLBACK30-NEXT:    pushl %edi
; FALLBACK30-NEXT:    pushl %esi
; FALLBACK30-NEXT:    subl $108, %esp
; FALLBACK30-NEXT:    movl {{[0-9]+}}(%esp), %eax
; FALLBACK30-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; FALLBACK30-NEXT:    vmovups (%ecx), %ymm0
; FALLBACK30-NEXT:    movzbl (%eax), %ecx
; FALLBACK30-NEXT:    movl %ecx, %edx
; FALLBACK30-NEXT:    shlb $3, %dl
; FALLBACK30-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; FALLBACK30-NEXT:    vmovups %ymm1, {{[0-9]+}}(%esp)
; FALLBACK30-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
; FALLBACK30-NEXT:    andb $28, %cl
; FALLBACK30-NEXT:    movzbl %cl, %edi
; FALLBACK30-NEXT:    shrxl %edx, 32(%esp,%edi), %ecx
; FALLBACK30-NEXT:    movl %edx, %eax
; FALLBACK30-NEXT:    notb %al
; FALLBACK30-NEXT:    movl 36(%esp,%edi), %esi
; FALLBACK30-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK30-NEXT:    addl %esi, %esi
; FALLBACK30-NEXT:    shlxl %eax, %esi, %esi
; FALLBACK30-NEXT:    orl %ecx, %esi
; FALLBACK30-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK30-NEXT:    movl 48(%esp,%edi), %ecx
; FALLBACK30-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK30-NEXT:    addl %ecx, %ecx
; FALLBACK30-NEXT:    shlxl %eax, %ecx, %esi
; FALLBACK30-NEXT:    movl %eax, %ebp
; FALLBACK30-NEXT:    movl 44(%esp,%edi), %ecx
; FALLBACK30-NEXT:    shrxl %edx, %ecx, %ebx
; FALLBACK30-NEXT:    orl %ebx, %esi
; FALLBACK30-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK30-NEXT:    addl %ecx, %ecx
; FALLBACK30-NEXT:    shlxl %eax, %ecx, %esi
; FALLBACK30-NEXT:    movl 40(%esp,%edi), %eax
; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK30-NEXT:    shrxl %edx, %eax, %ebx
; FALLBACK30-NEXT:    orl %ebx, %esi
; FALLBACK30-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK30-NEXT:    movl 56(%esp,%edi), %esi
; FALLBACK30-NEXT:    leal (%esi,%esi), %ebx
; FALLBACK30-NEXT:    shlxl %ebp, %ebx, %eax
; FALLBACK30-NEXT:    movl %ebp, %ecx
; FALLBACK30-NEXT:    movl 52(%esp,%edi), %ebx
; FALLBACK30-NEXT:    shrxl %edx, %ebx, %ebp
; FALLBACK30-NEXT:    orl %ebp, %eax
; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK30-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
; FALLBACK30-NEXT:    addl %ebx, %ebx
; FALLBACK30-NEXT:    shlxl %ecx, %ebx, %ebx
; FALLBACK30-NEXT:    orl %ebp, %ebx
; FALLBACK30-NEXT:    shrxl %edx, %esi, %ebp
; FALLBACK30-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
; FALLBACK30-NEXT:    movl 60(%esp,%edi), %edi
; FALLBACK30-NEXT:    shrxl %edx, %edi, %eax
; FALLBACK30-NEXT:    addl %edi, %edi
; FALLBACK30-NEXT:    movl %ecx, %edx
; FALLBACK30-NEXT:    shlxl %ecx, %edi, %edi
; FALLBACK30-NEXT:    orl %ebp, %edi
; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK30-NEXT:    addl %ecx, %ecx
; FALLBACK30-NEXT:    shlxl %edx, %ecx, %ecx
; FALLBACK30-NEXT:    orl %esi, %ecx
; FALLBACK30-NEXT:    movl {{[0-9]+}}(%esp), %edx
; FALLBACK30-NEXT:    movl %eax, 28(%edx)
; FALLBACK30-NEXT:    movl %ecx, 4(%edx)
; FALLBACK30-NEXT:    movl %edi, 24(%edx)
; FALLBACK30-NEXT:    movl %ebx, 16(%edx)
; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK30-NEXT:    movl %eax, 20(%edx)
; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK30-NEXT:    movl %eax, 8(%edx)
; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK30-NEXT:    movl %eax, 12(%edx)
; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK30-NEXT:    movl %eax, (%edx)
; FALLBACK30-NEXT:    addl $108, %esp
; FALLBACK30-NEXT:    popl %esi
; FALLBACK30-NEXT:    popl %edi
; FALLBACK30-NEXT:    popl %ebx
; FALLBACK30-NEXT:    popl %ebp
; FALLBACK30-NEXT:    vzeroupper
; FALLBACK30-NEXT:    retl
;
; FALLBACK31-LABEL: lshr_32bytes:
; FALLBACK31:       # %bb.0:
; FALLBACK31-NEXT:    pushl %ebp
; FALLBACK31-NEXT:    pushl %ebx
; FALLBACK31-NEXT:    pushl %edi
; FALLBACK31-NEXT:    pushl %esi
; FALLBACK31-NEXT:    subl $108, %esp
; FALLBACK31-NEXT:    movl {{[0-9]+}}(%esp), %eax
; FALLBACK31-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; FALLBACK31-NEXT:    vmovups (%ecx), %ymm0
; FALLBACK31-NEXT:    movzbl (%eax), %eax
; FALLBACK31-NEXT:    movl %eax, %ecx
; FALLBACK31-NEXT:    shlb $3, %cl
; FALLBACK31-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; FALLBACK31-NEXT:    vmovups %ymm1, {{[0-9]+}}(%esp)
; FALLBACK31-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
; FALLBACK31-NEXT:    andb $28, %al
; FALLBACK31-NEXT:    movzbl %al, %ebx
; FALLBACK31-NEXT:    movl 48(%esp,%ebx), %esi
; FALLBACK31-NEXT:    movl 44(%esp,%ebx), %eax
; FALLBACK31-NEXT:    movl %eax, %edx
; FALLBACK31-NEXT:    shrdl %cl, %esi, %edx
; FALLBACK31-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK31-NEXT:    movl 40(%esp,%ebx), %edx
; FALLBACK31-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK31-NEXT:    shrdl %cl, %eax, %edx
; FALLBACK31-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK31-NEXT:    movl 56(%esp,%ebx), %ebp
; FALLBACK31-NEXT:    movl 52(%esp,%ebx), %eax
; FALLBACK31-NEXT:    movl %eax, %edi
; FALLBACK31-NEXT:    shrdl %cl, %ebp, %edi
; FALLBACK31-NEXT:    shrdl %cl, %eax, %esi
; FALLBACK31-NEXT:    movl 60(%esp,%ebx), %eax
; FALLBACK31-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK31-NEXT:    shrdl %cl, %eax, %ebp
; FALLBACK31-NEXT:    movl 32(%esp,%ebx), %edx
; FALLBACK31-NEXT:    movl 36(%esp,%ebx), %ebx
; FALLBACK31-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK31-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK31-NEXT:    shrdl %cl, %eax, %ebx
; FALLBACK31-NEXT:    movl {{[0-9]+}}(%esp), %eax
; FALLBACK31-NEXT:    movl %ebx, 4(%eax)
; FALLBACK31-NEXT:    movl %ebp, 24(%eax)
; FALLBACK31-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
; FALLBACK31-NEXT:    movl %ebx, 28(%eax)
; FALLBACK31-NEXT:    movl %esi, 16(%eax)
; FALLBACK31-NEXT:    movl %edi, 20(%eax)
; FALLBACK31-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; FALLBACK31-NEXT:    movl %esi, 8(%eax)
; FALLBACK31-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; FALLBACK31-NEXT:    movl %esi, 12(%eax)
; FALLBACK31-NEXT:    # kill: def $cl killed $cl killed $ecx
; FALLBACK31-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; FALLBACK31-NEXT:    shrdl %cl, %esi, %edx
; FALLBACK31-NEXT:    movl %edx, (%eax)
; FALLBACK31-NEXT:    addl $108, %esp
; FALLBACK31-NEXT:    popl %esi
; FALLBACK31-NEXT:    popl %edi
; FALLBACK31-NEXT:    popl %ebx
; FALLBACK31-NEXT:    popl %ebp
; FALLBACK31-NEXT:    vzeroupper
; FALLBACK31-NEXT:    retl
  %src = load i256, ptr %src.ptr, align 1
  %byteOff = load i256, ptr %byteOff.ptr, align 1
  %bitOff = shl i256 %byteOff, 3
  %res = lshr i256 %src, %bitOff
  store i256 %res, ptr %dst, align 1
  ret void
}

define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nounwind {
; FALLBACK0-LABEL: lshr_32bytes_dwordOff:
; FALLBACK0:       # %bb.0:
; FALLBACK0-NEXT:    pushq %rbx
; FALLBACK0-NEXT:    movq (%rdi), %rcx
; FALLBACK0-NEXT:    movq 8(%rdi), %r8
; FALLBACK0-NEXT:    movq 16(%rdi), %r9
; FALLBACK0-NEXT:    movq 24(%rdi), %rdi
; FALLBACK0-NEXT:    movzbl (%rsi), %esi
; FALLBACK0-NEXT:    movl %esi, %eax
; FALLBACK0-NEXT:    shlb $5, %al
; FALLBACK0-NEXT:    xorps %xmm0, %xmm0
; FALLBACK0-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; FALLBACK0-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; FALLBACK0-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK0-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
; FALLBACK0-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
; FALLBACK0-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
; FALLBACK0-NEXT:    andb $6, %sil
; FALLBACK0-NEXT:    movzbl %sil, %r9d
; FALLBACK0-NEXT:    movq -64(%rsp,%r9,4), %r10
; FALLBACK0-NEXT:    movq -56(%rsp,%r9,4), %rdi
; FALLBACK0-NEXT:    movq %rdi, %r11
; FALLBACK0-NEXT:    movl %eax, %ecx
; FALLBACK0-NEXT:    shrq %cl, %r11
; FALLBACK0-NEXT:    movl %eax, %esi
; FALLBACK0-NEXT:    notb %sil
; FALLBACK0-NEXT:    movq -48(%rsp,%r9,4), %rbx
; FALLBACK0-NEXT:    leaq (%rbx,%rbx), %r8
; FALLBACK0-NEXT:    movl %esi, %ecx
; FALLBACK0-NEXT:    shlq %cl, %r8
; FALLBACK0-NEXT:    orq %r11, %r8
; FALLBACK0-NEXT:    movl %eax, %ecx
; FALLBACK0-NEXT:    shrq %cl, %r10
; FALLBACK0-NEXT:    addq %rdi, %rdi
; FALLBACK0-NEXT:    movl %esi, %ecx
; FALLBACK0-NEXT:    shlq %cl, %rdi
; FALLBACK0-NEXT:    orq %r10, %rdi
; FALLBACK0-NEXT:    movl %eax, %ecx
; FALLBACK0-NEXT:    shrq %cl, %rbx
; FALLBACK0-NEXT:    movq -40(%rsp,%r9,4), %r9
; FALLBACK0-NEXT:    leaq (%r9,%r9), %r10
; FALLBACK0-NEXT:    movl %esi, %ecx
; FALLBACK0-NEXT:    shlq %cl, %r10
; FALLBACK0-NEXT:    orq %rbx, %r10
; FALLBACK0-NEXT:    movl %eax, %ecx
; FALLBACK0-NEXT:    shrq %cl, %r9
; FALLBACK0-NEXT:    movq %r9, 24(%rdx)
; FALLBACK0-NEXT:    movq %r10, 16(%rdx)
; FALLBACK0-NEXT:    movq %rdi, (%rdx)
; FALLBACK0-NEXT:    movq %r8, 8(%rdx)
; FALLBACK0-NEXT:    popq %rbx
; FALLBACK0-NEXT:    retq
;
; FALLBACK1-LABEL: lshr_32bytes_dwordOff:
; FALLBACK1:       # %bb.0:
; FALLBACK1-NEXT:    movq (%rdi), %rax
; FALLBACK1-NEXT:    movq 8(%rdi), %r8
; FALLBACK1-NEXT:    movq 16(%rdi), %r9
; FALLBACK1-NEXT:    movq 24(%rdi), %rdi
; FALLBACK1-NEXT:    movzbl (%rsi), %esi
; FALLBACK1-NEXT:    movl %esi, %ecx
; FALLBACK1-NEXT:    shlb $5, %cl
; FALLBACK1-NEXT:    xorps %xmm0, %xmm0
; FALLBACK1-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; FALLBACK1-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; FALLBACK1-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK1-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
; FALLBACK1-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
; FALLBACK1-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
; FALLBACK1-NEXT:    andb $6, %sil
; FALLBACK1-NEXT:    movzbl %sil, %eax
; FALLBACK1-NEXT:    movq -56(%rsp,%rax,4), %rsi
; FALLBACK1-NEXT:    movq -72(%rsp,%rax,4), %rdi
; FALLBACK1-NEXT:    movq -64(%rsp,%rax,4), %r8
; FALLBACK1-NEXT:    movq %r8, %r9
; FALLBACK1-NEXT:    shrdq %cl, %rsi, %r9
; FALLBACK1-NEXT:    movq -48(%rsp,%rax,4), %rax
; FALLBACK1-NEXT:    shrdq %cl, %rax, %rsi
; FALLBACK1-NEXT:    shrdq %cl, %r8, %rdi
; FALLBACK1-NEXT:    shrq %cl, %rax
; FALLBACK1-NEXT:    movq %rsi, 16(%rdx)
; FALLBACK1-NEXT:    movq %rax, 24(%rdx)
; FALLBACK1-NEXT:    movq %rdi, (%rdx)
; FALLBACK1-NEXT:    movq %r9, 8(%rdx)
; FALLBACK1-NEXT:    retq
;
; FALLBACK2-LABEL: lshr_32bytes_dwordOff:
; FALLBACK2:       # %bb.0:
; FALLBACK2-NEXT:    movq (%rdi), %rcx
; FALLBACK2-NEXT:    movq 8(%rdi), %r8
; FALLBACK2-NEXT:    movq 16(%rdi), %r9
; FALLBACK2-NEXT:    movq 24(%rdi), %rdi
; FALLBACK2-NEXT:    movzbl (%rsi), %esi
; FALLBACK2-NEXT:    movl %esi, %eax
; FALLBACK2-NEXT:    shlb $5, %al
; FALLBACK2-NEXT:    xorps %xmm0, %xmm0
; FALLBACK2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; FALLBACK2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; FALLBACK2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK2-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
; FALLBACK2-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
; FALLBACK2-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
; FALLBACK2-NEXT:    andb $6, %sil
; FALLBACK2-NEXT:    movzbl %sil, %ecx
; FALLBACK2-NEXT:    movq -64(%rsp,%rcx,4), %rsi
; FALLBACK2-NEXT:    movq -56(%rsp,%rcx,4), %rdi
; FALLBACK2-NEXT:    shrxq %rax, %rsi, %r8
; FALLBACK2-NEXT:    shrxq %rax, -72(%rsp,%rcx,4), %r9
; FALLBACK2-NEXT:    shrxq %rax, %rdi, %r10
; FALLBACK2-NEXT:    movq -48(%rsp,%rcx,4), %rcx
; FALLBACK2-NEXT:    shrxq %rax, %rcx, %r11
; FALLBACK2-NEXT:    # kill: def $al killed $al killed $rax def $rax
; FALLBACK2-NEXT:    notb %al
; FALLBACK2-NEXT:    addq %rdi, %rdi
; FALLBACK2-NEXT:    shlxq %rax, %rdi, %rdi
; FALLBACK2-NEXT:    orq %r8, %rdi
; FALLBACK2-NEXT:    addq %rsi, %rsi
; FALLBACK2-NEXT:    shlxq %rax, %rsi, %rsi
; FALLBACK2-NEXT:    orq %r9, %rsi
; FALLBACK2-NEXT:    addq %rcx, %rcx
; FALLBACK2-NEXT:    shlxq %rax, %rcx, %rax
; FALLBACK2-NEXT:    orq %r10, %rax
; FALLBACK2-NEXT:    movq %r11, 24(%rdx)
; FALLBACK2-NEXT:    movq %rax, 16(%rdx)
; FALLBACK2-NEXT:    movq %rsi, (%rdx)
; FALLBACK2-NEXT:    movq %rdi, 8(%rdx)
; FALLBACK2-NEXT:    retq
;
; FALLBACK3-LABEL: lshr_32bytes_dwordOff:
; FALLBACK3:       # %bb.0:
; FALLBACK3-NEXT:    movq (%rdi), %rax
; FALLBACK3-NEXT:    movq 8(%rdi), %r8
; FALLBACK3-NEXT:    movq 16(%rdi), %r9
; FALLBACK3-NEXT:    movq 24(%rdi), %rdi
; FALLBACK3-NEXT:    movzbl (%rsi), %esi
; FALLBACK3-NEXT:    movl %esi, %ecx
; FALLBACK3-NEXT:    shlb $5, %cl
; FALLBACK3-NEXT:    xorps %xmm0, %xmm0
; FALLBACK3-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; FALLBACK3-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; FALLBACK3-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK3-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
; FALLBACK3-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
; FALLBACK3-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
; FALLBACK3-NEXT:    andb $6, %sil
; FALLBACK3-NEXT:    movzbl %sil, %eax
; FALLBACK3-NEXT:    movq -56(%rsp,%rax,4), %rsi
; FALLBACK3-NEXT:    movq -72(%rsp,%rax,4), %rdi
; FALLBACK3-NEXT:    movq -64(%rsp,%rax,4), %r8
; FALLBACK3-NEXT:    movq %r8, %r9
; FALLBACK3-NEXT:    shrdq %cl, %rsi, %r9
; FALLBACK3-NEXT:    movq -48(%rsp,%rax,4), %rax
; FALLBACK3-NEXT:    shrdq %cl, %rax, %rsi
; FALLBACK3-NEXT:    shrdq %cl, %r8, %rdi
; FALLBACK3-NEXT:    shrxq %rcx, %rax, %rax
; FALLBACK3-NEXT:    movq %rsi, 16(%rdx)
; FALLBACK3-NEXT:    movq %rax, 24(%rdx)
; FALLBACK3-NEXT:    movq %rdi, (%rdx)
; FALLBACK3-NEXT:    movq %r9, 8(%rdx)
; FALLBACK3-NEXT:    retq
;
; FALLBACK4-LABEL: lshr_32bytes_dwordOff:
; FALLBACK4:       # %bb.0:
; FALLBACK4-NEXT:    pushq %rbx
; FALLBACK4-NEXT:    movups (%rdi), %xmm0
; FALLBACK4-NEXT:    movups 16(%rdi), %xmm1
; FALLBACK4-NEXT:    movzbl (%rsi), %ecx
; FALLBACK4-NEXT:    movl %ecx, %eax
; FALLBACK4-NEXT:    shlb $5, %al
; FALLBACK4-NEXT:    xorps %xmm2, %xmm2
; FALLBACK4-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
; FALLBACK4-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
; FALLBACK4-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
; FALLBACK4-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; FALLBACK4-NEXT:    andb $6, %cl
; FALLBACK4-NEXT:    movzbl %cl, %r9d
; FALLBACK4-NEXT:    movq -64(%rsp,%r9,4), %r10
; FALLBACK4-NEXT:    movq -56(%rsp,%r9,4), %r8
; FALLBACK4-NEXT:    movl %eax, %ecx
; FALLBACK4-NEXT:    shrq %cl, %r10
; FALLBACK4-NEXT:    movl %eax, %esi
; FALLBACK4-NEXT:    notb %sil
; FALLBACK4-NEXT:    leaq (%r8,%r8), %rdi
; FALLBACK4-NEXT:    movl %esi, %ecx
; FALLBACK4-NEXT:    shlq %cl, %rdi
; FALLBACK4-NEXT:    orq %r10, %rdi
; FALLBACK4-NEXT:    movq -48(%rsp,%r9,4), %r10
; FALLBACK4-NEXT:    movq %r10, %r11
; FALLBACK4-NEXT:    movl %eax, %ecx
; FALLBACK4-NEXT:    shrq %cl, %r11
; FALLBACK4-NEXT:    movq -40(%rsp,%r9,4), %r9
; FALLBACK4-NEXT:    leaq (%r9,%r9), %rbx
; FALLBACK4-NEXT:    movl %esi, %ecx
; FALLBACK4-NEXT:    shlq %cl, %rbx
; FALLBACK4-NEXT:    orq %r11, %rbx
; FALLBACK4-NEXT:    movl %eax, %ecx
; FALLBACK4-NEXT:    shrq %cl, %r8
; FALLBACK4-NEXT:    addq %r10, %r10
; FALLBACK4-NEXT:    movl %esi, %ecx
; FALLBACK4-NEXT:    shlq %cl, %r10
; FALLBACK4-NEXT:    orq %r8, %r10
; FALLBACK4-NEXT:    movl %eax, %ecx
; FALLBACK4-NEXT:    shrq %cl, %r9
; FALLBACK4-NEXT:    movq %r9, 24(%rdx)
; FALLBACK4-NEXT:    movq %r10, 8(%rdx)
; FALLBACK4-NEXT:    movq %rbx, 16(%rdx)
; FALLBACK4-NEXT:    movq %rdi, (%rdx)
; FALLBACK4-NEXT:    popq %rbx
; FALLBACK4-NEXT:    retq
;
; FALLBACK5-LABEL: lshr_32bytes_dwordOff:
; FALLBACK5:       # %bb.0:
; FALLBACK5-NEXT:    movups (%rdi), %xmm0
; FALLBACK5-NEXT:    movups 16(%rdi), %xmm1
; FALLBACK5-NEXT:    movzbl (%rsi), %eax
; FALLBACK5-NEXT:    movl %eax, %ecx
; FALLBACK5-NEXT:    shlb $5, %cl
; FALLBACK5-NEXT:    xorps %xmm2, %xmm2
; FALLBACK5-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
; FALLBACK5-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
; FALLBACK5-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
; FALLBACK5-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; FALLBACK5-NEXT:    andb $6, %al
; FALLBACK5-NEXT:    movzbl %al, %eax
; FALLBACK5-NEXT:    movq -48(%rsp,%rax,4), %rsi
; FALLBACK5-NEXT:    movq -56(%rsp,%rax,4), %rdi
; FALLBACK5-NEXT:    movq %rdi, %r8
; FALLBACK5-NEXT:    shrdq %cl, %rsi, %r8
; FALLBACK5-NEXT:    movq -72(%rsp,%rax,4), %r9
; FALLBACK5-NEXT:    movq -64(%rsp,%rax,4), %rax
; FALLBACK5-NEXT:    movq %rax, %r10
; FALLBACK5-NEXT:    shrdq %cl, %rdi, %r10
; FALLBACK5-NEXT:    shrdq %cl, %rax, %r9
; FALLBACK5-NEXT:    shrq %cl, %rsi
; FALLBACK5-NEXT:    movq %r10, 8(%rdx)
; FALLBACK5-NEXT:    movq %r8, 16(%rdx)
; FALLBACK5-NEXT:    movq %rsi, 24(%rdx)
; FALLBACK5-NEXT:    movq %r9, (%rdx)
; FALLBACK5-NEXT:    retq
;
; FALLBACK6-LABEL: lshr_32bytes_dwordOff:
; FALLBACK6:       # %bb.0:
; FALLBACK6-NEXT:    movups (%rdi), %xmm0
; FALLBACK6-NEXT:    movups 16(%rdi), %xmm1
; FALLBACK6-NEXT:    movzbl (%rsi), %ecx
; FALLBACK6-NEXT:    movl %ecx, %eax
; FALLBACK6-NEXT:    shlb $5, %al
; FALLBACK6-NEXT:    xorps %xmm2, %xmm2
; FALLBACK6-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
; FALLBACK6-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
; FALLBACK6-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
; FALLBACK6-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; FALLBACK6-NEXT:    andb $6, %cl
; FALLBACK6-NEXT:    movzbl %cl, %ecx
; FALLBACK6-NEXT:    shrxq %rax, -72(%rsp,%rcx,4), %rsi
; FALLBACK6-NEXT:    movq -64(%rsp,%rcx,4), %rdi
; FALLBACK6-NEXT:    movq -56(%rsp,%rcx,4), %r8
; FALLBACK6-NEXT:    shrxq %rax, %r8, %r9
; FALLBACK6-NEXT:    movq -48(%rsp,%rcx,4), %rcx
; FALLBACK6-NEXT:    shrxq %rax, %rdi, %r10
; FALLBACK6-NEXT:    shrxq %rax, %rcx, %r11
; FALLBACK6-NEXT:    # kill: def $al killed $al killed $rax def $rax
; FALLBACK6-NEXT:    notb %al
; FALLBACK6-NEXT:    addq %rdi, %rdi
; FALLBACK6-NEXT:    shlxq %rax, %rdi, %rdi
; FALLBACK6-NEXT:    orq %rsi, %rdi
; FALLBACK6-NEXT:    addq %rcx, %rcx
; FALLBACK6-NEXT:    shlxq %rax, %rcx, %rcx
; FALLBACK6-NEXT:    orq %r9, %rcx
; FALLBACK6-NEXT:    addq %r8, %r8
; FALLBACK6-NEXT:    shlxq %rax, %r8, %rax
; FALLBACK6-NEXT:    orq %r10, %rax
; FALLBACK6-NEXT:    movq %r11, 24(%rdx)
; FALLBACK6-NEXT:    movq %rax, 8(%rdx)
; FALLBACK6-NEXT:    movq %rcx, 16(%rdx)
; FALLBACK6-NEXT:    movq %rdi, (%rdx)
; FALLBACK6-NEXT:    retq
;
; FALLBACK7-LABEL: lshr_32bytes_dwordOff:
; FALLBACK7:       # %bb.0:
; FALLBACK7-NEXT:    movups (%rdi), %xmm0
; FALLBACK7-NEXT:    movups 16(%rdi), %xmm1
; FALLBACK7-NEXT:    movzbl (%rsi), %eax
; FALLBACK7-NEXT:    movl %eax, %ecx
; FALLBACK7-NEXT:    shlb $5, %cl
; FALLBACK7-NEXT:    xorps %xmm2, %xmm2
; FALLBACK7-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
; FALLBACK7-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
; FALLBACK7-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
; FALLBACK7-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; FALLBACK7-NEXT:    andb $6, %al
; FALLBACK7-NEXT:    movzbl %al, %eax
; FALLBACK7-NEXT:    movq -48(%rsp,%rax,4), %rsi
; FALLBACK7-NEXT:    movq -56(%rsp,%rax,4), %rdi
; FALLBACK7-NEXT:    movq %rdi, %r8
; FALLBACK7-NEXT:    shrdq %cl, %rsi, %r8
; FALLBACK7-NEXT:    movq -72(%rsp,%rax,4), %r9
; FALLBACK7-NEXT:    movq -64(%rsp,%rax,4), %rax
; FALLBACK7-NEXT:    movq %rax, %r10
; FALLBACK7-NEXT:    shrdq %cl, %rdi, %r10
; FALLBACK7-NEXT:    shrdq %cl, %rax, %r9
; FALLBACK7-NEXT:    shrxq %rcx, %rsi, %rax
; FALLBACK7-NEXT:    movq %r10, 8(%rdx)
; FALLBACK7-NEXT:    movq %r8, 16(%rdx)
; FALLBACK7-NEXT:    movq %rax, 24(%rdx)
; FALLBACK7-NEXT:    movq %r9, (%rdx)
; FALLBACK7-NEXT:    retq
;
; FALLBACK8-LABEL: lshr_32bytes_dwordOff:
; FALLBACK8:       # %bb.0:
; FALLBACK8-NEXT:    pushq %rbx
; FALLBACK8-NEXT:    vmovups (%rdi), %ymm0
; FALLBACK8-NEXT:    movzbl (%rsi), %ecx
; FALLBACK8-NEXT:    movl %ecx, %eax
; FALLBACK8-NEXT:    shlb $5, %al
; FALLBACK8-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; FALLBACK8-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
; FALLBACK8-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
; FALLBACK8-NEXT:    andb $6, %cl
; FALLBACK8-NEXT:    movzbl %cl, %r9d
; FALLBACK8-NEXT:    movq -64(%rsp,%r9,4), %r10
; FALLBACK8-NEXT:    movq -56(%rsp,%r9,4), %r8
; FALLBACK8-NEXT:    movl %eax, %ecx
; FALLBACK8-NEXT:    shrq %cl, %r10
; FALLBACK8-NEXT:    movl %eax, %esi
; FALLBACK8-NEXT:    notb %sil
; FALLBACK8-NEXT:    leaq (%r8,%r8), %rdi
; FALLBACK8-NEXT:    movl %esi, %ecx
; FALLBACK8-NEXT:    shlq %cl, %rdi
; FALLBACK8-NEXT:    orq %r10, %rdi
; FALLBACK8-NEXT:    movq -48(%rsp,%r9,4), %r10
; FALLBACK8-NEXT:    movq %r10, %r11
; FALLBACK8-NEXT:    movl %eax, %ecx
; FALLBACK8-NEXT:    shrq %cl, %r11
; FALLBACK8-NEXT:    movq -40(%rsp,%r9,4), %r9
; FALLBACK8-NEXT:    leaq (%r9,%r9), %rbx
; FALLBACK8-NEXT:    movl %esi, %ecx
; FALLBACK8-NEXT:    shlq %cl, %rbx
; FALLBACK8-NEXT:    orq %r11, %rbx
; FALLBACK8-NEXT:    movl %eax, %ecx
; FALLBACK8-NEXT:    shrq %cl, %r8
; FALLBACK8-NEXT:    addq %r10, %r10
; FALLBACK8-NEXT:    movl %esi, %ecx
; FALLBACK8-NEXT:    shlq %cl, %r10
; FALLBACK8-NEXT:    orq %r8, %r10
; FALLBACK8-NEXT:    movl %eax, %ecx
; FALLBACK8-NEXT:    shrq %cl, %r9
; FALLBACK8-NEXT:    movq %r9, 24(%rdx)
; FALLBACK8-NEXT:    movq %r10, 8(%rdx)
; FALLBACK8-NEXT:    movq %rbx, 16(%rdx)
; FALLBACK8-NEXT:    movq %rdi, (%rdx)
; FALLBACK8-NEXT:    popq %rbx
; FALLBACK8-NEXT:    vzeroupper
; FALLBACK8-NEXT:    retq
;
; FALLBACK9-LABEL: lshr_32bytes_dwordOff:
; FALLBACK9:       # %bb.0:
; FALLBACK9-NEXT:    vmovups (%rdi), %ymm0
; FALLBACK9-NEXT:    movzbl (%rsi), %eax
; FALLBACK9-NEXT:    movl %eax, %ecx
; FALLBACK9-NEXT:    shlb $5, %cl
; FALLBACK9-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; FALLBACK9-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
; FALLBACK9-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
; FALLBACK9-NEXT:    andb $6, %al
; FALLBACK9-NEXT:    movzbl %al, %eax
; FALLBACK9-NEXT:    movq -48(%rsp,%rax,4), %rsi
; FALLBACK9-NEXT:    movq -56(%rsp,%rax,4), %rdi
; FALLBACK9-NEXT:    movq %rdi, %r8
; FALLBACK9-NEXT:    shrdq %cl, %rsi, %r8
; FALLBACK9-NEXT:    movq -72(%rsp,%rax,4), %r9
; FALLBACK9-NEXT:    movq -64(%rsp,%rax,4), %rax
; FALLBACK9-NEXT:    movq %rax, %r10
; FALLBACK9-NEXT:    shrdq %cl, %rdi, %r10
; FALLBACK9-NEXT:    shrdq %cl, %rax, %r9
; FALLBACK9-NEXT:    shrq %cl, %rsi
; FALLBACK9-NEXT:    movq %r10, 8(%rdx)
; FALLBACK9-NEXT:    movq %r8, 16(%rdx)
; FALLBACK9-NEXT:    movq %rsi, 24(%rdx)
; FALLBACK9-NEXT:    movq %r9, (%rdx)
; FALLBACK9-NEXT:    vzeroupper
; FALLBACK9-NEXT:    retq
;
; FALLBACK10-LABEL: lshr_32bytes_dwordOff:
; FALLBACK10:       # %bb.0:
; FALLBACK10-NEXT:    vmovups (%rdi), %ymm0
; FALLBACK10-NEXT:    movzbl (%rsi), %ecx
; FALLBACK10-NEXT:    movl %ecx, %eax
; FALLBACK10-NEXT:    shlb $5, %al
; FALLBACK10-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; FALLBACK10-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
; FALLBACK10-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
; FALLBACK10-NEXT:    andb $6, %cl
; FALLBACK10-NEXT:    movzbl %cl, %ecx
; FALLBACK10-NEXT:    shrxq %rax, -72(%rsp,%rcx,4), %rsi
; FALLBACK10-NEXT:    movq -64(%rsp,%rcx,4), %rdi
; FALLBACK10-NEXT:    movq -56(%rsp,%rcx,4), %r8
; FALLBACK10-NEXT:    shrxq %rax, %r8, %r9
; FALLBACK10-NEXT:    movq -48(%rsp,%rcx,4), %rcx
; FALLBACK10-NEXT:    shrxq %rax, %rdi, %r10
; FALLBACK10-NEXT:    shrxq %rax, %rcx, %r11
; FALLBACK10-NEXT:    # kill: def $al killed $al killed $rax def $rax
; FALLBACK10-NEXT:    notb %al
; FALLBACK10-NEXT:    addq %rdi, %rdi
; FALLBACK10-NEXT:    shlxq %rax, %rdi, %rdi
; FALLBACK10-NEXT:    orq %rsi, %rdi
; FALLBACK10-NEXT:    addq %rcx, %rcx
; FALLBACK10-NEXT:    shlxq %rax, %rcx, %rcx
; FALLBACK10-NEXT:    orq %r9, %rcx
; FALLBACK10-NEXT:    addq %r8, %r8
; FALLBACK10-NEXT:    shlxq %rax, %r8, %rax
; FALLBACK10-NEXT:    orq %r10, %rax
; FALLBACK10-NEXT:    movq %r11, 24(%rdx)
; FALLBACK10-NEXT:    movq %rax, 8(%rdx)
; FALLBACK10-NEXT:    movq %rcx, 16(%rdx)
; FALLBACK10-NEXT:    movq %rdi, (%rdx)
; FALLBACK10-NEXT:    vzeroupper
; FALLBACK10-NEXT:    retq
;
; FALLBACK11-LABEL: lshr_32bytes_dwordOff:
; FALLBACK11:       # %bb.0:
; FALLBACK11-NEXT:    vmovups (%rdi), %ymm0
; FALLBACK11-NEXT:    movzbl (%rsi), %eax
; FALLBACK11-NEXT:    movl %eax, %ecx
; FALLBACK11-NEXT:    shlb $5, %cl
; FALLBACK11-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; FALLBACK11-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
; FALLBACK11-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
; FALLBACK11-NEXT:    andb $6, %al
; FALLBACK11-NEXT:    movzbl %al, %eax
; FALLBACK11-NEXT:    movq -48(%rsp,%rax,4), %rsi
; FALLBACK11-NEXT:    movq -56(%rsp,%rax,4), %rdi
; FALLBACK11-NEXT:    movq %rdi, %r8
; FALLBACK11-NEXT:    shrdq %cl, %rsi, %r8
; FALLBACK11-NEXT:    movq -72(%rsp,%rax,4), %r9
; FALLBACK11-NEXT:    movq -64(%rsp,%rax,4), %rax
; FALLBACK11-NEXT:    movq %rax, %r10
; FALLBACK11-NEXT:    shrdq %cl, %rdi, %r10
; FALLBACK11-NEXT:    shrdq %cl, %rax, %r9
; FALLBACK11-NEXT:    shrxq %rcx, %rsi, %rax
; FALLBACK11-NEXT:    movq %r10, 8(%rdx)
; FALLBACK11-NEXT:    movq %r8, 16(%rdx)
; FALLBACK11-NEXT:    movq %rax, 24(%rdx)
; FALLBACK11-NEXT:    movq %r9, (%rdx)
; FALLBACK11-NEXT:    vzeroupper
; FALLBACK11-NEXT:    retq
;
; FALLBACK12-LABEL: lshr_32bytes_dwordOff:
; FALLBACK12:       # %bb.0:
; FALLBACK12-NEXT:    pushq %rbx
; FALLBACK12-NEXT:    vmovups (%rdi), %ymm0
; FALLBACK12-NEXT:    movzbl (%rsi), %ecx
; FALLBACK12-NEXT:    movl %ecx, %eax
; FALLBACK12-NEXT:    shlb $5, %al
; FALLBACK12-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; FALLBACK12-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
; FALLBACK12-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
; FALLBACK12-NEXT:    andb $6, %cl
; FALLBACK12-NEXT:    movzbl %cl, %r9d
; FALLBACK12-NEXT:    movq -64(%rsp,%r9,4), %r10
; FALLBACK12-NEXT:    movq -56(%rsp,%r9,4), %r8
; FALLBACK12-NEXT:    movl %eax, %ecx
; FALLBACK12-NEXT:    shrq %cl, %r10
; FALLBACK12-NEXT:    movl %eax, %esi
; FALLBACK12-NEXT:    notb %sil
; FALLBACK12-NEXT:    leaq (%r8,%r8), %rdi
; FALLBACK12-NEXT:    movl %esi, %ecx
; FALLBACK12-NEXT:    shlq %cl, %rdi
; FALLBACK12-NEXT:    orq %r10, %rdi
; FALLBACK12-NEXT:    movq -48(%rsp,%r9,4), %r10
; FALLBACK12-NEXT:    movq %r10, %r11
; FALLBACK12-NEXT:    movl %eax, %ecx
; FALLBACK12-NEXT:    shrq %cl, %r11
; FALLBACK12-NEXT:    movq -40(%rsp,%r9,4), %r9
; FALLBACK12-NEXT:    leaq (%r9,%r9), %rbx
; FALLBACK12-NEXT:    movl %esi, %ecx
; FALLBACK12-NEXT:    shlq %cl, %rbx
; FALLBACK12-NEXT:    orq %r11, %rbx
; FALLBACK12-NEXT:    movl %eax, %ecx
; FALLBACK12-NEXT:    shrq %cl, %r8
; FALLBACK12-NEXT:    addq %r10, %r10
; FALLBACK12-NEXT:    movl %esi, %ecx
; FALLBACK12-NEXT:    shlq %cl, %r10
; FALLBACK12-NEXT:    orq %r8, %r10
; FALLBACK12-NEXT:    movl %eax, %ecx
; FALLBACK12-NEXT:    shrq %cl, %r9
; FALLBACK12-NEXT:    movq %r9, 24(%rdx)
; FALLBACK12-NEXT:    movq %r10, 8(%rdx)
; FALLBACK12-NEXT:    movq %rbx, 16(%rdx)
; FALLBACK12-NEXT:    movq %rdi, (%rdx)
; FALLBACK12-NEXT:    popq %rbx
; FALLBACK12-NEXT:    vzeroupper
; FALLBACK12-NEXT:    retq
;
; FALLBACK13-LABEL: lshr_32bytes_dwordOff:
; FALLBACK13:       # %bb.0:
; FALLBACK13-NEXT:    vmovups (%rdi), %ymm0
; FALLBACK13-NEXT:    movzbl (%rsi), %eax
; FALLBACK13-NEXT:    movl %eax, %ecx
; FALLBACK13-NEXT:    shlb $5, %cl
; FALLBACK13-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; FALLBACK13-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
; FALLBACK13-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
; FALLBACK13-NEXT:    andb $6, %al
; FALLBACK13-NEXT:    movzbl %al, %eax
; FALLBACK13-NEXT:    movq -48(%rsp,%rax,4), %rsi
; FALLBACK13-NEXT:    movq -56(%rsp,%rax,4), %rdi
; FALLBACK13-NEXT:    movq %rdi, %r8
; FALLBACK13-NEXT:    shrdq %cl, %rsi, %r8
; FALLBACK13-NEXT:    movq -72(%rsp,%rax,4), %r9
; FALLBACK13-NEXT:    movq -64(%rsp,%rax,4), %rax
; FALLBACK13-NEXT:    movq %rax, %r10
; FALLBACK13-NEXT:    shrdq %cl, %rdi, %r10
; FALLBACK13-NEXT:    shrdq %cl, %rax, %r9
; FALLBACK13-NEXT:    shrq %cl, %rsi
; FALLBACK13-NEXT:    movq %r10, 8(%rdx)
; FALLBACK13-NEXT:    movq %r8, 16(%rdx)
; FALLBACK13-NEXT:    movq %rsi, 24(%rdx)
; FALLBACK13-NEXT:    movq %r9, (%rdx)
; FALLBACK13-NEXT:    vzeroupper
; FALLBACK13-NEXT:    retq
;
; FALLBACK14-LABEL: lshr_32bytes_dwordOff:
; FALLBACK14:       # %bb.0:
; FALLBACK14-NEXT:    vmovups (%rdi), %ymm0
; FALLBACK14-NEXT:    movzbl (%rsi), %ecx
; FALLBACK14-NEXT:    movl %ecx, %eax
; FALLBACK14-NEXT:    shlb $5, %al
; FALLBACK14-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; FALLBACK14-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
; FALLBACK14-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
; FALLBACK14-NEXT:    andb $6, %cl
; FALLBACK14-NEXT:    movzbl %cl, %ecx
; FALLBACK14-NEXT:    shrxq %rax, -72(%rsp,%rcx,4), %rsi
; FALLBACK14-NEXT:    movq -64(%rsp,%rcx,4), %rdi
; FALLBACK14-NEXT:    movq -56(%rsp,%rcx,4), %r8
; FALLBACK14-NEXT:    shrxq %rax, %r8, %r9
; FALLBACK14-NEXT:    movq -48(%rsp,%rcx,4), %rcx
; FALLBACK14-NEXT:    shrxq %rax, %rdi, %r10
; FALLBACK14-NEXT:    shrxq %rax, %rcx, %r11
; FALLBACK14-NEXT:    # kill: def $al killed $al killed $rax def $rax
; FALLBACK14-NEXT:    notb %al
; FALLBACK14-NEXT:    addq %rdi, %rdi
; FALLBACK14-NEXT:    shlxq %rax, %rdi, %rdi
; FALLBACK14-NEXT:    orq %rsi, %rdi
; FALLBACK14-NEXT:    addq %rcx, %rcx
; FALLBACK14-NEXT:    shlxq %rax, %rcx, %rcx
; FALLBACK14-NEXT:    orq %r9, %rcx
; FALLBACK14-NEXT:    addq %r8, %r8
; FALLBACK14-NEXT:    shlxq %rax, %r8, %rax
; FALLBACK14-NEXT:    orq %r10, %rax
; FALLBACK14-NEXT:    movq %r11, 24(%rdx)
; FALLBACK14-NEXT:    movq %rax, 8(%rdx)
; FALLBACK14-NEXT:    movq %rcx, 16(%rdx)
; FALLBACK14-NEXT:    movq %rdi, (%rdx)
; FALLBACK14-NEXT:    vzeroupper
; FALLBACK14-NEXT:    retq
;
; FALLBACK15-LABEL: lshr_32bytes_dwordOff:
; FALLBACK15:       # %bb.0:
; FALLBACK15-NEXT:    vmovups (%rdi), %ymm0
; FALLBACK15-NEXT:    movzbl (%rsi), %eax
; FALLBACK15-NEXT:    movl %eax, %ecx
; FALLBACK15-NEXT:    shlb $5, %cl
; FALLBACK15-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; FALLBACK15-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
; FALLBACK15-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
; FALLBACK15-NEXT:    andb $6, %al
; FALLBACK15-NEXT:    movzbl %al, %eax
; FALLBACK15-NEXT:    movq -48(%rsp,%rax,4), %rsi
; FALLBACK15-NEXT:    movq -56(%rsp,%rax,4), %rdi
; FALLBACK15-NEXT:    movq %rdi, %r8
; FALLBACK15-NEXT:    shrdq %cl, %rsi, %r8
; FALLBACK15-NEXT:    movq -72(%rsp,%rax,4), %r9
; FALLBACK15-NEXT:    movq -64(%rsp,%rax,4), %rax
; FALLBACK15-NEXT:    movq %rax, %r10
; FALLBACK15-NEXT:    shrdq %cl, %rdi, %r10
; FALLBACK15-NEXT:    shrdq %cl, %rax, %r9
; FALLBACK15-NEXT:    shrxq %rcx, %rsi, %rax
; FALLBACK15-NEXT:    movq %r10, 8(%rdx)
; FALLBACK15-NEXT:    movq %r8, 16(%rdx)
; FALLBACK15-NEXT:    movq %rax, 24(%rdx)
; FALLBACK15-NEXT:    movq %r9, (%rdx)
; FALLBACK15-NEXT:    vzeroupper
; FALLBACK15-NEXT:    retq
;
; X86-SSE2-LABEL: lshr_32bytes_dwordOff:
; X86-SSE2:       # %bb.0:
; X86-SSE2-NEXT:    pushl %ebp
; X86-SSE2-NEXT:    pushl %ebx
; X86-SSE2-NEXT:    pushl %edi
; X86-SSE2-NEXT:    pushl %esi
; X86-SSE2-NEXT:    subl $92, %esp
; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE2-NEXT:    movl (%eax), %ecx
; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-SSE2-NEXT:    movl 4(%eax), %ecx
; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-SSE2-NEXT:    movl 8(%eax), %esi
; X86-SSE2-NEXT:    movl 12(%eax), %edi
; X86-SSE2-NEXT:    movl 16(%eax), %ebx
; X86-SSE2-NEXT:    movl 20(%eax), %ebp
; X86-SSE2-NEXT:    movl 24(%eax), %edx
; X86-SSE2-NEXT:    movl 28(%eax), %ecx
; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE2-NEXT:    movzbl (%eax), %eax
; X86-SSE2-NEXT:    xorps %xmm0, %xmm0
; X86-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT:    movl %edi, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT:    movl %esi, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT:    andl $7, %eax
; X86-SSE2-NEXT:    movl 16(%esp,%eax,4), %ecx
; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-SSE2-NEXT:    movl 20(%esp,%eax,4), %ecx
; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-SSE2-NEXT:    movl 28(%esp,%eax,4), %esi
; X86-SSE2-NEXT:    movl 24(%esp,%eax,4), %edi
; X86-SSE2-NEXT:    movl 36(%esp,%eax,4), %ebx
; X86-SSE2-NEXT:    movl 32(%esp,%eax,4), %ebp
; X86-SSE2-NEXT:    movl 44(%esp,%eax,4), %edx
; X86-SSE2-NEXT:    movl 40(%esp,%eax,4), %ecx
; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE2-NEXT:    movl %ecx, 24(%eax)
; X86-SSE2-NEXT:    movl %edx, 28(%eax)
; X86-SSE2-NEXT:    movl %ebp, 16(%eax)
; X86-SSE2-NEXT:    movl %ebx, 20(%eax)
; X86-SSE2-NEXT:    movl %edi, 8(%eax)
; X86-SSE2-NEXT:    movl %esi, 12(%eax)
; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-SSE2-NEXT:    movl %ecx, (%eax)
; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-SSE2-NEXT:    movl %ecx, 4(%eax)
; X86-SSE2-NEXT:    addl $92, %esp
; X86-SSE2-NEXT:    popl %esi
; X86-SSE2-NEXT:    popl %edi
; X86-SSE2-NEXT:    popl %ebx
; X86-SSE2-NEXT:    popl %ebp
; X86-SSE2-NEXT:    retl
;
; X86-SSE42-LABEL: lshr_32bytes_dwordOff:
; X86-SSE42:       # %bb.0:
; X86-SSE42-NEXT:    subl $76, %esp
; X86-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-SSE42-NEXT:    movups (%edx), %xmm0
; X86-SSE42-NEXT:    movups 16(%edx), %xmm1
; X86-SSE42-NEXT:    movzbl (%ecx), %ecx
; X86-SSE42-NEXT:    xorps %xmm2, %xmm2
; X86-SSE42-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
; X86-SSE42-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
; X86-SSE42-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
; X86-SSE42-NEXT:    movaps %xmm0, (%esp)
; X86-SSE42-NEXT:    andl $7, %ecx
; X86-SSE42-NEXT:    movups (%esp,%ecx,4), %xmm0
; X86-SSE42-NEXT:    movups 16(%esp,%ecx,4), %xmm1
; X86-SSE42-NEXT:    movups %xmm1, 16(%eax)
; X86-SSE42-NEXT:    movups %xmm0, (%eax)
; X86-SSE42-NEXT:    addl $76, %esp
; X86-SSE42-NEXT:    retl
;
; X86-AVX-LABEL: lshr_32bytes_dwordOff:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    subl $76, %esp
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-AVX-NEXT:    vmovups (%edx), %ymm0
; X86-AVX-NEXT:    movzbl (%ecx), %ecx
; X86-AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X86-AVX-NEXT:    vmovups %ymm1, {{[0-9]+}}(%esp)
; X86-AVX-NEXT:    vmovups %ymm0, (%esp)
; X86-AVX-NEXT:    andl $7, %ecx
; X86-AVX-NEXT:    vmovups (%esp,%ecx,4), %xmm0
; X86-AVX-NEXT:    vmovups 16(%esp,%ecx,4), %xmm1
; X86-AVX-NEXT:    vmovups %xmm1, 16(%eax)
; X86-AVX-NEXT:    vmovups %xmm0, (%eax)
; X86-AVX-NEXT:    addl $76, %esp
; X86-AVX-NEXT:    vzeroupper
; X86-AVX-NEXT:    retl
  %src = load i256, ptr %src.ptr, align 1
  %dwordOff = load i256, ptr %dwordOff.ptr, align 1
  %bitOff = shl i256 %dwordOff, 5
  %res = lshr i256 %src, %bitOff
  store i256 %res, ptr %dst, align 1
  ret void
}

; Logical right shift of a 256-bit (32-byte) value by a whole number of qwords.
; The shift count is loaded from %qwordOff.ptr in qword units and scaled to a
; bit count (shl by 6, i.e. multiplied by 64) before the lshr; the result is
; stored to %dst. Both loads and the store are i256 accesses with align 1.
;
; The assertions below are autogenerated (see the NOTE at the top of this
; file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
;
; In each check block below the shift is performed through a stack buffer:
; the source value and 32 zero bytes are stored to the stack, the offset is
; masked with `andl $3`, and 32 bytes are reloaded from an address indexed by
; the masked offset with scale 8.
define void @lshr_32bytes_qwordOff(ptr %src.ptr, ptr %qwordOff.ptr, ptr %dst) nounwind {
; X64-SSE2-LABEL: lshr_32bytes_qwordOff:
; X64-SSE2:       # %bb.0:
; X64-SSE2-NEXT:    movq (%rdi), %rax
; X64-SSE2-NEXT:    movq 8(%rdi), %rcx
; X64-SSE2-NEXT:    movq 16(%rdi), %r8
; X64-SSE2-NEXT:    movq 24(%rdi), %rdi
; X64-SSE2-NEXT:    movzbl (%rsi), %esi
; X64-SSE2-NEXT:    xorps %xmm0, %xmm0
; X64-SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; X64-SSE2-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
; X64-SSE2-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
; X64-SSE2-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
; X64-SSE2-NEXT:    andl $3, %esi
; X64-SSE2-NEXT:    movq -72(%rsp,%rsi,8), %rax
; X64-SSE2-NEXT:    movq -64(%rsp,%rsi,8), %rcx
; X64-SSE2-NEXT:    movq -48(%rsp,%rsi,8), %rdi
; X64-SSE2-NEXT:    movq -56(%rsp,%rsi,8), %rsi
; X64-SSE2-NEXT:    movq %rsi, 16(%rdx)
; X64-SSE2-NEXT:    movq %rdi, 24(%rdx)
; X64-SSE2-NEXT:    movq %rax, (%rdx)
; X64-SSE2-NEXT:    movq %rcx, 8(%rdx)
; X64-SSE2-NEXT:    retq
;
; X64-SSE42-LABEL: lshr_32bytes_qwordOff:
; X64-SSE42:       # %bb.0:
; X64-SSE42-NEXT:    movups (%rdi), %xmm0
; X64-SSE42-NEXT:    movups 16(%rdi), %xmm1
; X64-SSE42-NEXT:    movzbl (%rsi), %eax
; X64-SSE42-NEXT:    xorps %xmm2, %xmm2
; X64-SSE42-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
; X64-SSE42-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
; X64-SSE42-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
; X64-SSE42-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-SSE42-NEXT:    andl $3, %eax
; X64-SSE42-NEXT:    movups -72(%rsp,%rax,8), %xmm0
; X64-SSE42-NEXT:    movups -56(%rsp,%rax,8), %xmm1
; X64-SSE42-NEXT:    movups %xmm1, 16(%rdx)
; X64-SSE42-NEXT:    movups %xmm0, (%rdx)
; X64-SSE42-NEXT:    retq
;
; X64-AVX-LABEL: lshr_32bytes_qwordOff:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vmovups (%rdi), %ymm0
; X64-AVX-NEXT:    movzbl (%rsi), %eax
; X64-AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X64-AVX-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
; X64-AVX-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
; X64-AVX-NEXT:    andl $3, %eax
; X64-AVX-NEXT:    vmovups -72(%rsp,%rax,8), %xmm0
; X64-AVX-NEXT:    vmovups -56(%rsp,%rax,8), %xmm1
; X64-AVX-NEXT:    vmovups %xmm1, 16(%rdx)
; X64-AVX-NEXT:    vmovups %xmm0, (%rdx)
; X64-AVX-NEXT:    vzeroupper
; X64-AVX-NEXT:    retq
;
; X86-SSE2-LABEL: lshr_32bytes_qwordOff:
; X86-SSE2:       # %bb.0:
; X86-SSE2-NEXT:    pushl %ebp
; X86-SSE2-NEXT:    pushl %ebx
; X86-SSE2-NEXT:    pushl %edi
; X86-SSE2-NEXT:    pushl %esi
; X86-SSE2-NEXT:    subl $92, %esp
; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE2-NEXT:    movl (%eax), %ecx
; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-SSE2-NEXT:    movl 4(%eax), %ecx
; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-SSE2-NEXT:    movl 8(%eax), %esi
; X86-SSE2-NEXT:    movl 12(%eax), %edi
; X86-SSE2-NEXT:    movl 16(%eax), %ebx
; X86-SSE2-NEXT:    movl 20(%eax), %ebp
; X86-SSE2-NEXT:    movl 24(%eax), %edx
; X86-SSE2-NEXT:    movl 28(%eax), %ecx
; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE2-NEXT:    movzbl (%eax), %eax
; X86-SSE2-NEXT:    xorps %xmm0, %xmm0
; X86-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT:    movl %edi, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT:    movl %esi, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT:    andl $3, %eax
; X86-SSE2-NEXT:    movl 16(%esp,%eax,8), %ecx
; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-SSE2-NEXT:    movl 20(%esp,%eax,8), %ecx
; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-SSE2-NEXT:    movl 28(%esp,%eax,8), %esi
; X86-SSE2-NEXT:    movl 24(%esp,%eax,8), %edi
; X86-SSE2-NEXT:    movl 36(%esp,%eax,8), %ebx
; X86-SSE2-NEXT:    movl 32(%esp,%eax,8), %ebp
; X86-SSE2-NEXT:    movl 44(%esp,%eax,8), %edx
; X86-SSE2-NEXT:    movl 40(%esp,%eax,8), %ecx
; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE2-NEXT:    movl %ecx, 24(%eax)
; X86-SSE2-NEXT:    movl %edx, 28(%eax)
; X86-SSE2-NEXT:    movl %ebp, 16(%eax)
; X86-SSE2-NEXT:    movl %ebx, 20(%eax)
; X86-SSE2-NEXT:    movl %edi, 8(%eax)
; X86-SSE2-NEXT:    movl %esi, 12(%eax)
; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-SSE2-NEXT:    movl %ecx, (%eax)
; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-SSE2-NEXT:    movl %ecx, 4(%eax)
; X86-SSE2-NEXT:    addl $92, %esp
; X86-SSE2-NEXT:    popl %esi
; X86-SSE2-NEXT:    popl %edi
; X86-SSE2-NEXT:    popl %ebx
; X86-SSE2-NEXT:    popl %ebp
; X86-SSE2-NEXT:    retl
;
; X86-SSE42-LABEL: lshr_32bytes_qwordOff:
; X86-SSE42:       # %bb.0:
; X86-SSE42-NEXT:    subl $76, %esp
; X86-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-SSE42-NEXT:    movups (%edx), %xmm0
; X86-SSE42-NEXT:    movups 16(%edx), %xmm1
; X86-SSE42-NEXT:    movzbl (%ecx), %ecx
; X86-SSE42-NEXT:    xorps %xmm2, %xmm2
; X86-SSE42-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
; X86-SSE42-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
; X86-SSE42-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
; X86-SSE42-NEXT:    movaps %xmm0, (%esp)
; X86-SSE42-NEXT:    andl $3, %ecx
; X86-SSE42-NEXT:    movups (%esp,%ecx,8), %xmm0
; X86-SSE42-NEXT:    movups 16(%esp,%ecx,8), %xmm1
; X86-SSE42-NEXT:    movups %xmm1, 16(%eax)
; X86-SSE42-NEXT:    movups %xmm0, (%eax)
; X86-SSE42-NEXT:    addl $76, %esp
; X86-SSE42-NEXT:    retl
;
; X86-AVX-LABEL: lshr_32bytes_qwordOff:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    subl $76, %esp
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-AVX-NEXT:    vmovups (%edx), %ymm0
; X86-AVX-NEXT:    movzbl (%ecx), %ecx
; X86-AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X86-AVX-NEXT:    vmovups %ymm1, {{[0-9]+}}(%esp)
; X86-AVX-NEXT:    vmovups %ymm0, (%esp)
; X86-AVX-NEXT:    andl $3, %ecx
; X86-AVX-NEXT:    vmovups (%esp,%ecx,8), %xmm0
; X86-AVX-NEXT:    vmovups 16(%esp,%ecx,8), %xmm1
; X86-AVX-NEXT:    vmovups %xmm1, 16(%eax)
; X86-AVX-NEXT:    vmovups %xmm0, (%eax)
; X86-AVX-NEXT:    addl $76, %esp
; X86-AVX-NEXT:    vzeroupper
; X86-AVX-NEXT:    retl
  ; The 256-bit value to shift, read with byte alignment only.
  %src = load i256, ptr %src.ptr, align 1
  ; The offset is loaded as a full i256 as well; the check blocks above read
  ; only its low byte (movzbl) before masking.
  %qwordOff = load i256, ptr %qwordOff.ptr, align 1
  ; Convert the qword count to a bit count (64 bits per qword).
  %bitOff = shl i256 %qwordOff, 6
  %res = lshr i256 %src, %bitOff
  store i256 %res, ptr %dst, align 1
  ret void
}

define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; FALLBACK0-LABEL: shl_32bytes:
; FALLBACK0:       # %bb.0:
; FALLBACK0-NEXT:    pushq %rbx
; FALLBACK0-NEXT:    movq (%rdi), %rcx
; FALLBACK0-NEXT:    movq 8(%rdi), %r8
; FALLBACK0-NEXT:    movq 16(%rdi), %r9
; FALLBACK0-NEXT:    movq 24(%rdi), %rdi
; FALLBACK0-NEXT:    movzbl (%rsi), %esi
; FALLBACK0-NEXT:    leal (,%rsi,8), %eax
; FALLBACK0-NEXT:    xorps %xmm0, %xmm0
; FALLBACK0-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; FALLBACK0-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; FALLBACK0-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK0-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
; FALLBACK0-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
; FALLBACK0-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
; FALLBACK0-NEXT:    andb $24, %sil
; FALLBACK0-NEXT:    negb %sil
; FALLBACK0-NEXT:    movsbq %sil, %r10
; FALLBACK0-NEXT:    movq -32(%rsp,%r10), %r8
; FALLBACK0-NEXT:    movq -24(%rsp,%r10), %rdi
; FALLBACK0-NEXT:    movq %rdi, %r11
; FALLBACK0-NEXT:    movl %eax, %ecx
; FALLBACK0-NEXT:    shlq %cl, %r11
; FALLBACK0-NEXT:    movl %eax, %esi
; FALLBACK0-NEXT:    notb %sil
; FALLBACK0-NEXT:    movq %r8, %r9
; FALLBACK0-NEXT:    shrq %r9
; FALLBACK0-NEXT:    movl %esi, %ecx
; FALLBACK0-NEXT:    shrq %cl, %r9
; FALLBACK0-NEXT:    orq %r11, %r9
; FALLBACK0-NEXT:    movq -8(%rsp,%r10), %r11
; FALLBACK0-NEXT:    movl %eax, %ecx
; FALLBACK0-NEXT:    shlq %cl, %r11
; FALLBACK0-NEXT:    movq -16(%rsp,%r10), %r10
; FALLBACK0-NEXT:    movq %r10, %rbx
; FALLBACK0-NEXT:    shrq %rbx
; FALLBACK0-NEXT:    movl %esi, %ecx
; FALLBACK0-NEXT:    shrq %cl, %rbx
; FALLBACK0-NEXT:    orq %r11, %rbx
; FALLBACK0-NEXT:    movl %eax, %ecx
; FALLBACK0-NEXT:    shlq %cl, %r10
; FALLBACK0-NEXT:    shrq %rdi
; FALLBACK0-NEXT:    movl %esi, %ecx
; FALLBACK0-NEXT:    shrq %cl, %rdi
; FALLBACK0-NEXT:    orq %r10, %rdi
; FALLBACK0-NEXT:    movl %eax, %ecx
; FALLBACK0-NEXT:    shlq %cl, %r8
; FALLBACK0-NEXT:    movq %r8, (%rdx)
; FALLBACK0-NEXT:    movq %rdi, 16(%rdx)
; FALLBACK0-NEXT:    movq %rbx, 24(%rdx)
; FALLBACK0-NEXT:    movq %r9, 8(%rdx)
; FALLBACK0-NEXT:    popq %rbx
; FALLBACK0-NEXT:    retq
;
; FALLBACK1-LABEL: shl_32bytes:
; FALLBACK1:       # %bb.0:
; FALLBACK1-NEXT:    movq (%rdi), %rax
; FALLBACK1-NEXT:    movq 8(%rdi), %r8
; FALLBACK1-NEXT:    movq 16(%rdi), %r9
; FALLBACK1-NEXT:    movq 24(%rdi), %rdi
; FALLBACK1-NEXT:    movzbl (%rsi), %esi
; FALLBACK1-NEXT:    leal (,%rsi,8), %ecx
; FALLBACK1-NEXT:    xorps %xmm0, %xmm0
; FALLBACK1-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; FALLBACK1-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; FALLBACK1-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK1-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
; FALLBACK1-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
; FALLBACK1-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
; FALLBACK1-NEXT:    andb $24, %sil
; FALLBACK1-NEXT:    negb %sil
; FALLBACK1-NEXT:    movsbq %sil, %rax
; FALLBACK1-NEXT:    movq -24(%rsp,%rax), %rsi
; FALLBACK1-NEXT:    movq -16(%rsp,%rax), %rdi
; FALLBACK1-NEXT:    shldq %cl, %rsi, %rdi
; FALLBACK1-NEXT:    movq -40(%rsp,%rax), %r8
; FALLBACK1-NEXT:    movq -32(%rsp,%rax), %rax
; FALLBACK1-NEXT:    shldq %cl, %rax, %rsi
; FALLBACK1-NEXT:    shldq %cl, %r8, %rax
; FALLBACK1-NEXT:    # kill: def $cl killed $cl killed $ecx
; FALLBACK1-NEXT:    shlq %cl, %r8
; FALLBACK1-NEXT:    movq %rsi, 16(%rdx)
; FALLBACK1-NEXT:    movq %rdi, 24(%rdx)
; FALLBACK1-NEXT:    movq %r8, (%rdx)
; FALLBACK1-NEXT:    movq %rax, 8(%rdx)
; FALLBACK1-NEXT:    retq
;
; FALLBACK2-LABEL: shl_32bytes:
; FALLBACK2:       # %bb.0:
; FALLBACK2-NEXT:    movq (%rdi), %rcx
; FALLBACK2-NEXT:    movq 8(%rdi), %r8
; FALLBACK2-NEXT:    movq 16(%rdi), %r9
; FALLBACK2-NEXT:    movq 24(%rdi), %rdi
; FALLBACK2-NEXT:    movzbl (%rsi), %esi
; FALLBACK2-NEXT:    leal (,%rsi,8), %eax
; FALLBACK2-NEXT:    xorps %xmm0, %xmm0
; FALLBACK2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; FALLBACK2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; FALLBACK2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK2-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
; FALLBACK2-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
; FALLBACK2-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
; FALLBACK2-NEXT:    andb $24, %sil
; FALLBACK2-NEXT:    negb %sil
; FALLBACK2-NEXT:    movsbq %sil, %rsi
; FALLBACK2-NEXT:    movq -40(%rsp,%rsi), %rdi
; FALLBACK2-NEXT:    movq -32(%rsp,%rsi), %rcx
; FALLBACK2-NEXT:    shlxq %rax, %rcx, %r8
; FALLBACK2-NEXT:    shlxq %rax, -16(%rsp,%rsi), %r9
; FALLBACK2-NEXT:    movq -24(%rsp,%rsi), %rsi
; FALLBACK2-NEXT:    shlxq %rax, %rsi, %r10
; FALLBACK2-NEXT:    shlxq %rax, %rdi, %r11
; FALLBACK2-NEXT:    # kill: def $al killed $al killed $rax def $rax
; FALLBACK2-NEXT:    notb %al
; FALLBACK2-NEXT:    shrq %rdi
; FALLBACK2-NEXT:    shrxq %rax, %rdi, %rdi
; FALLBACK2-NEXT:    orq %r8, %rdi
; FALLBACK2-NEXT:    shrq %rsi
; FALLBACK2-NEXT:    shrxq %rax, %rsi, %rsi
; FALLBACK2-NEXT:    orq %r9, %rsi
; FALLBACK2-NEXT:    shrq %rcx
; FALLBACK2-NEXT:    shrxq %rax, %rcx, %rax
; FALLBACK2-NEXT:    orq %r10, %rax
; FALLBACK2-NEXT:    movq %r11, (%rdx)
; FALLBACK2-NEXT:    movq %rax, 16(%rdx)
; FALLBACK2-NEXT:    movq %rsi, 24(%rdx)
; FALLBACK2-NEXT:    movq %rdi, 8(%rdx)
; FALLBACK2-NEXT:    retq
;
; FALLBACK3-LABEL: shl_32bytes:
; FALLBACK3:       # %bb.0:
; FALLBACK3-NEXT:    movq (%rdi), %rax
; FALLBACK3-NEXT:    movq 8(%rdi), %r8
; FALLBACK3-NEXT:    movq 16(%rdi), %r9
; FALLBACK3-NEXT:    movq 24(%rdi), %rdi
; FALLBACK3-NEXT:    movzbl (%rsi), %esi
; FALLBACK3-NEXT:    leal (,%rsi,8), %ecx
; FALLBACK3-NEXT:    xorps %xmm0, %xmm0
; FALLBACK3-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; FALLBACK3-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; FALLBACK3-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK3-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
; FALLBACK3-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
; FALLBACK3-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
; FALLBACK3-NEXT:    andb $24, %sil
; FALLBACK3-NEXT:    negb %sil
; FALLBACK3-NEXT:    movsbq %sil, %rax
; FALLBACK3-NEXT:    movq -24(%rsp,%rax), %rsi
; FALLBACK3-NEXT:    movq -16(%rsp,%rax), %rdi
; FALLBACK3-NEXT:    shldq %cl, %rsi, %rdi
; FALLBACK3-NEXT:    movq -40(%rsp,%rax), %r8
; FALLBACK3-NEXT:    movq -32(%rsp,%rax), %rax
; FALLBACK3-NEXT:    shldq %cl, %rax, %rsi
; FALLBACK3-NEXT:    shldq %cl, %r8, %rax
; FALLBACK3-NEXT:    shlxq %rcx, %r8, %rcx
; FALLBACK3-NEXT:    movq %rsi, 16(%rdx)
; FALLBACK3-NEXT:    movq %rdi, 24(%rdx)
; FALLBACK3-NEXT:    movq %rcx, (%rdx)
; FALLBACK3-NEXT:    movq %rax, 8(%rdx)
; FALLBACK3-NEXT:    retq
;
; FALLBACK4-LABEL: shl_32bytes:
; FALLBACK4:       # %bb.0:
; FALLBACK4-NEXT:    movups (%rdi), %xmm0
; FALLBACK4-NEXT:    movups 16(%rdi), %xmm1
; FALLBACK4-NEXT:    movzbl (%rsi), %ecx
; FALLBACK4-NEXT:    leal (,%rcx,8), %eax
; FALLBACK4-NEXT:    xorps %xmm2, %xmm2
; FALLBACK4-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
; FALLBACK4-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
; FALLBACK4-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
; FALLBACK4-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; FALLBACK4-NEXT:    andb $24, %cl
; FALLBACK4-NEXT:    negb %cl
; FALLBACK4-NEXT:    movsbq %cl, %r8
; FALLBACK4-NEXT:    movq -16(%rsp,%r8), %r9
; FALLBACK4-NEXT:    movl %eax, %ecx
; FALLBACK4-NEXT:    shlq %cl, %r9
; FALLBACK4-NEXT:    movl %eax, %esi
; FALLBACK4-NEXT:    notb %sil
; FALLBACK4-NEXT:    movq -24(%rsp,%r8), %r10
; FALLBACK4-NEXT:    movq %r10, %rdi
; FALLBACK4-NEXT:    shrq %rdi
; FALLBACK4-NEXT:    movl %esi, %ecx
; FALLBACK4-NEXT:    shrq %cl, %rdi
; FALLBACK4-NEXT:    orq %r9, %rdi
; FALLBACK4-NEXT:    movl %eax, %ecx
; FALLBACK4-NEXT:    shlq %cl, %r10
; FALLBACK4-NEXT:    movq -40(%rsp,%r8), %r9
; FALLBACK4-NEXT:    movq -32(%rsp,%r8), %r8
; FALLBACK4-NEXT:    movq %r8, %r11
; FALLBACK4-NEXT:    shrq %r11
; FALLBACK4-NEXT:    movl %esi, %ecx
; FALLBACK4-NEXT:    shrq %cl, %r11
; FALLBACK4-NEXT:    orq %r10, %r11
; FALLBACK4-NEXT:    movl %eax, %ecx
; FALLBACK4-NEXT:    shlq %cl, %r8
; FALLBACK4-NEXT:    movq %r9, %r10
; FALLBACK4-NEXT:    shrq %r10
; FALLBACK4-NEXT:    movl %esi, %ecx
; FALLBACK4-NEXT:    shrq %cl, %r10
; FALLBACK4-NEXT:    orq %r8, %r10
; FALLBACK4-NEXT:    movl %eax, %ecx
; FALLBACK4-NEXT:    shlq %cl, %r9
; FALLBACK4-NEXT:    movq %r9, (%rdx)
; FALLBACK4-NEXT:    movq %r10, 8(%rdx)
; FALLBACK4-NEXT:    movq %r11, 16(%rdx)
; FALLBACK4-NEXT:    movq %rdi, 24(%rdx)
; FALLBACK4-NEXT:    retq
;
; FALLBACK5-LABEL: shl_32bytes:
; FALLBACK5:       # %bb.0:
; FALLBACK5-NEXT:    movups (%rdi), %xmm0
; FALLBACK5-NEXT:    movups 16(%rdi), %xmm1
; FALLBACK5-NEXT:    movzbl (%rsi), %eax
; FALLBACK5-NEXT:    leal (,%rax,8), %ecx
; FALLBACK5-NEXT:    xorps %xmm2, %xmm2
; FALLBACK5-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
; FALLBACK5-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
; FALLBACK5-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
; FALLBACK5-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; FALLBACK5-NEXT:    andb $24, %al
; FALLBACK5-NEXT:    negb %al
; FALLBACK5-NEXT:    movsbq %al, %rax
; FALLBACK5-NEXT:    movq -24(%rsp,%rax), %rsi
; FALLBACK5-NEXT:    movq -16(%rsp,%rax), %rdi
; FALLBACK5-NEXT:    shldq %cl, %rsi, %rdi
; FALLBACK5-NEXT:    movq -40(%rsp,%rax), %r8
; FALLBACK5-NEXT:    movq -32(%rsp,%rax), %rax
; FALLBACK5-NEXT:    shldq %cl, %rax, %rsi
; FALLBACK5-NEXT:    movq %r8, %r9
; FALLBACK5-NEXT:    shlq %cl, %r9
; FALLBACK5-NEXT:    # kill: def $cl killed $cl killed $ecx
; FALLBACK5-NEXT:    shldq %cl, %r8, %rax
; FALLBACK5-NEXT:    movq %rax, 8(%rdx)
; FALLBACK5-NEXT:    movq %rsi, 16(%rdx)
; FALLBACK5-NEXT:    movq %rdi, 24(%rdx)
; FALLBACK5-NEXT:    movq %r9, (%rdx)
; FALLBACK5-NEXT:    retq
;
; FALLBACK6-LABEL: shl_32bytes:
; FALLBACK6:       # %bb.0:
; FALLBACK6-NEXT:    movups (%rdi), %xmm0
; FALLBACK6-NEXT:    movups 16(%rdi), %xmm1
; FALLBACK6-NEXT:    movzbl (%rsi), %ecx
; FALLBACK6-NEXT:    leal (,%rcx,8), %eax
; FALLBACK6-NEXT:    xorps %xmm2, %xmm2
; FALLBACK6-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
; FALLBACK6-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
; FALLBACK6-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
; FALLBACK6-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; FALLBACK6-NEXT:    andb $24, %cl
; FALLBACK6-NEXT:    negb %cl
; FALLBACK6-NEXT:    movsbq %cl, %rcx
; FALLBACK6-NEXT:    shlxq %rax, -16(%rsp,%rcx), %rsi
; FALLBACK6-NEXT:    movq -24(%rsp,%rcx), %rdi
; FALLBACK6-NEXT:    shlxq %rax, %rdi, %r8
; FALLBACK6-NEXT:    movq -40(%rsp,%rcx), %r9
; FALLBACK6-NEXT:    movq -32(%rsp,%rcx), %rcx
; FALLBACK6-NEXT:    shlxq %rax, %rcx, %r10
; FALLBACK6-NEXT:    shlxq %rax, %r9, %r11
; FALLBACK6-NEXT:    # kill: def $al killed $al killed $rax def $rax
; FALLBACK6-NEXT:    notb %al
; FALLBACK6-NEXT:    shrq %rdi
; FALLBACK6-NEXT:    shrxq %rax, %rdi, %rdi
; FALLBACK6-NEXT:    orq %rsi, %rdi
; FALLBACK6-NEXT:    shrq %rcx
; FALLBACK6-NEXT:    shrxq %rax, %rcx, %rcx
; FALLBACK6-NEXT:    orq %r8, %rcx
; FALLBACK6-NEXT:    shrq %r9
; FALLBACK6-NEXT:    shrxq %rax, %r9, %rax
; FALLBACK6-NEXT:    orq %r10, %rax
; FALLBACK6-NEXT:    movq %r11, (%rdx)
; FALLBACK6-NEXT:    movq %rax, 8(%rdx)
; FALLBACK6-NEXT:    movq %rcx, 16(%rdx)
; FALLBACK6-NEXT:    movq %rdi, 24(%rdx)
; FALLBACK6-NEXT:    retq
;
; FALLBACK7-LABEL: shl_32bytes:
; FALLBACK7:       # %bb.0:
; FALLBACK7-NEXT:    movups (%rdi), %xmm0
; FALLBACK7-NEXT:    movups 16(%rdi), %xmm1
; FALLBACK7-NEXT:    movzbl (%rsi), %eax
; FALLBACK7-NEXT:    leal (,%rax,8), %ecx
; FALLBACK7-NEXT:    xorps %xmm2, %xmm2
; FALLBACK7-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
; FALLBACK7-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
; FALLBACK7-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
; FALLBACK7-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; FALLBACK7-NEXT:    andb $24, %al
; FALLBACK7-NEXT:    negb %al
; FALLBACK7-NEXT:    movsbq %al, %rax
; FALLBACK7-NEXT:    movq -24(%rsp,%rax), %rsi
; FALLBACK7-NEXT:    movq -16(%rsp,%rax), %rdi
; FALLBACK7-NEXT:    shldq %cl, %rsi, %rdi
; FALLBACK7-NEXT:    movq -40(%rsp,%rax), %r8
; FALLBACK7-NEXT:    movq -32(%rsp,%rax), %rax
; FALLBACK7-NEXT:    shldq %cl, %rax, %rsi
; FALLBACK7-NEXT:    shlxq %rcx, %r8, %r9
; FALLBACK7-NEXT:    # kill: def $cl killed $cl killed $rcx
; FALLBACK7-NEXT:    shldq %cl, %r8, %rax
; FALLBACK7-NEXT:    movq %rax, 8(%rdx)
; FALLBACK7-NEXT:    movq %rsi, 16(%rdx)
; FALLBACK7-NEXT:    movq %rdi, 24(%rdx)
; FALLBACK7-NEXT:    movq %r9, (%rdx)
; FALLBACK7-NEXT:    retq
;
; FALLBACK8-LABEL: shl_32bytes:
; FALLBACK8:       # %bb.0:
; FALLBACK8-NEXT:    vmovups (%rdi), %ymm0
; FALLBACK8-NEXT:    movzbl (%rsi), %ecx
; FALLBACK8-NEXT:    leal (,%rcx,8), %eax
; FALLBACK8-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; FALLBACK8-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
; FALLBACK8-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
; FALLBACK8-NEXT:    andb $24, %cl
; FALLBACK8-NEXT:    negb %cl
; FALLBACK8-NEXT:    movsbq %cl, %r8
; FALLBACK8-NEXT:    movq -16(%rsp,%r8), %r9
; FALLBACK8-NEXT:    movl %eax, %ecx
; FALLBACK8-NEXT:    shlq %cl, %r9
; FALLBACK8-NEXT:    movl %eax, %esi
; FALLBACK8-NEXT:    notb %sil
; FALLBACK8-NEXT:    movq -24(%rsp,%r8), %r10
; FALLBACK8-NEXT:    movq %r10, %rdi
; FALLBACK8-NEXT:    shrq %rdi
; FALLBACK8-NEXT:    movl %esi, %ecx
; FALLBACK8-NEXT:    shrq %cl, %rdi
; FALLBACK8-NEXT:    orq %r9, %rdi
; FALLBACK8-NEXT:    movl %eax, %ecx
; FALLBACK8-NEXT:    shlq %cl, %r10
; FALLBACK8-NEXT:    movq -40(%rsp,%r8), %r9
; FALLBACK8-NEXT:    movq -32(%rsp,%r8), %r8
; FALLBACK8-NEXT:    movq %r8, %r11
; FALLBACK8-NEXT:    shrq %r11
; FALLBACK8-NEXT:    movl %esi, %ecx
; FALLBACK8-NEXT:    shrq %cl, %r11
; FALLBACK8-NEXT:    orq %r10, %r11
; FALLBACK8-NEXT:    movl %eax, %ecx
; FALLBACK8-NEXT:    shlq %cl, %r8
; FALLBACK8-NEXT:    movq %r9, %r10
; FALLBACK8-NEXT:    shrq %r10
; FALLBACK8-NEXT:    movl %esi, %ecx
; FALLBACK8-NEXT:    shrq %cl, %r10
; FALLBACK8-NEXT:    orq %r8, %r10
; FALLBACK8-NEXT:    movl %eax, %ecx
; FALLBACK8-NEXT:    shlq %cl, %r9
; FALLBACK8-NEXT:    movq %r9, (%rdx)
; FALLBACK8-NEXT:    movq %r10, 8(%rdx)
; FALLBACK8-NEXT:    movq %r11, 16(%rdx)
; FALLBACK8-NEXT:    movq %rdi, 24(%rdx)
; FALLBACK8-NEXT:    vzeroupper
; FALLBACK8-NEXT:    retq
;
; FALLBACK9-LABEL: shl_32bytes:
; FALLBACK9:       # %bb.0:
; FALLBACK9-NEXT:    vmovups (%rdi), %ymm0
; FALLBACK9-NEXT:    movzbl (%rsi), %eax
; FALLBACK9-NEXT:    leal (,%rax,8), %ecx
; FALLBACK9-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; FALLBACK9-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
; FALLBACK9-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
; FALLBACK9-NEXT:    andb $24, %al
; FALLBACK9-NEXT:    negb %al
; FALLBACK9-NEXT:    movsbq %al, %rax
; FALLBACK9-NEXT:    movq -24(%rsp,%rax), %rsi
; FALLBACK9-NEXT:    movq -16(%rsp,%rax), %rdi
; FALLBACK9-NEXT:    shldq %cl, %rsi, %rdi
; FALLBACK9-NEXT:    movq -40(%rsp,%rax), %r8
; FALLBACK9-NEXT:    movq -32(%rsp,%rax), %rax
; FALLBACK9-NEXT:    shldq %cl, %rax, %rsi
; FALLBACK9-NEXT:    movq %r8, %r9
; FALLBACK9-NEXT:    shlq %cl, %r9
; FALLBACK9-NEXT:    # kill: def $cl killed $cl killed $ecx
; FALLBACK9-NEXT:    shldq %cl, %r8, %rax
; FALLBACK9-NEXT:    movq %rax, 8(%rdx)
; FALLBACK9-NEXT:    movq %rsi, 16(%rdx)
; FALLBACK9-NEXT:    movq %rdi, 24(%rdx)
; FALLBACK9-NEXT:    movq %r9, (%rdx)
; FALLBACK9-NEXT:    vzeroupper
; FALLBACK9-NEXT:    retq
;
; FALLBACK10-LABEL: shl_32bytes:
; FALLBACK10:       # %bb.0:
; FALLBACK10-NEXT:    vmovups (%rdi), %ymm0
; FALLBACK10-NEXT:    movzbl (%rsi), %ecx
; FALLBACK10-NEXT:    leal (,%rcx,8), %eax
; FALLBACK10-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; FALLBACK10-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
; FALLBACK10-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
; FALLBACK10-NEXT:    andb $24, %cl
; FALLBACK10-NEXT:    negb %cl
; FALLBACK10-NEXT:    movsbq %cl, %rcx
; FALLBACK10-NEXT:    shlxq %rax, -16(%rsp,%rcx), %rsi
; FALLBACK10-NEXT:    movq -24(%rsp,%rcx), %rdi
; FALLBACK10-NEXT:    shlxq %rax, %rdi, %r8
; FALLBACK10-NEXT:    movq -40(%rsp,%rcx), %r9
; FALLBACK10-NEXT:    movq -32(%rsp,%rcx), %rcx
; FALLBACK10-NEXT:    shlxq %rax, %rcx, %r10
; FALLBACK10-NEXT:    shlxq %rax, %r9, %r11
; FALLBACK10-NEXT:    # kill: def $al killed $al killed $rax def $rax
; FALLBACK10-NEXT:    notb %al
; FALLBACK10-NEXT:    shrq %rdi
; FALLBACK10-NEXT:    shrxq %rax, %rdi, %rdi
; FALLBACK10-NEXT:    orq %rsi, %rdi
; FALLBACK10-NEXT:    shrq %rcx
; FALLBACK10-NEXT:    shrxq %rax, %rcx, %rcx
; FALLBACK10-NEXT:    orq %r8, %rcx
; FALLBACK10-NEXT:    shrq %r9
; FALLBACK10-NEXT:    shrxq %rax, %r9, %rax
; FALLBACK10-NEXT:    orq %r10, %rax
; FALLBACK10-NEXT:    movq %r11, (%rdx)
; FALLBACK10-NEXT:    movq %rax, 8(%rdx)
; FALLBACK10-NEXT:    movq %rcx, 16(%rdx)
; FALLBACK10-NEXT:    movq %rdi, 24(%rdx)
; FALLBACK10-NEXT:    vzeroupper
; FALLBACK10-NEXT:    retq
;
; FALLBACK11-LABEL: shl_32bytes:
; FALLBACK11:       # %bb.0:
; FALLBACK11-NEXT:    vmovups (%rdi), %ymm0
; FALLBACK11-NEXT:    movzbl (%rsi), %eax
; FALLBACK11-NEXT:    leal (,%rax,8), %ecx
; FALLBACK11-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; FALLBACK11-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
; FALLBACK11-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
; FALLBACK11-NEXT:    andb $24, %al
; FALLBACK11-NEXT:    negb %al
; FALLBACK11-NEXT:    movsbq %al, %rax
; FALLBACK11-NEXT:    movq -24(%rsp,%rax), %rsi
; FALLBACK11-NEXT:    movq -16(%rsp,%rax), %rdi
; FALLBACK11-NEXT:    shldq %cl, %rsi, %rdi
; FALLBACK11-NEXT:    movq -40(%rsp,%rax), %r8
; FALLBACK11-NEXT:    movq -32(%rsp,%rax), %rax
; FALLBACK11-NEXT:    shldq %cl, %rax, %rsi
; FALLBACK11-NEXT:    shlxq %rcx, %r8, %r9
; FALLBACK11-NEXT:    # kill: def $cl killed $cl killed $rcx
; FALLBACK11-NEXT:    shldq %cl, %r8, %rax
; FALLBACK11-NEXT:    movq %rax, 8(%rdx)
; FALLBACK11-NEXT:    movq %rsi, 16(%rdx)
; FALLBACK11-NEXT:    movq %rdi, 24(%rdx)
; FALLBACK11-NEXT:    movq %r9, (%rdx)
; FALLBACK11-NEXT:    vzeroupper
; FALLBACK11-NEXT:    retq
;
; FALLBACK12-LABEL: shl_32bytes:
; FALLBACK12:       # %bb.0:
; FALLBACK12-NEXT:    vmovups (%rdi), %ymm0
; FALLBACK12-NEXT:    movzbl (%rsi), %ecx
; FALLBACK12-NEXT:    leal (,%rcx,8), %eax
; FALLBACK12-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; FALLBACK12-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
; FALLBACK12-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
; FALLBACK12-NEXT:    andb $24, %cl
; FALLBACK12-NEXT:    negb %cl
; FALLBACK12-NEXT:    movsbq %cl, %r8
; FALLBACK12-NEXT:    movq -16(%rsp,%r8), %r9
; FALLBACK12-NEXT:    movl %eax, %ecx
; FALLBACK12-NEXT:    shlq %cl, %r9
; FALLBACK12-NEXT:    movl %eax, %esi
; FALLBACK12-NEXT:    notb %sil
; FALLBACK12-NEXT:    movq -24(%rsp,%r8), %r10
; FALLBACK12-NEXT:    movq %r10, %rdi
; FALLBACK12-NEXT:    shrq %rdi
; FALLBACK12-NEXT:    movl %esi, %ecx
; FALLBACK12-NEXT:    shrq %cl, %rdi
; FALLBACK12-NEXT:    orq %r9, %rdi
; FALLBACK12-NEXT:    movl %eax, %ecx
; FALLBACK12-NEXT:    shlq %cl, %r10
; FALLBACK12-NEXT:    movq -40(%rsp,%r8), %r9
; FALLBACK12-NEXT:    movq -32(%rsp,%r8), %r8
; FALLBACK12-NEXT:    movq %r8, %r11
; FALLBACK12-NEXT:    shrq %r11
; FALLBACK12-NEXT:    movl %esi, %ecx
; FALLBACK12-NEXT:    shrq %cl, %r11
; FALLBACK12-NEXT:    orq %r10, %r11
; FALLBACK12-NEXT:    movl %eax, %ecx
; FALLBACK12-NEXT:    shlq %cl, %r8
; FALLBACK12-NEXT:    movq %r9, %r10
; FALLBACK12-NEXT:    shrq %r10
; FALLBACK12-NEXT:    movl %esi, %ecx
; FALLBACK12-NEXT:    shrq %cl, %r10
; FALLBACK12-NEXT:    orq %r8, %r10
; FALLBACK12-NEXT:    movl %eax, %ecx
; FALLBACK12-NEXT:    shlq %cl, %r9
; FALLBACK12-NEXT:    movq %r9, (%rdx)
; FALLBACK12-NEXT:    movq %r10, 8(%rdx)
; FALLBACK12-NEXT:    movq %r11, 16(%rdx)
; FALLBACK12-NEXT:    movq %rdi, 24(%rdx)
; FALLBACK12-NEXT:    vzeroupper
; FALLBACK12-NEXT:    retq
;
; FALLBACK13-LABEL: shl_32bytes:
; FALLBACK13:       # %bb.0:
; FALLBACK13-NEXT:    vmovups (%rdi), %ymm0
; FALLBACK13-NEXT:    movzbl (%rsi), %eax
; FALLBACK13-NEXT:    leal (,%rax,8), %ecx
; FALLBACK13-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; FALLBACK13-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
; FALLBACK13-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
; FALLBACK13-NEXT:    andb $24, %al
; FALLBACK13-NEXT:    negb %al
; FALLBACK13-NEXT:    movsbq %al, %rax
; FALLBACK13-NEXT:    movq -24(%rsp,%rax), %rsi
; FALLBACK13-NEXT:    movq -16(%rsp,%rax), %rdi
; FALLBACK13-NEXT:    shldq %cl, %rsi, %rdi
; FALLBACK13-NEXT:    movq -40(%rsp,%rax), %r8
; FALLBACK13-NEXT:    movq -32(%rsp,%rax), %rax
; FALLBACK13-NEXT:    shldq %cl, %rax, %rsi
; FALLBACK13-NEXT:    movq %r8, %r9
; FALLBACK13-NEXT:    shlq %cl, %r9
; FALLBACK13-NEXT:    # kill: def $cl killed $cl killed $ecx
; FALLBACK13-NEXT:    shldq %cl, %r8, %rax
; FALLBACK13-NEXT:    movq %rax, 8(%rdx)
; FALLBACK13-NEXT:    movq %rsi, 16(%rdx)
; FALLBACK13-NEXT:    movq %rdi, 24(%rdx)
; FALLBACK13-NEXT:    movq %r9, (%rdx)
; FALLBACK13-NEXT:    vzeroupper
; FALLBACK13-NEXT:    retq
;
; FALLBACK14-LABEL: shl_32bytes:
; FALLBACK14:       # %bb.0:
; FALLBACK14-NEXT:    vmovups (%rdi), %ymm0
; FALLBACK14-NEXT:    movzbl (%rsi), %ecx
; FALLBACK14-NEXT:    leal (,%rcx,8), %eax
; FALLBACK14-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; FALLBACK14-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
; FALLBACK14-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
; FALLBACK14-NEXT:    andb $24, %cl
; FALLBACK14-NEXT:    negb %cl
; FALLBACK14-NEXT:    movsbq %cl, %rcx
; FALLBACK14-NEXT:    shlxq %rax, -16(%rsp,%rcx), %rsi
; FALLBACK14-NEXT:    movq -24(%rsp,%rcx), %rdi
; FALLBACK14-NEXT:    shlxq %rax, %rdi, %r8
; FALLBACK14-NEXT:    movq -40(%rsp,%rcx), %r9
; FALLBACK14-NEXT:    movq -32(%rsp,%rcx), %rcx
; FALLBACK14-NEXT:    shlxq %rax, %rcx, %r10
; FALLBACK14-NEXT:    shlxq %rax, %r9, %r11
; FALLBACK14-NEXT:    # kill: def $al killed $al killed $rax def $rax
; FALLBACK14-NEXT:    notb %al
; FALLBACK14-NEXT:    shrq %rdi
; FALLBACK14-NEXT:    shrxq %rax, %rdi, %rdi
; FALLBACK14-NEXT:    orq %rsi, %rdi
; FALLBACK14-NEXT:    shrq %rcx
; FALLBACK14-NEXT:    shrxq %rax, %rcx, %rcx
; FALLBACK14-NEXT:    orq %r8, %rcx
; FALLBACK14-NEXT:    shrq %r9
; FALLBACK14-NEXT:    shrxq %rax, %r9, %rax
; FALLBACK14-NEXT:    orq %r10, %rax
; FALLBACK14-NEXT:    movq %r11, (%rdx)
; FALLBACK14-NEXT:    movq %rax, 8(%rdx)
; FALLBACK14-NEXT:    movq %rcx, 16(%rdx)
; FALLBACK14-NEXT:    movq %rdi, 24(%rdx)
; FALLBACK14-NEXT:    vzeroupper
; FALLBACK14-NEXT:    retq
;
; FALLBACK15-LABEL: shl_32bytes:
; FALLBACK15:       # %bb.0:
; FALLBACK15-NEXT:    vmovups (%rdi), %ymm0
; FALLBACK15-NEXT:    movzbl (%rsi), %eax
; FALLBACK15-NEXT:    leal (,%rax,8), %ecx
; FALLBACK15-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; FALLBACK15-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
; FALLBACK15-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
; FALLBACK15-NEXT:    andb $24, %al
; FALLBACK15-NEXT:    negb %al
; FALLBACK15-NEXT:    movsbq %al, %rax
; FALLBACK15-NEXT:    movq -24(%rsp,%rax), %rsi
; FALLBACK15-NEXT:    movq -16(%rsp,%rax), %rdi
; FALLBACK15-NEXT:    shldq %cl, %rsi, %rdi
; FALLBACK15-NEXT:    movq -40(%rsp,%rax), %r8
; FALLBACK15-NEXT:    movq -32(%rsp,%rax), %rax
; FALLBACK15-NEXT:    shldq %cl, %rax, %rsi
; FALLBACK15-NEXT:    shlxq %rcx, %r8, %r9
; FALLBACK15-NEXT:    # kill: def $cl killed $cl killed $rcx
; FALLBACK15-NEXT:    shldq %cl, %r8, %rax
; FALLBACK15-NEXT:    movq %rax, 8(%rdx)
; FALLBACK15-NEXT:    movq %rsi, 16(%rdx)
; FALLBACK15-NEXT:    movq %rdi, 24(%rdx)
; FALLBACK15-NEXT:    movq %r9, (%rdx)
; FALLBACK15-NEXT:    vzeroupper
; FALLBACK15-NEXT:    retq
;
; FALLBACK16-LABEL: shl_32bytes:
; FALLBACK16:       # %bb.0:
; FALLBACK16-NEXT:    pushl %ebp
; FALLBACK16-NEXT:    pushl %ebx
; FALLBACK16-NEXT:    pushl %edi
; FALLBACK16-NEXT:    pushl %esi
; FALLBACK16-NEXT:    subl $108, %esp
; FALLBACK16-NEXT:    movl {{[0-9]+}}(%esp), %eax
; FALLBACK16-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; FALLBACK16-NEXT:    movl (%ecx), %edx
; FALLBACK16-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK16-NEXT:    movl 4(%ecx), %edx
; FALLBACK16-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK16-NEXT:    movl 8(%ecx), %esi
; FALLBACK16-NEXT:    movl 12(%ecx), %edi
; FALLBACK16-NEXT:    movl 16(%ecx), %ebx
; FALLBACK16-NEXT:    movb (%eax), %ah
; FALLBACK16-NEXT:    movl 20(%ecx), %ebp
; FALLBACK16-NEXT:    movl 24(%ecx), %edx
; FALLBACK16-NEXT:    movl 28(%ecx), %ecx
; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK16-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK16-NEXT:    movb %ah, %ch
; FALLBACK16-NEXT:    shlb $3, %ch
; FALLBACK16-NEXT:    xorps %xmm0, %xmm0
; FALLBACK16-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
; FALLBACK16-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
; FALLBACK16-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
; FALLBACK16-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
; FALLBACK16-NEXT:    movl %edi, {{[0-9]+}}(%esp)
; FALLBACK16-NEXT:    movl %esi, {{[0-9]+}}(%esp)
; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; FALLBACK16-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; FALLBACK16-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK16-NEXT:    andb $28, %ah
; FALLBACK16-NEXT:    negb %ah
; FALLBACK16-NEXT:    movsbl %ah, %ebx
; FALLBACK16-NEXT:    movl 64(%esp,%ebx), %edi
; FALLBACK16-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK16-NEXT:    movl 68(%esp,%ebx), %eax
; FALLBACK16-NEXT:    movl %eax, %esi
; FALLBACK16-NEXT:    movb %ch, %cl
; FALLBACK16-NEXT:    shll %cl, %esi
; FALLBACK16-NEXT:    movb %ch, %dl
; FALLBACK16-NEXT:    notb %dl
; FALLBACK16-NEXT:    shrl %edi
; FALLBACK16-NEXT:    movb %dl, %cl
; FALLBACK16-NEXT:    shrl %cl, %edi
; FALLBACK16-NEXT:    orl %esi, %edi
; FALLBACK16-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK16-NEXT:    movl 76(%esp,%ebx), %edi
; FALLBACK16-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK16-NEXT:    movb %ch, %cl
; FALLBACK16-NEXT:    shll %cl, %edi
; FALLBACK16-NEXT:    movl 72(%esp,%ebx), %esi
; FALLBACK16-NEXT:    movl %esi, %ebp
; FALLBACK16-NEXT:    shrl %ebp
; FALLBACK16-NEXT:    movb %dl, %cl
; FALLBACK16-NEXT:    shrl %cl, %ebp
; FALLBACK16-NEXT:    orl %edi, %ebp
; FALLBACK16-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK16-NEXT:    movb %ch, %cl
; FALLBACK16-NEXT:    shll %cl, %esi
; FALLBACK16-NEXT:    shrl %eax
; FALLBACK16-NEXT:    movb %dl, %cl
; FALLBACK16-NEXT:    shrl %cl, %eax
; FALLBACK16-NEXT:    orl %esi, %eax
; FALLBACK16-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK16-NEXT:    movl 84(%esp,%ebx), %esi
; FALLBACK16-NEXT:    movl %esi, %eax
; FALLBACK16-NEXT:    movb %ch, %cl
; FALLBACK16-NEXT:    shll %cl, %eax
; FALLBACK16-NEXT:    movl 80(%esp,%ebx), %edi
; FALLBACK16-NEXT:    movl %edi, %ebp
; FALLBACK16-NEXT:    shrl %ebp
; FALLBACK16-NEXT:    movb %dl, %cl
; FALLBACK16-NEXT:    shrl %cl, %ebp
; FALLBACK16-NEXT:    orl %eax, %ebp
; FALLBACK16-NEXT:    movb %ch, %cl
; FALLBACK16-NEXT:    shll %cl, %edi
; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK16-NEXT:    shrl %eax
; FALLBACK16-NEXT:    movb %dl, %cl
; FALLBACK16-NEXT:    shrl %cl, %eax
; FALLBACK16-NEXT:    orl %edi, %eax
; FALLBACK16-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK16-NEXT:    movl 92(%esp,%ebx), %eax
; FALLBACK16-NEXT:    movb %ch, %cl
; FALLBACK16-NEXT:    shll %cl, %eax
; FALLBACK16-NEXT:    movl 88(%esp,%ebx), %edi
; FALLBACK16-NEXT:    movl %edi, %ebx
; FALLBACK16-NEXT:    shrl %ebx
; FALLBACK16-NEXT:    movb %dl, %cl
; FALLBACK16-NEXT:    shrl %cl, %ebx
; FALLBACK16-NEXT:    orl %eax, %ebx
; FALLBACK16-NEXT:    movb %ch, %cl
; FALLBACK16-NEXT:    shll %cl, %edi
; FALLBACK16-NEXT:    shrl %esi
; FALLBACK16-NEXT:    movb %dl, %cl
; FALLBACK16-NEXT:    shrl %cl, %esi
; FALLBACK16-NEXT:    orl %edi, %esi
; FALLBACK16-NEXT:    movb %ch, %cl
; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; FALLBACK16-NEXT:    shll %cl, %edx
; FALLBACK16-NEXT:    movl {{[0-9]+}}(%esp), %eax
; FALLBACK16-NEXT:    movl %edx, (%eax)
; FALLBACK16-NEXT:    movl %esi, 24(%eax)
; FALLBACK16-NEXT:    movl %ebx, 28(%eax)
; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK16-NEXT:    movl %ecx, 16(%eax)
; FALLBACK16-NEXT:    movl %ebp, 20(%eax)
; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK16-NEXT:    movl %ecx, 8(%eax)
; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK16-NEXT:    movl %ecx, 12(%eax)
; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK16-NEXT:    movl %ecx, 4(%eax)
; FALLBACK16-NEXT:    addl $108, %esp
; FALLBACK16-NEXT:    popl %esi
; FALLBACK16-NEXT:    popl %edi
; FALLBACK16-NEXT:    popl %ebx
; FALLBACK16-NEXT:    popl %ebp
; FALLBACK16-NEXT:    retl
;
; FALLBACK17-LABEL: shl_32bytes:
; FALLBACK17:       # %bb.0:
; FALLBACK17-NEXT:    pushl %ebp
; FALLBACK17-NEXT:    pushl %ebx
; FALLBACK17-NEXT:    pushl %edi
; FALLBACK17-NEXT:    pushl %esi
; FALLBACK17-NEXT:    subl $92, %esp
; FALLBACK17-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; FALLBACK17-NEXT:    movl {{[0-9]+}}(%esp), %eax
; FALLBACK17-NEXT:    movl (%eax), %edx
; FALLBACK17-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK17-NEXT:    movl 4(%eax), %edx
; FALLBACK17-NEXT:    movl %edx, (%esp) # 4-byte Spill
; FALLBACK17-NEXT:    movl 8(%eax), %esi
; FALLBACK17-NEXT:    movl 12(%eax), %edi
; FALLBACK17-NEXT:    movl 16(%eax), %ebx
; FALLBACK17-NEXT:    movb (%ecx), %ch
; FALLBACK17-NEXT:    movl 20(%eax), %ebp
; FALLBACK17-NEXT:    movl 24(%eax), %edx
; FALLBACK17-NEXT:    movl 28(%eax), %eax
; FALLBACK17-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK17-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK17-NEXT:    movb %ch, %cl
; FALLBACK17-NEXT:    shlb $3, %cl
; FALLBACK17-NEXT:    xorps %xmm0, %xmm0
; FALLBACK17-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
; FALLBACK17-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
; FALLBACK17-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
; FALLBACK17-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
; FALLBACK17-NEXT:    movl %edi, {{[0-9]+}}(%esp)
; FALLBACK17-NEXT:    movl %esi, {{[0-9]+}}(%esp)
; FALLBACK17-NEXT:    movl (%esp), %eax # 4-byte Reload
; FALLBACK17-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK17-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK17-NEXT:    andb $28, %ch
; FALLBACK17-NEXT:    negb %ch
; FALLBACK17-NEXT:    movsbl %ch, %eax
; FALLBACK17-NEXT:    movl 56(%esp,%eax), %edx
; FALLBACK17-NEXT:    movl 60(%esp,%eax), %ebx
; FALLBACK17-NEXT:    movl %ebx, %esi
; FALLBACK17-NEXT:    shldl %cl, %edx, %esi
; FALLBACK17-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK17-NEXT:    movl 52(%esp,%eax), %esi
; FALLBACK17-NEXT:    movl %esi, (%esp) # 4-byte Spill
; FALLBACK17-NEXT:    shldl %cl, %esi, %edx
; FALLBACK17-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK17-NEXT:    movl 64(%esp,%eax), %edi
; FALLBACK17-NEXT:    movl 68(%esp,%eax), %ebp
; FALLBACK17-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK17-NEXT:    shldl %cl, %edi, %ebp
; FALLBACK17-NEXT:    shldl %cl, %ebx, %edi
; FALLBACK17-NEXT:    movl 48(%esp,%eax), %ebx
; FALLBACK17-NEXT:    movl 72(%esp,%eax), %edx
; FALLBACK17-NEXT:    movl 76(%esp,%eax), %esi
; FALLBACK17-NEXT:    shldl %cl, %edx, %esi
; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK17-NEXT:    shldl %cl, %eax, %edx
; FALLBACK17-NEXT:    movl {{[0-9]+}}(%esp), %eax
; FALLBACK17-NEXT:    movl %edx, 24(%eax)
; FALLBACK17-NEXT:    movl %esi, 28(%eax)
; FALLBACK17-NEXT:    movl %edi, 16(%eax)
; FALLBACK17-NEXT:    movl %ebp, 20(%eax)
; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; FALLBACK17-NEXT:    movl %edx, 8(%eax)
; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; FALLBACK17-NEXT:    movl %edx, 12(%eax)
; FALLBACK17-NEXT:    movl (%esp), %edx # 4-byte Reload
; FALLBACK17-NEXT:    shldl %cl, %ebx, %edx
; FALLBACK17-NEXT:    shll %cl, %ebx
; FALLBACK17-NEXT:    movl %ebx, (%eax)
; FALLBACK17-NEXT:    movl %edx, 4(%eax)
; FALLBACK17-NEXT:    addl $92, %esp
; FALLBACK17-NEXT:    popl %esi
; FALLBACK17-NEXT:    popl %edi
; FALLBACK17-NEXT:    popl %ebx
; FALLBACK17-NEXT:    popl %ebp
; FALLBACK17-NEXT:    retl
;
; FALLBACK18-LABEL: shl_32bytes:
; FALLBACK18:       # %bb.0:
; FALLBACK18-NEXT:    pushl %ebp
; FALLBACK18-NEXT:    pushl %ebx
; FALLBACK18-NEXT:    pushl %edi
; FALLBACK18-NEXT:    pushl %esi
; FALLBACK18-NEXT:    subl $108, %esp
; FALLBACK18-NEXT:    movl {{[0-9]+}}(%esp), %ebx
; FALLBACK18-NEXT:    movl {{[0-9]+}}(%esp), %eax
; FALLBACK18-NEXT:    movl (%eax), %ecx
; FALLBACK18-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK18-NEXT:    movl 4(%eax), %ecx
; FALLBACK18-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK18-NEXT:    movl 8(%eax), %esi
; FALLBACK18-NEXT:    movl 12(%eax), %edi
; FALLBACK18-NEXT:    movl 16(%eax), %ebp
; FALLBACK18-NEXT:    movzbl (%ebx), %ebx
; FALLBACK18-NEXT:    movl 20(%eax), %edx
; FALLBACK18-NEXT:    movl 24(%eax), %ecx
; FALLBACK18-NEXT:    movl 28(%eax), %eax
; FALLBACK18-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK18-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK18-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK18-NEXT:    movl %ebx, %edx
; FALLBACK18-NEXT:    shlb $3, %dl
; FALLBACK18-NEXT:    xorps %xmm0, %xmm0
; FALLBACK18-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
; FALLBACK18-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
; FALLBACK18-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
; FALLBACK18-NEXT:    movl %edi, {{[0-9]+}}(%esp)
; FALLBACK18-NEXT:    movl %esi, {{[0-9]+}}(%esp)
; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK18-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK18-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK18-NEXT:    andb $28, %bl
; FALLBACK18-NEXT:    negb %bl
; FALLBACK18-NEXT:    movsbl %bl, %esi
; FALLBACK18-NEXT:    movl 64(%esp,%esi), %ebx
; FALLBACK18-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK18-NEXT:    movl 68(%esp,%esi), %eax
; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK18-NEXT:    shlxl %edx, %eax, %edi
; FALLBACK18-NEXT:    movl %edx, %ecx
; FALLBACK18-NEXT:    notb %cl
; FALLBACK18-NEXT:    shrl %ebx
; FALLBACK18-NEXT:    shrxl %ecx, %ebx, %ebx
; FALLBACK18-NEXT:    orl %edi, %ebx
; FALLBACK18-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK18-NEXT:    movl 72(%esp,%esi), %ebx
; FALLBACK18-NEXT:    movl %ebx, %edi
; FALLBACK18-NEXT:    shrl %edi
; FALLBACK18-NEXT:    shrxl %ecx, %edi, %eax
; FALLBACK18-NEXT:    movl 76(%esp,%esi), %edi
; FALLBACK18-NEXT:    shlxl %edx, %edi, %ebp
; FALLBACK18-NEXT:    orl %ebp, %eax
; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK18-NEXT:    shlxl %edx, %ebx, %ebx
; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK18-NEXT:    shrl %eax
; FALLBACK18-NEXT:    shrxl %ecx, %eax, %eax
; FALLBACK18-NEXT:    orl %ebx, %eax
; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK18-NEXT:    movl 80(%esp,%esi), %ebx
; FALLBACK18-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK18-NEXT:    shrl %ebx
; FALLBACK18-NEXT:    shrxl %ecx, %ebx, %eax
; FALLBACK18-NEXT:    movl 84(%esp,%esi), %ebx
; FALLBACK18-NEXT:    shlxl %edx, %ebx, %ebp
; FALLBACK18-NEXT:    orl %ebp, %eax
; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK18-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; FALLBACK18-NEXT:    shrl %edi
; FALLBACK18-NEXT:    shrxl %ecx, %edi, %edi
; FALLBACK18-NEXT:    orl %eax, %edi
; FALLBACK18-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK18-NEXT:    shlxl %edx, 92(%esp,%esi), %ebp
; FALLBACK18-NEXT:    movl 88(%esp,%esi), %esi
; FALLBACK18-NEXT:    shlxl %edx, %esi, %eax
; FALLBACK18-NEXT:    shrl %esi
; FALLBACK18-NEXT:    shrxl %ecx, %esi, %esi
; FALLBACK18-NEXT:    orl %ebp, %esi
; FALLBACK18-NEXT:    shrl %ebx
; FALLBACK18-NEXT:    shrxl %ecx, %ebx, %edx
; FALLBACK18-NEXT:    orl %eax, %edx
; FALLBACK18-NEXT:    movl {{[0-9]+}}(%esp), %eax
; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK18-NEXT:    movl %ecx, (%eax)
; FALLBACK18-NEXT:    movl %edx, 24(%eax)
; FALLBACK18-NEXT:    movl %esi, 28(%eax)
; FALLBACK18-NEXT:    movl %edi, 16(%eax)
; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK18-NEXT:    movl %ecx, 20(%eax)
; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK18-NEXT:    movl %ecx, 8(%eax)
; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK18-NEXT:    movl %ecx, 12(%eax)
; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK18-NEXT:    movl %ecx, 4(%eax)
; FALLBACK18-NEXT:    addl $108, %esp
; FALLBACK18-NEXT:    popl %esi
; FALLBACK18-NEXT:    popl %edi
; FALLBACK18-NEXT:    popl %ebx
; FALLBACK18-NEXT:    popl %ebp
; FALLBACK18-NEXT:    retl
;
; FALLBACK19-LABEL: shl_32bytes:
; FALLBACK19:       # %bb.0:
; FALLBACK19-NEXT:    pushl %ebp
; FALLBACK19-NEXT:    pushl %ebx
; FALLBACK19-NEXT:    pushl %edi
; FALLBACK19-NEXT:    pushl %esi
; FALLBACK19-NEXT:    subl $92, %esp
; FALLBACK19-NEXT:    movl {{[0-9]+}}(%esp), %ebx
; FALLBACK19-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; FALLBACK19-NEXT:    movl (%ecx), %eax
; FALLBACK19-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK19-NEXT:    movl 4(%ecx), %eax
; FALLBACK19-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK19-NEXT:    movl 8(%ecx), %esi
; FALLBACK19-NEXT:    movl 12(%ecx), %edi
; FALLBACK19-NEXT:    movl 16(%ecx), %ebp
; FALLBACK19-NEXT:    movzbl (%ebx), %ebx
; FALLBACK19-NEXT:    movl 20(%ecx), %edx
; FALLBACK19-NEXT:    movl 24(%ecx), %eax
; FALLBACK19-NEXT:    movl 28(%ecx), %ecx
; FALLBACK19-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK19-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK19-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK19-NEXT:    movl %ebx, %ecx
; FALLBACK19-NEXT:    shlb $3, %cl
; FALLBACK19-NEXT:    xorps %xmm0, %xmm0
; FALLBACK19-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
; FALLBACK19-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
; FALLBACK19-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
; FALLBACK19-NEXT:    movl %edi, {{[0-9]+}}(%esp)
; FALLBACK19-NEXT:    movl %esi, {{[0-9]+}}(%esp)
; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK19-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK19-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK19-NEXT:    andb $28, %bl
; FALLBACK19-NEXT:    negb %bl
; FALLBACK19-NEXT:    movsbl %bl, %eax
; FALLBACK19-NEXT:    movl 56(%esp,%eax), %edx
; FALLBACK19-NEXT:    movl 60(%esp,%eax), %esi
; FALLBACK19-NEXT:    movl %esi, (%esp) # 4-byte Spill
; FALLBACK19-NEXT:    shldl %cl, %edx, %esi
; FALLBACK19-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK19-NEXT:    movl 52(%esp,%eax), %ebx
; FALLBACK19-NEXT:    shldl %cl, %ebx, %edx
; FALLBACK19-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK19-NEXT:    movl 64(%esp,%eax), %edi
; FALLBACK19-NEXT:    movl 68(%esp,%eax), %ebp
; FALLBACK19-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK19-NEXT:    shldl %cl, %edi, %ebp
; FALLBACK19-NEXT:    movl (%esp), %edx # 4-byte Reload
; FALLBACK19-NEXT:    shldl %cl, %edx, %edi
; FALLBACK19-NEXT:    movl 48(%esp,%eax), %edx
; FALLBACK19-NEXT:    movl %edx, (%esp) # 4-byte Spill
; FALLBACK19-NEXT:    movl 72(%esp,%eax), %edx
; FALLBACK19-NEXT:    movl 76(%esp,%eax), %esi
; FALLBACK19-NEXT:    shldl %cl, %edx, %esi
; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK19-NEXT:    shldl %cl, %eax, %edx
; FALLBACK19-NEXT:    movl {{[0-9]+}}(%esp), %eax
; FALLBACK19-NEXT:    movl %edx, 24(%eax)
; FALLBACK19-NEXT:    movl %esi, 28(%eax)
; FALLBACK19-NEXT:    movl %edi, 16(%eax)
; FALLBACK19-NEXT:    movl %ebp, 20(%eax)
; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; FALLBACK19-NEXT:    movl %edx, 8(%eax)
; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; FALLBACK19-NEXT:    movl %edx, 12(%eax)
; FALLBACK19-NEXT:    movl (%esp), %esi # 4-byte Reload
; FALLBACK19-NEXT:    shlxl %ecx, %esi, %edx
; FALLBACK19-NEXT:    movl %edx, (%eax)
; FALLBACK19-NEXT:    # kill: def $cl killed $cl killed $ecx
; FALLBACK19-NEXT:    shldl %cl, %esi, %ebx
; FALLBACK19-NEXT:    movl %ebx, 4(%eax)
; FALLBACK19-NEXT:    addl $92, %esp
; FALLBACK19-NEXT:    popl %esi
; FALLBACK19-NEXT:    popl %edi
; FALLBACK19-NEXT:    popl %ebx
; FALLBACK19-NEXT:    popl %ebp
; FALLBACK19-NEXT:    retl
;
; FALLBACK20-LABEL: shl_32bytes:
; FALLBACK20:       # %bb.0:
; FALLBACK20-NEXT:    pushl %ebp
; FALLBACK20-NEXT:    pushl %ebx
; FALLBACK20-NEXT:    pushl %edi
; FALLBACK20-NEXT:    pushl %esi
; FALLBACK20-NEXT:    subl $108, %esp
; FALLBACK20-NEXT:    movl {{[0-9]+}}(%esp), %eax
; FALLBACK20-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; FALLBACK20-NEXT:    movups (%ecx), %xmm0
; FALLBACK20-NEXT:    movups 16(%ecx), %xmm1
; FALLBACK20-NEXT:    movzbl (%eax), %ecx
; FALLBACK20-NEXT:    movb %cl, %dh
; FALLBACK20-NEXT:    shlb $3, %dh
; FALLBACK20-NEXT:    xorps %xmm2, %xmm2
; FALLBACK20-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
; FALLBACK20-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
; FALLBACK20-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
; FALLBACK20-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
; FALLBACK20-NEXT:    andb $28, %cl
; FALLBACK20-NEXT:    negb %cl
; FALLBACK20-NEXT:    movsbl %cl, %eax
; FALLBACK20-NEXT:    movl 84(%esp,%eax), %edi
; FALLBACK20-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK20-NEXT:    movb %dh, %cl
; FALLBACK20-NEXT:    shll %cl, %edi
; FALLBACK20-NEXT:    movb %dh, %dl
; FALLBACK20-NEXT:    notb %dl
; FALLBACK20-NEXT:    movl 80(%esp,%eax), %esi
; FALLBACK20-NEXT:    movl %eax, %ebx
; FALLBACK20-NEXT:    movl %esi, %eax
; FALLBACK20-NEXT:    shrl %eax
; FALLBACK20-NEXT:    movl %edx, %ecx
; FALLBACK20-NEXT:    shrl %cl, %eax
; FALLBACK20-NEXT:    orl %edi, %eax
; FALLBACK20-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK20-NEXT:    movb %dh, %cl
; FALLBACK20-NEXT:    shll %cl, %esi
; FALLBACK20-NEXT:    movl %ebx, %edi
; FALLBACK20-NEXT:    movl 76(%esp,%ebx), %ebp
; FALLBACK20-NEXT:    movl %ebp, %eax
; FALLBACK20-NEXT:    shrl %eax
; FALLBACK20-NEXT:    movl %edx, %ecx
; FALLBACK20-NEXT:    shrl %cl, %eax
; FALLBACK20-NEXT:    orl %esi, %eax
; FALLBACK20-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK20-NEXT:    movb %dh, %cl
; FALLBACK20-NEXT:    shll %cl, %ebp
; FALLBACK20-NEXT:    movl 72(%esp,%ebx), %ebx
; FALLBACK20-NEXT:    movl %ebx, %eax
; FALLBACK20-NEXT:    shrl %eax
; FALLBACK20-NEXT:    movl %edx, %ecx
; FALLBACK20-NEXT:    shrl %cl, %eax
; FALLBACK20-NEXT:    orl %ebp, %eax
; FALLBACK20-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK20-NEXT:    movb %dh, %cl
; FALLBACK20-NEXT:    shll %cl, %ebx
; FALLBACK20-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK20-NEXT:    movl 68(%esp,%edi), %ebp
; FALLBACK20-NEXT:    movl %ebp, %esi
; FALLBACK20-NEXT:    shrl %esi
; FALLBACK20-NEXT:    movl %edx, %ecx
; FALLBACK20-NEXT:    shrl %cl, %esi
; FALLBACK20-NEXT:    orl %ebx, %esi
; FALLBACK20-NEXT:    movb %dh, %cl
; FALLBACK20-NEXT:    shll %cl, %ebp
; FALLBACK20-NEXT:    movl 64(%esp,%edi), %ebx
; FALLBACK20-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK20-NEXT:    shrl %ebx
; FALLBACK20-NEXT:    movl %edx, %ecx
; FALLBACK20-NEXT:    shrl %cl, %ebx
; FALLBACK20-NEXT:    orl %ebp, %ebx
; FALLBACK20-NEXT:    movl 88(%esp,%edi), %ebp
; FALLBACK20-NEXT:    movl %ebp, %edi
; FALLBACK20-NEXT:    movb %dh, %cl
; FALLBACK20-NEXT:    shll %cl, %edi
; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK20-NEXT:    shrl %eax
; FALLBACK20-NEXT:    movl %edx, %ecx
; FALLBACK20-NEXT:    shrl %cl, %eax
; FALLBACK20-NEXT:    orl %edi, %eax
; FALLBACK20-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK20-NEXT:    movl 92(%esp,%eax), %edi
; FALLBACK20-NEXT:    movb %dh, %cl
; FALLBACK20-NEXT:    shll %cl, %edi
; FALLBACK20-NEXT:    shrl %ebp
; FALLBACK20-NEXT:    movl %edx, %ecx
; FALLBACK20-NEXT:    shrl %cl, %ebp
; FALLBACK20-NEXT:    orl %edi, %ebp
; FALLBACK20-NEXT:    movb %dh, %cl
; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; FALLBACK20-NEXT:    shll %cl, %edx
; FALLBACK20-NEXT:    movl {{[0-9]+}}(%esp), %eax
; FALLBACK20-NEXT:    movl %edx, (%eax)
; FALLBACK20-NEXT:    movl %ebp, 28(%eax)
; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK20-NEXT:    movl %ecx, 24(%eax)
; FALLBACK20-NEXT:    movl %ebx, 4(%eax)
; FALLBACK20-NEXT:    movl %esi, 8(%eax)
; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK20-NEXT:    movl %ecx, 12(%eax)
; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK20-NEXT:    movl %ecx, 16(%eax)
; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK20-NEXT:    movl %ecx, 20(%eax)
; FALLBACK20-NEXT:    addl $108, %esp
; FALLBACK20-NEXT:    popl %esi
; FALLBACK20-NEXT:    popl %edi
; FALLBACK20-NEXT:    popl %ebx
; FALLBACK20-NEXT:    popl %ebp
; FALLBACK20-NEXT:    retl
;
; FALLBACK21-LABEL: shl_32bytes:
; FALLBACK21:       # %bb.0:
; FALLBACK21-NEXT:    pushl %ebp
; FALLBACK21-NEXT:    pushl %ebx
; FALLBACK21-NEXT:    pushl %edi
; FALLBACK21-NEXT:    pushl %esi
; FALLBACK21-NEXT:    subl $92, %esp
; FALLBACK21-NEXT:    movl {{[0-9]+}}(%esp), %eax
; FALLBACK21-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; FALLBACK21-NEXT:    movups (%ecx), %xmm0
; FALLBACK21-NEXT:    movups 16(%ecx), %xmm1
; FALLBACK21-NEXT:    movzbl (%eax), %eax
; FALLBACK21-NEXT:    movl %eax, %ecx
; FALLBACK21-NEXT:    shlb $3, %cl
; FALLBACK21-NEXT:    xorps %xmm2, %xmm2
; FALLBACK21-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
; FALLBACK21-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
; FALLBACK21-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
; FALLBACK21-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
; FALLBACK21-NEXT:    andb $28, %al
; FALLBACK21-NEXT:    negb %al
; FALLBACK21-NEXT:    movsbl %al, %ebp
; FALLBACK21-NEXT:    movl 64(%esp,%ebp), %eax
; FALLBACK21-NEXT:    movl 68(%esp,%ebp), %edx
; FALLBACK21-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK21-NEXT:    shldl %cl, %eax, %edx
; FALLBACK21-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK21-NEXT:    movl 60(%esp,%ebp), %edx
; FALLBACK21-NEXT:    shldl %cl, %edx, %eax
; FALLBACK21-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK21-NEXT:    movl 56(%esp,%ebp), %edi
; FALLBACK21-NEXT:    shldl %cl, %edi, %edx
; FALLBACK21-NEXT:    movl %edx, (%esp) # 4-byte Spill
; FALLBACK21-NEXT:    movl 52(%esp,%ebp), %ebx
; FALLBACK21-NEXT:    shldl %cl, %ebx, %edi
; FALLBACK21-NEXT:    movl 72(%esp,%ebp), %edx
; FALLBACK21-NEXT:    movl %edx, %eax
; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; FALLBACK21-NEXT:    shldl %cl, %esi, %eax
; FALLBACK21-NEXT:    movl 48(%esp,%ebp), %esi
; FALLBACK21-NEXT:    movl 76(%esp,%ebp), %ebp
; FALLBACK21-NEXT:    shldl %cl, %edx, %ebp
; FALLBACK21-NEXT:    movl {{[0-9]+}}(%esp), %edx
; FALLBACK21-NEXT:    movl %ebp, 28(%edx)
; FALLBACK21-NEXT:    movl %eax, 24(%edx)
; FALLBACK21-NEXT:    movl %esi, %eax
; FALLBACK21-NEXT:    shll %cl, %eax
; FALLBACK21-NEXT:    shldl %cl, %esi, %ebx
; FALLBACK21-NEXT:    movl %ebx, 4(%edx)
; FALLBACK21-NEXT:    movl %edi, 8(%edx)
; FALLBACK21-NEXT:    movl (%esp), %ecx # 4-byte Reload
; FALLBACK21-NEXT:    movl %ecx, 12(%edx)
; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK21-NEXT:    movl %ecx, 16(%edx)
; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK21-NEXT:    movl %ecx, 20(%edx)
; FALLBACK21-NEXT:    movl %eax, (%edx)
; FALLBACK21-NEXT:    addl $92, %esp
; FALLBACK21-NEXT:    popl %esi
; FALLBACK21-NEXT:    popl %edi
; FALLBACK21-NEXT:    popl %ebx
; FALLBACK21-NEXT:    popl %ebp
; FALLBACK21-NEXT:    retl
;
; FALLBACK22-LABEL: shl_32bytes:
; FALLBACK22:       # %bb.0:
; FALLBACK22-NEXT:    pushl %ebp
; FALLBACK22-NEXT:    pushl %ebx
; FALLBACK22-NEXT:    pushl %edi
; FALLBACK22-NEXT:    pushl %esi
; FALLBACK22-NEXT:    subl $108, %esp
; FALLBACK22-NEXT:    movl {{[0-9]+}}(%esp), %eax
; FALLBACK22-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; FALLBACK22-NEXT:    movups (%ecx), %xmm0
; FALLBACK22-NEXT:    movups 16(%ecx), %xmm1
; FALLBACK22-NEXT:    movzbl (%eax), %ecx
; FALLBACK22-NEXT:    movl %ecx, %eax
; FALLBACK22-NEXT:    shlb $3, %al
; FALLBACK22-NEXT:    xorps %xmm2, %xmm2
; FALLBACK22-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
; FALLBACK22-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
; FALLBACK22-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
; FALLBACK22-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
; FALLBACK22-NEXT:    andb $28, %cl
; FALLBACK22-NEXT:    negb %cl
; FALLBACK22-NEXT:    movsbl %cl, %edx
; FALLBACK22-NEXT:    movl 84(%esp,%edx), %ecx
; FALLBACK22-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK22-NEXT:    shlxl %eax, %ecx, %ecx
; FALLBACK22-NEXT:    movl 80(%esp,%edx), %esi
; FALLBACK22-NEXT:    shlxl %eax, %esi, %edi
; FALLBACK22-NEXT:    movl %eax, %ebx
; FALLBACK22-NEXT:    notb %bl
; FALLBACK22-NEXT:    shrl %esi
; FALLBACK22-NEXT:    shrxl %ebx, %esi, %esi
; FALLBACK22-NEXT:    orl %ecx, %esi
; FALLBACK22-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK22-NEXT:    movl 76(%esp,%edx), %ecx
; FALLBACK22-NEXT:    movl %ecx, %esi
; FALLBACK22-NEXT:    shrl %esi
; FALLBACK22-NEXT:    shrxl %ebx, %esi, %esi
; FALLBACK22-NEXT:    orl %edi, %esi
; FALLBACK22-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK22-NEXT:    shlxl %eax, %ecx, %ecx
; FALLBACK22-NEXT:    movl 72(%esp,%edx), %esi
; FALLBACK22-NEXT:    movl %esi, %edi
; FALLBACK22-NEXT:    shrl %edi
; FALLBACK22-NEXT:    shrxl %ebx, %edi, %edi
; FALLBACK22-NEXT:    orl %ecx, %edi
; FALLBACK22-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK22-NEXT:    shlxl %eax, %esi, %ecx
; FALLBACK22-NEXT:    movl 68(%esp,%edx), %esi
; FALLBACK22-NEXT:    movl %esi, %edi
; FALLBACK22-NEXT:    shrl %edi
; FALLBACK22-NEXT:    shrxl %ebx, %edi, %ebp
; FALLBACK22-NEXT:    orl %ecx, %ebp
; FALLBACK22-NEXT:    shlxl %eax, %esi, %edi
; FALLBACK22-NEXT:    movl 64(%esp,%edx), %esi
; FALLBACK22-NEXT:    movl %esi, %ecx
; FALLBACK22-NEXT:    shrl %ecx
; FALLBACK22-NEXT:    shrxl %ebx, %ecx, %ecx
; FALLBACK22-NEXT:    orl %edi, %ecx
; FALLBACK22-NEXT:    shlxl %eax, %esi, %esi
; FALLBACK22-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK22-NEXT:    shlxl %eax, 92(%esp,%edx), %edi
; FALLBACK22-NEXT:    movl 88(%esp,%edx), %edx
; FALLBACK22-NEXT:    shlxl %eax, %edx, %esi
; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK22-NEXT:    shrl %eax
; FALLBACK22-NEXT:    shrxl %ebx, %eax, %eax
; FALLBACK22-NEXT:    orl %esi, %eax
; FALLBACK22-NEXT:    shrl %edx
; FALLBACK22-NEXT:    shrxl %ebx, %edx, %edx
; FALLBACK22-NEXT:    orl %edi, %edx
; FALLBACK22-NEXT:    movl {{[0-9]+}}(%esp), %esi
; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
; FALLBACK22-NEXT:    movl %edi, (%esi)
; FALLBACK22-NEXT:    movl %edx, 28(%esi)
; FALLBACK22-NEXT:    movl %eax, 24(%esi)
; FALLBACK22-NEXT:    movl %ecx, 4(%esi)
; FALLBACK22-NEXT:    movl %ebp, 8(%esi)
; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK22-NEXT:    movl %eax, 12(%esi)
; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK22-NEXT:    movl %eax, 16(%esi)
; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK22-NEXT:    movl %eax, 20(%esi)
; FALLBACK22-NEXT:    addl $108, %esp
; FALLBACK22-NEXT:    popl %esi
; FALLBACK22-NEXT:    popl %edi
; FALLBACK22-NEXT:    popl %ebx
; FALLBACK22-NEXT:    popl %ebp
; FALLBACK22-NEXT:    retl
;
; FALLBACK23-LABEL: shl_32bytes:
; FALLBACK23:       # %bb.0:
; FALLBACK23-NEXT:    pushl %ebp
; FALLBACK23-NEXT:    pushl %ebx
; FALLBACK23-NEXT:    pushl %edi
; FALLBACK23-NEXT:    pushl %esi
; FALLBACK23-NEXT:    subl $92, %esp
; FALLBACK23-NEXT:    movl {{[0-9]+}}(%esp), %eax
; FALLBACK23-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; FALLBACK23-NEXT:    movups (%ecx), %xmm0
; FALLBACK23-NEXT:    movups 16(%ecx), %xmm1
; FALLBACK23-NEXT:    movzbl (%eax), %eax
; FALLBACK23-NEXT:    movl %eax, %ecx
; FALLBACK23-NEXT:    shlb $3, %cl
; FALLBACK23-NEXT:    xorps %xmm2, %xmm2
; FALLBACK23-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
; FALLBACK23-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
; FALLBACK23-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
; FALLBACK23-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
; FALLBACK23-NEXT:    andb $28, %al
; FALLBACK23-NEXT:    negb %al
; FALLBACK23-NEXT:    movsbl %al, %ebx
; FALLBACK23-NEXT:    movl 64(%esp,%ebx), %eax
; FALLBACK23-NEXT:    movl 68(%esp,%ebx), %edx
; FALLBACK23-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK23-NEXT:    shldl %cl, %eax, %edx
; FALLBACK23-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK23-NEXT:    movl 60(%esp,%ebx), %edx
; FALLBACK23-NEXT:    shldl %cl, %edx, %eax
; FALLBACK23-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK23-NEXT:    movl 56(%esp,%ebx), %edi
; FALLBACK23-NEXT:    shldl %cl, %edi, %edx
; FALLBACK23-NEXT:    movl %edx, (%esp) # 4-byte Spill
; FALLBACK23-NEXT:    movl 52(%esp,%ebx), %ebp
; FALLBACK23-NEXT:    shldl %cl, %ebp, %edi
; FALLBACK23-NEXT:    movl 72(%esp,%ebx), %edx
; FALLBACK23-NEXT:    movl %edx, %eax
; FALLBACK23-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; FALLBACK23-NEXT:    shldl %cl, %esi, %eax
; FALLBACK23-NEXT:    movl 48(%esp,%ebx), %esi
; FALLBACK23-NEXT:    movl 76(%esp,%ebx), %ebx
; FALLBACK23-NEXT:    shldl %cl, %edx, %ebx
; FALLBACK23-NEXT:    movl {{[0-9]+}}(%esp), %edx
; FALLBACK23-NEXT:    movl %ebx, 28(%edx)
; FALLBACK23-NEXT:    movl %eax, 24(%edx)
; FALLBACK23-NEXT:    shlxl %ecx, %esi, %eax
; FALLBACK23-NEXT:    # kill: def $cl killed $cl killed $ecx
; FALLBACK23-NEXT:    shldl %cl, %esi, %ebp
; FALLBACK23-NEXT:    movl %ebp, 4(%edx)
; FALLBACK23-NEXT:    movl %edi, 8(%edx)
; FALLBACK23-NEXT:    movl (%esp), %ecx # 4-byte Reload
; FALLBACK23-NEXT:    movl %ecx, 12(%edx)
; FALLBACK23-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK23-NEXT:    movl %ecx, 16(%edx)
; FALLBACK23-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK23-NEXT:    movl %ecx, 20(%edx)
; FALLBACK23-NEXT:    movl %eax, (%edx)
; FALLBACK23-NEXT:    addl $92, %esp
; FALLBACK23-NEXT:    popl %esi
; FALLBACK23-NEXT:    popl %edi
; FALLBACK23-NEXT:    popl %ebx
; FALLBACK23-NEXT:    popl %ebp
; FALLBACK23-NEXT:    retl
;
; FALLBACK24-LABEL: shl_32bytes:
; FALLBACK24:       # %bb.0:
; FALLBACK24-NEXT:    pushl %ebp
; FALLBACK24-NEXT:    pushl %ebx
; FALLBACK24-NEXT:    pushl %edi
; FALLBACK24-NEXT:    pushl %esi
; FALLBACK24-NEXT:    subl $108, %esp
; FALLBACK24-NEXT:    movl {{[0-9]+}}(%esp), %eax
; FALLBACK24-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; FALLBACK24-NEXT:    vmovups (%ecx), %ymm0
; FALLBACK24-NEXT:    movzbl (%eax), %ecx
; FALLBACK24-NEXT:    movb %cl, %dh
; FALLBACK24-NEXT:    shlb $3, %dh
; FALLBACK24-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; FALLBACK24-NEXT:    vmovups %ymm1, {{[0-9]+}}(%esp)
; FALLBACK24-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
; FALLBACK24-NEXT:    andb $28, %cl
; FALLBACK24-NEXT:    negb %cl
; FALLBACK24-NEXT:    movsbl %cl, %eax
; FALLBACK24-NEXT:    movl 84(%esp,%eax), %edi
; FALLBACK24-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK24-NEXT:    movb %dh, %cl
; FALLBACK24-NEXT:    shll %cl, %edi
; FALLBACK24-NEXT:    movb %dh, %dl
; FALLBACK24-NEXT:    notb %dl
; FALLBACK24-NEXT:    movl 80(%esp,%eax), %esi
; FALLBACK24-NEXT:    movl %eax, %ebx
; FALLBACK24-NEXT:    movl %esi, %eax
; FALLBACK24-NEXT:    shrl %eax
; FALLBACK24-NEXT:    movl %edx, %ecx
; FALLBACK24-NEXT:    shrl %cl, %eax
; FALLBACK24-NEXT:    orl %edi, %eax
; FALLBACK24-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK24-NEXT:    movb %dh, %cl
; FALLBACK24-NEXT:    shll %cl, %esi
; FALLBACK24-NEXT:    movl %ebx, %edi
; FALLBACK24-NEXT:    movl 76(%esp,%ebx), %ebp
; FALLBACK24-NEXT:    movl %ebp, %eax
; FALLBACK24-NEXT:    shrl %eax
; FALLBACK24-NEXT:    movl %edx, %ecx
; FALLBACK24-NEXT:    shrl %cl, %eax
; FALLBACK24-NEXT:    orl %esi, %eax
; FALLBACK24-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK24-NEXT:    movb %dh, %cl
; FALLBACK24-NEXT:    shll %cl, %ebp
; FALLBACK24-NEXT:    movl 72(%esp,%ebx), %ebx
; FALLBACK24-NEXT:    movl %ebx, %eax
; FALLBACK24-NEXT:    shrl %eax
; FALLBACK24-NEXT:    movl %edx, %ecx
; FALLBACK24-NEXT:    shrl %cl, %eax
; FALLBACK24-NEXT:    orl %ebp, %eax
; FALLBACK24-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK24-NEXT:    movb %dh, %cl
; FALLBACK24-NEXT:    shll %cl, %ebx
; FALLBACK24-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK24-NEXT:    movl 68(%esp,%edi), %ebp
; FALLBACK24-NEXT:    movl %ebp, %esi
; FALLBACK24-NEXT:    shrl %esi
; FALLBACK24-NEXT:    movl %edx, %ecx
; FALLBACK24-NEXT:    shrl %cl, %esi
; FALLBACK24-NEXT:    orl %ebx, %esi
; FALLBACK24-NEXT:    movb %dh, %cl
; FALLBACK24-NEXT:    shll %cl, %ebp
; FALLBACK24-NEXT:    movl 64(%esp,%edi), %ebx
; FALLBACK24-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK24-NEXT:    shrl %ebx
; FALLBACK24-NEXT:    movl %edx, %ecx
; FALLBACK24-NEXT:    shrl %cl, %ebx
; FALLBACK24-NEXT:    orl %ebp, %ebx
; FALLBACK24-NEXT:    movl 88(%esp,%edi), %ebp
; FALLBACK24-NEXT:    movl %ebp, %edi
; FALLBACK24-NEXT:    movb %dh, %cl
; FALLBACK24-NEXT:    shll %cl, %edi
; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK24-NEXT:    shrl %eax
; FALLBACK24-NEXT:    movl %edx, %ecx
; FALLBACK24-NEXT:    shrl %cl, %eax
; FALLBACK24-NEXT:    orl %edi, %eax
; FALLBACK24-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK24-NEXT:    movl 92(%esp,%eax), %edi
; FALLBACK24-NEXT:    movb %dh, %cl
; FALLBACK24-NEXT:    shll %cl, %edi
; FALLBACK24-NEXT:    shrl %ebp
; FALLBACK24-NEXT:    movl %edx, %ecx
; FALLBACK24-NEXT:    shrl %cl, %ebp
; FALLBACK24-NEXT:    orl %edi, %ebp
; FALLBACK24-NEXT:    movb %dh, %cl
; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; FALLBACK24-NEXT:    shll %cl, %edx
; FALLBACK24-NEXT:    movl {{[0-9]+}}(%esp), %eax
; FALLBACK24-NEXT:    movl %edx, (%eax)
; FALLBACK24-NEXT:    movl %ebp, 28(%eax)
; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK24-NEXT:    movl %ecx, 24(%eax)
; FALLBACK24-NEXT:    movl %ebx, 4(%eax)
; FALLBACK24-NEXT:    movl %esi, 8(%eax)
; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK24-NEXT:    movl %ecx, 12(%eax)
; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK24-NEXT:    movl %ecx, 16(%eax)
; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK24-NEXT:    movl %ecx, 20(%eax)
; FALLBACK24-NEXT:    addl $108, %esp
; FALLBACK24-NEXT:    popl %esi
; FALLBACK24-NEXT:    popl %edi
; FALLBACK24-NEXT:    popl %ebx
; FALLBACK24-NEXT:    popl %ebp
; FALLBACK24-NEXT:    vzeroupper
; FALLBACK24-NEXT:    retl
;
; FALLBACK25-LABEL: shl_32bytes:
; FALLBACK25:       # %bb.0:
; FALLBACK25-NEXT:    pushl %ebp
; FALLBACK25-NEXT:    pushl %ebx
; FALLBACK25-NEXT:    pushl %edi
; FALLBACK25-NEXT:    pushl %esi
; FALLBACK25-NEXT:    subl $92, %esp
; FALLBACK25-NEXT:    movl {{[0-9]+}}(%esp), %eax
; FALLBACK25-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; FALLBACK25-NEXT:    vmovups (%ecx), %ymm0
; FALLBACK25-NEXT:    movzbl (%eax), %eax
; FALLBACK25-NEXT:    movl %eax, %ecx
; FALLBACK25-NEXT:    shlb $3, %cl
; FALLBACK25-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; FALLBACK25-NEXT:    vmovups %ymm1, {{[0-9]+}}(%esp)
; FALLBACK25-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
; FALLBACK25-NEXT:    andb $28, %al
; FALLBACK25-NEXT:    negb %al
; FALLBACK25-NEXT:    movsbl %al, %ebp
; FALLBACK25-NEXT:    movl 64(%esp,%ebp), %eax
; FALLBACK25-NEXT:    movl 68(%esp,%ebp), %edx
; FALLBACK25-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK25-NEXT:    shldl %cl, %eax, %edx
; FALLBACK25-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK25-NEXT:    movl 60(%esp,%ebp), %edx
; FALLBACK25-NEXT:    shldl %cl, %edx, %eax
; FALLBACK25-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK25-NEXT:    movl 56(%esp,%ebp), %edi
; FALLBACK25-NEXT:    shldl %cl, %edi, %edx
; FALLBACK25-NEXT:    movl %edx, (%esp) # 4-byte Spill
; FALLBACK25-NEXT:    movl 52(%esp,%ebp), %ebx
; FALLBACK25-NEXT:    shldl %cl, %ebx, %edi
; FALLBACK25-NEXT:    movl 72(%esp,%ebp), %edx
; FALLBACK25-NEXT:    movl %edx, %eax
; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; FALLBACK25-NEXT:    shldl %cl, %esi, %eax
; FALLBACK25-NEXT:    movl 48(%esp,%ebp), %esi
; FALLBACK25-NEXT:    movl 76(%esp,%ebp), %ebp
; FALLBACK25-NEXT:    shldl %cl, %edx, %ebp
; FALLBACK25-NEXT:    movl {{[0-9]+}}(%esp), %edx
; FALLBACK25-NEXT:    movl %ebp, 28(%edx)
; FALLBACK25-NEXT:    movl %eax, 24(%edx)
; FALLBACK25-NEXT:    movl %esi, %eax
; FALLBACK25-NEXT:    shll %cl, %eax
; FALLBACK25-NEXT:    shldl %cl, %esi, %ebx
; FALLBACK25-NEXT:    movl %ebx, 4(%edx)
; FALLBACK25-NEXT:    movl %edi, 8(%edx)
; FALLBACK25-NEXT:    movl (%esp), %ecx # 4-byte Reload
; FALLBACK25-NEXT:    movl %ecx, 12(%edx)
; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK25-NEXT:    movl %ecx, 16(%edx)
; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK25-NEXT:    movl %ecx, 20(%edx)
; FALLBACK25-NEXT:    movl %eax, (%edx)
; FALLBACK25-NEXT:    addl $92, %esp
; FALLBACK25-NEXT:    popl %esi
; FALLBACK25-NEXT:    popl %edi
; FALLBACK25-NEXT:    popl %ebx
; FALLBACK25-NEXT:    popl %ebp
; FALLBACK25-NEXT:    vzeroupper
; FALLBACK25-NEXT:    retl
;
; FALLBACK26-LABEL: shl_32bytes:
; FALLBACK26:       # %bb.0:
; FALLBACK26-NEXT:    pushl %ebp
; FALLBACK26-NEXT:    pushl %ebx
; FALLBACK26-NEXT:    pushl %edi
; FALLBACK26-NEXT:    pushl %esi
; FALLBACK26-NEXT:    subl $108, %esp
; FALLBACK26-NEXT:    movl {{[0-9]+}}(%esp), %eax
; FALLBACK26-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; FALLBACK26-NEXT:    vmovups (%ecx), %ymm0
; FALLBACK26-NEXT:    movzbl (%eax), %ecx
; FALLBACK26-NEXT:    movl %ecx, %eax
; FALLBACK26-NEXT:    shlb $3, %al
; FALLBACK26-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; FALLBACK26-NEXT:    vmovups %ymm1, {{[0-9]+}}(%esp)
; FALLBACK26-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
; FALLBACK26-NEXT:    andb $28, %cl
; FALLBACK26-NEXT:    negb %cl
; FALLBACK26-NEXT:    movsbl %cl, %edx
; FALLBACK26-NEXT:    movl 84(%esp,%edx), %ecx
; FALLBACK26-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK26-NEXT:    shlxl %eax, %ecx, %ecx
; FALLBACK26-NEXT:    movl 80(%esp,%edx), %esi
; FALLBACK26-NEXT:    shlxl %eax, %esi, %edi
; FALLBACK26-NEXT:    movl %eax, %ebx
; FALLBACK26-NEXT:    notb %bl
; FALLBACK26-NEXT:    shrl %esi
; FALLBACK26-NEXT:    shrxl %ebx, %esi, %esi
; FALLBACK26-NEXT:    orl %ecx, %esi
; FALLBACK26-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK26-NEXT:    movl 76(%esp,%edx), %ecx
; FALLBACK26-NEXT:    movl %ecx, %esi
; FALLBACK26-NEXT:    shrl %esi
; FALLBACK26-NEXT:    shrxl %ebx, %esi, %esi
; FALLBACK26-NEXT:    orl %edi, %esi
; FALLBACK26-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK26-NEXT:    shlxl %eax, %ecx, %ecx
; FALLBACK26-NEXT:    movl 72(%esp,%edx), %esi
; FALLBACK26-NEXT:    movl %esi, %edi
; FALLBACK26-NEXT:    shrl %edi
; FALLBACK26-NEXT:    shrxl %ebx, %edi, %edi
; FALLBACK26-NEXT:    orl %ecx, %edi
; FALLBACK26-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK26-NEXT:    shlxl %eax, %esi, %ecx
; FALLBACK26-NEXT:    movl 68(%esp,%edx), %esi
; FALLBACK26-NEXT:    movl %esi, %edi
; FALLBACK26-NEXT:    shrl %edi
; FALLBACK26-NEXT:    shrxl %ebx, %edi, %ebp
; FALLBACK26-NEXT:    orl %ecx, %ebp
; FALLBACK26-NEXT:    shlxl %eax, %esi, %edi
; FALLBACK26-NEXT:    movl 64(%esp,%edx), %esi
; FALLBACK26-NEXT:    movl %esi, %ecx
; FALLBACK26-NEXT:    shrl %ecx
; FALLBACK26-NEXT:    shrxl %ebx, %ecx, %ecx
; FALLBACK26-NEXT:    orl %edi, %ecx
; FALLBACK26-NEXT:    shlxl %eax, %esi, %esi
; FALLBACK26-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK26-NEXT:    shlxl %eax, 92(%esp,%edx), %edi
; FALLBACK26-NEXT:    movl 88(%esp,%edx), %edx
; FALLBACK26-NEXT:    shlxl %eax, %edx, %esi
; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK26-NEXT:    shrl %eax
; FALLBACK26-NEXT:    shrxl %ebx, %eax, %eax
; FALLBACK26-NEXT:    orl %esi, %eax
; FALLBACK26-NEXT:    shrl %edx
; FALLBACK26-NEXT:    shrxl %ebx, %edx, %edx
; FALLBACK26-NEXT:    orl %edi, %edx
; FALLBACK26-NEXT:    movl {{[0-9]+}}(%esp), %esi
; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
; FALLBACK26-NEXT:    movl %edi, (%esi)
; FALLBACK26-NEXT:    movl %edx, 28(%esi)
; FALLBACK26-NEXT:    movl %eax, 24(%esi)
; FALLBACK26-NEXT:    movl %ecx, 4(%esi)
; FALLBACK26-NEXT:    movl %ebp, 8(%esi)
; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK26-NEXT:    movl %eax, 12(%esi)
; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK26-NEXT:    movl %eax, 16(%esi)
; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK26-NEXT:    movl %eax, 20(%esi)
; FALLBACK26-NEXT:    addl $108, %esp
; FALLBACK26-NEXT:    popl %esi
; FALLBACK26-NEXT:    popl %edi
; FALLBACK26-NEXT:    popl %ebx
; FALLBACK26-NEXT:    popl %ebp
; FALLBACK26-NEXT:    vzeroupper
; FALLBACK26-NEXT:    retl
;
; FALLBACK27-LABEL: shl_32bytes:
; FALLBACK27:       # %bb.0:
; FALLBACK27-NEXT:    pushl %ebp
; FALLBACK27-NEXT:    pushl %ebx
; FALLBACK27-NEXT:    pushl %edi
; FALLBACK27-NEXT:    pushl %esi
; FALLBACK27-NEXT:    subl $92, %esp
; FALLBACK27-NEXT:    movl {{[0-9]+}}(%esp), %eax
; FALLBACK27-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; FALLBACK27-NEXT:    vmovups (%ecx), %ymm0
; FALLBACK27-NEXT:    movzbl (%eax), %eax
; FALLBACK27-NEXT:    movl %eax, %ecx
; FALLBACK27-NEXT:    shlb $3, %cl
; FALLBACK27-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; FALLBACK27-NEXT:    vmovups %ymm1, {{[0-9]+}}(%esp)
; FALLBACK27-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
; FALLBACK27-NEXT:    andb $28, %al
; FALLBACK27-NEXT:    negb %al
; FALLBACK27-NEXT:    movsbl %al, %ebx
; FALLBACK27-NEXT:    movl 64(%esp,%ebx), %eax
; FALLBACK27-NEXT:    movl 68(%esp,%ebx), %edx
; FALLBACK27-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK27-NEXT:    shldl %cl, %eax, %edx
; FALLBACK27-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK27-NEXT:    movl 60(%esp,%ebx), %edx
; FALLBACK27-NEXT:    shldl %cl, %edx, %eax
; FALLBACK27-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK27-NEXT:    movl 56(%esp,%ebx), %edi
; FALLBACK27-NEXT:    shldl %cl, %edi, %edx
; FALLBACK27-NEXT:    movl %edx, (%esp) # 4-byte Spill
; FALLBACK27-NEXT:    movl 52(%esp,%ebx), %ebp
; FALLBACK27-NEXT:    shldl %cl, %ebp, %edi
; FALLBACK27-NEXT:    movl 72(%esp,%ebx), %edx
; FALLBACK27-NEXT:    movl %edx, %eax
; FALLBACK27-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; FALLBACK27-NEXT:    shldl %cl, %esi, %eax
; FALLBACK27-NEXT:    movl 48(%esp,%ebx), %esi
; FALLBACK27-NEXT:    movl 76(%esp,%ebx), %ebx
; FALLBACK27-NEXT:    shldl %cl, %edx, %ebx
; FALLBACK27-NEXT:    movl {{[0-9]+}}(%esp), %edx
; FALLBACK27-NEXT:    movl %ebx, 28(%edx)
; FALLBACK27-NEXT:    movl %eax, 24(%edx)
; FALLBACK27-NEXT:    shlxl %ecx, %esi, %eax
; FALLBACK27-NEXT:    # kill: def $cl killed $cl killed $ecx
; FALLBACK27-NEXT:    shldl %cl, %esi, %ebp
; FALLBACK27-NEXT:    movl %ebp, 4(%edx)
; FALLBACK27-NEXT:    movl %edi, 8(%edx)
; FALLBACK27-NEXT:    movl (%esp), %ecx # 4-byte Reload
; FALLBACK27-NEXT:    movl %ecx, 12(%edx)
; FALLBACK27-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK27-NEXT:    movl %ecx, 16(%edx)
; FALLBACK27-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK27-NEXT:    movl %ecx, 20(%edx)
; FALLBACK27-NEXT:    movl %eax, (%edx)
; FALLBACK27-NEXT:    addl $92, %esp
; FALLBACK27-NEXT:    popl %esi
; FALLBACK27-NEXT:    popl %edi
; FALLBACK27-NEXT:    popl %ebx
; FALLBACK27-NEXT:    popl %ebp
; FALLBACK27-NEXT:    vzeroupper
; FALLBACK27-NEXT:    retl
;
; FALLBACK28-LABEL: shl_32bytes:
; FALLBACK28:       # %bb.0:
; FALLBACK28-NEXT:    pushl %ebp
; FALLBACK28-NEXT:    pushl %ebx
; FALLBACK28-NEXT:    pushl %edi
; FALLBACK28-NEXT:    pushl %esi
; FALLBACK28-NEXT:    subl $108, %esp
; FALLBACK28-NEXT:    movl {{[0-9]+}}(%esp), %eax
; FALLBACK28-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; FALLBACK28-NEXT:    vmovups (%ecx), %ymm0
; FALLBACK28-NEXT:    movzbl (%eax), %ecx
; FALLBACK28-NEXT:    movb %cl, %dh
; FALLBACK28-NEXT:    shlb $3, %dh
; FALLBACK28-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; FALLBACK28-NEXT:    vmovups %ymm1, {{[0-9]+}}(%esp)
; FALLBACK28-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
; FALLBACK28-NEXT:    andb $28, %cl
; FALLBACK28-NEXT:    negb %cl
; FALLBACK28-NEXT:    movsbl %cl, %eax
; FALLBACK28-NEXT:    movl 84(%esp,%eax), %edi
; FALLBACK28-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK28-NEXT:    movb %dh, %cl
; FALLBACK28-NEXT:    shll %cl, %edi
; FALLBACK28-NEXT:    movb %dh, %dl
; FALLBACK28-NEXT:    notb %dl
; FALLBACK28-NEXT:    movl 80(%esp,%eax), %esi
; FALLBACK28-NEXT:    movl %eax, %ebx
; FALLBACK28-NEXT:    movl %esi, %eax
; FALLBACK28-NEXT:    shrl %eax
; FALLBACK28-NEXT:    movl %edx, %ecx
; FALLBACK28-NEXT:    shrl %cl, %eax
; FALLBACK28-NEXT:    orl %edi, %eax
; FALLBACK28-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK28-NEXT:    movb %dh, %cl
; FALLBACK28-NEXT:    shll %cl, %esi
; FALLBACK28-NEXT:    movl %ebx, %edi
; FALLBACK28-NEXT:    movl 76(%esp,%ebx), %ebp
; FALLBACK28-NEXT:    movl %ebp, %eax
; FALLBACK28-NEXT:    shrl %eax
; FALLBACK28-NEXT:    movl %edx, %ecx
; FALLBACK28-NEXT:    shrl %cl, %eax
; FALLBACK28-NEXT:    orl %esi, %eax
; FALLBACK28-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK28-NEXT:    movb %dh, %cl
; FALLBACK28-NEXT:    shll %cl, %ebp
; FALLBACK28-NEXT:    movl 72(%esp,%ebx), %ebx
; FALLBACK28-NEXT:    movl %ebx, %eax
; FALLBACK28-NEXT:    shrl %eax
; FALLBACK28-NEXT:    movl %edx, %ecx
; FALLBACK28-NEXT:    shrl %cl, %eax
; FALLBACK28-NEXT:    orl %ebp, %eax
; FALLBACK28-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK28-NEXT:    movb %dh, %cl
; FALLBACK28-NEXT:    shll %cl, %ebx
; FALLBACK28-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK28-NEXT:    movl 68(%esp,%edi), %ebp
; FALLBACK28-NEXT:    movl %ebp, %esi
; FALLBACK28-NEXT:    shrl %esi
; FALLBACK28-NEXT:    movl %edx, %ecx
; FALLBACK28-NEXT:    shrl %cl, %esi
; FALLBACK28-NEXT:    orl %ebx, %esi
; FALLBACK28-NEXT:    movb %dh, %cl
; FALLBACK28-NEXT:    shll %cl, %ebp
; FALLBACK28-NEXT:    movl 64(%esp,%edi), %ebx
; FALLBACK28-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK28-NEXT:    shrl %ebx
; FALLBACK28-NEXT:    movl %edx, %ecx
; FALLBACK28-NEXT:    shrl %cl, %ebx
; FALLBACK28-NEXT:    orl %ebp, %ebx
; FALLBACK28-NEXT:    movl 88(%esp,%edi), %ebp
; FALLBACK28-NEXT:    movl %ebp, %edi
; FALLBACK28-NEXT:    movb %dh, %cl
; FALLBACK28-NEXT:    shll %cl, %edi
; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK28-NEXT:    shrl %eax
; FALLBACK28-NEXT:    movl %edx, %ecx
; FALLBACK28-NEXT:    shrl %cl, %eax
; FALLBACK28-NEXT:    orl %edi, %eax
; FALLBACK28-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK28-NEXT:    movl 92(%esp,%eax), %edi
; FALLBACK28-NEXT:    movb %dh, %cl
; FALLBACK28-NEXT:    shll %cl, %edi
; FALLBACK28-NEXT:    shrl %ebp
; FALLBACK28-NEXT:    movl %edx, %ecx
; FALLBACK28-NEXT:    shrl %cl, %ebp
; FALLBACK28-NEXT:    orl %edi, %ebp
; FALLBACK28-NEXT:    movb %dh, %cl
; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; FALLBACK28-NEXT:    shll %cl, %edx
; FALLBACK28-NEXT:    movl {{[0-9]+}}(%esp), %eax
; FALLBACK28-NEXT:    movl %edx, (%eax)
; FALLBACK28-NEXT:    movl %ebp, 28(%eax)
; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK28-NEXT:    movl %ecx, 24(%eax)
; FALLBACK28-NEXT:    movl %ebx, 4(%eax)
; FALLBACK28-NEXT:    movl %esi, 8(%eax)
; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK28-NEXT:    movl %ecx, 12(%eax)
; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK28-NEXT:    movl %ecx, 16(%eax)
; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK28-NEXT:    movl %ecx, 20(%eax)
; FALLBACK28-NEXT:    addl $108, %esp
; FALLBACK28-NEXT:    popl %esi
; FALLBACK28-NEXT:    popl %edi
; FALLBACK28-NEXT:    popl %ebx
; FALLBACK28-NEXT:    popl %ebp
; FALLBACK28-NEXT:    vzeroupper
; FALLBACK28-NEXT:    retl
;
; FALLBACK29-LABEL: shl_32bytes:
; FALLBACK29:       # %bb.0:
; FALLBACK29-NEXT:    pushl %ebp
; FALLBACK29-NEXT:    pushl %ebx
; FALLBACK29-NEXT:    pushl %edi
; FALLBACK29-NEXT:    pushl %esi
; FALLBACK29-NEXT:    subl $92, %esp
; FALLBACK29-NEXT:    movl {{[0-9]+}}(%esp), %eax
; FALLBACK29-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; FALLBACK29-NEXT:    vmovups (%ecx), %ymm0
; FALLBACK29-NEXT:    movzbl (%eax), %eax
; FALLBACK29-NEXT:    movl %eax, %ecx
; FALLBACK29-NEXT:    shlb $3, %cl
; FALLBACK29-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; FALLBACK29-NEXT:    vmovups %ymm1, {{[0-9]+}}(%esp)
; FALLBACK29-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
; FALLBACK29-NEXT:    andb $28, %al
; FALLBACK29-NEXT:    negb %al
; FALLBACK29-NEXT:    movsbl %al, %ebp
; FALLBACK29-NEXT:    movl 64(%esp,%ebp), %eax
; FALLBACK29-NEXT:    movl 68(%esp,%ebp), %edx
; FALLBACK29-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK29-NEXT:    shldl %cl, %eax, %edx
; FALLBACK29-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK29-NEXT:    movl 60(%esp,%ebp), %edx
; FALLBACK29-NEXT:    shldl %cl, %edx, %eax
; FALLBACK29-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK29-NEXT:    movl 56(%esp,%ebp), %edi
; FALLBACK29-NEXT:    shldl %cl, %edi, %edx
; FALLBACK29-NEXT:    movl %edx, (%esp) # 4-byte Spill
; FALLBACK29-NEXT:    movl 52(%esp,%ebp), %ebx
; FALLBACK29-NEXT:    shldl %cl, %ebx, %edi
; FALLBACK29-NEXT:    movl 72(%esp,%ebp), %edx
; FALLBACK29-NEXT:    movl %edx, %eax
; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; FALLBACK29-NEXT:    shldl %cl, %esi, %eax
; FALLBACK29-NEXT:    movl 48(%esp,%ebp), %esi
; FALLBACK29-NEXT:    movl 76(%esp,%ebp), %ebp
; FALLBACK29-NEXT:    shldl %cl, %edx, %ebp
; FALLBACK29-NEXT:    movl {{[0-9]+}}(%esp), %edx
; FALLBACK29-NEXT:    movl %ebp, 28(%edx)
; FALLBACK29-NEXT:    movl %eax, 24(%edx)
; FALLBACK29-NEXT:    movl %esi, %eax
; FALLBACK29-NEXT:    shll %cl, %eax
; FALLBACK29-NEXT:    shldl %cl, %esi, %ebx
; FALLBACK29-NEXT:    movl %ebx, 4(%edx)
; FALLBACK29-NEXT:    movl %edi, 8(%edx)
; FALLBACK29-NEXT:    movl (%esp), %ecx # 4-byte Reload
; FALLBACK29-NEXT:    movl %ecx, 12(%edx)
; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK29-NEXT:    movl %ecx, 16(%edx)
; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK29-NEXT:    movl %ecx, 20(%edx)
; FALLBACK29-NEXT:    movl %eax, (%edx)
; FALLBACK29-NEXT:    addl $92, %esp
; FALLBACK29-NEXT:    popl %esi
; FALLBACK29-NEXT:    popl %edi
; FALLBACK29-NEXT:    popl %ebx
; FALLBACK29-NEXT:    popl %ebp
; FALLBACK29-NEXT:    vzeroupper
; FALLBACK29-NEXT:    retl
;
; FALLBACK30-LABEL: shl_32bytes:
; FALLBACK30:       # %bb.0:
; FALLBACK30-NEXT:    pushl %ebp
; FALLBACK30-NEXT:    pushl %ebx
; FALLBACK30-NEXT:    pushl %edi
; FALLBACK30-NEXT:    pushl %esi
; FALLBACK30-NEXT:    subl $108, %esp
; FALLBACK30-NEXT:    movl {{[0-9]+}}(%esp), %eax
; FALLBACK30-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; FALLBACK30-NEXT:    vmovups (%ecx), %ymm0
; FALLBACK30-NEXT:    movzbl (%eax), %ecx
; FALLBACK30-NEXT:    movl %ecx, %eax
; FALLBACK30-NEXT:    shlb $3, %al
; FALLBACK30-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; FALLBACK30-NEXT:    vmovups %ymm1, {{[0-9]+}}(%esp)
; FALLBACK30-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
; FALLBACK30-NEXT:    andb $28, %cl
; FALLBACK30-NEXT:    negb %cl
; FALLBACK30-NEXT:    movsbl %cl, %edx
; FALLBACK30-NEXT:    movl 84(%esp,%edx), %ecx
; FALLBACK30-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK30-NEXT:    shlxl %eax, %ecx, %ecx
; FALLBACK30-NEXT:    movl 80(%esp,%edx), %esi
; FALLBACK30-NEXT:    shlxl %eax, %esi, %edi
; FALLBACK30-NEXT:    movl %eax, %ebx
; FALLBACK30-NEXT:    notb %bl
; FALLBACK30-NEXT:    shrl %esi
; FALLBACK30-NEXT:    shrxl %ebx, %esi, %esi
; FALLBACK30-NEXT:    orl %ecx, %esi
; FALLBACK30-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK30-NEXT:    movl 76(%esp,%edx), %ecx
; FALLBACK30-NEXT:    movl %ecx, %esi
; FALLBACK30-NEXT:    shrl %esi
; FALLBACK30-NEXT:    shrxl %ebx, %esi, %esi
; FALLBACK30-NEXT:    orl %edi, %esi
; FALLBACK30-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK30-NEXT:    shlxl %eax, %ecx, %ecx
; FALLBACK30-NEXT:    movl 72(%esp,%edx), %esi
; FALLBACK30-NEXT:    movl %esi, %edi
; FALLBACK30-NEXT:    shrl %edi
; FALLBACK30-NEXT:    shrxl %ebx, %edi, %edi
; FALLBACK30-NEXT:    orl %ecx, %edi
; FALLBACK30-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK30-NEXT:    shlxl %eax, %esi, %ecx
; FALLBACK30-NEXT:    movl 68(%esp,%edx), %esi
; FALLBACK30-NEXT:    movl %esi, %edi
; FALLBACK30-NEXT:    shrl %edi
; FALLBACK30-NEXT:    shrxl %ebx, %edi, %ebp
; FALLBACK30-NEXT:    orl %ecx, %ebp
; FALLBACK30-NEXT:    shlxl %eax, %esi, %edi
; FALLBACK30-NEXT:    movl 64(%esp,%edx), %esi
; FALLBACK30-NEXT:    movl %esi, %ecx
; FALLBACK30-NEXT:    shrl %ecx
; FALLBACK30-NEXT:    shrxl %ebx, %ecx, %ecx
; FALLBACK30-NEXT:    orl %edi, %ecx
; FALLBACK30-NEXT:    shlxl %eax, %esi, %esi
; FALLBACK30-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK30-NEXT:    shlxl %eax, 92(%esp,%edx), %edi
; FALLBACK30-NEXT:    movl 88(%esp,%edx), %edx
; FALLBACK30-NEXT:    shlxl %eax, %edx, %esi
; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK30-NEXT:    shrl %eax
; FALLBACK30-NEXT:    shrxl %ebx, %eax, %eax
; FALLBACK30-NEXT:    orl %esi, %eax
; FALLBACK30-NEXT:    shrl %edx
; FALLBACK30-NEXT:    shrxl %ebx, %edx, %edx
; FALLBACK30-NEXT:    orl %edi, %edx
; FALLBACK30-NEXT:    movl {{[0-9]+}}(%esp), %esi
; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
; FALLBACK30-NEXT:    movl %edi, (%esi)
; FALLBACK30-NEXT:    movl %edx, 28(%esi)
; FALLBACK30-NEXT:    movl %eax, 24(%esi)
; FALLBACK30-NEXT:    movl %ecx, 4(%esi)
; FALLBACK30-NEXT:    movl %ebp, 8(%esi)
; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK30-NEXT:    movl %eax, 12(%esi)
; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK30-NEXT:    movl %eax, 16(%esi)
; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK30-NEXT:    movl %eax, 20(%esi)
; FALLBACK30-NEXT:    addl $108, %esp
; FALLBACK30-NEXT:    popl %esi
; FALLBACK30-NEXT:    popl %edi
; FALLBACK30-NEXT:    popl %ebx
; FALLBACK30-NEXT:    popl %ebp
; FALLBACK30-NEXT:    vzeroupper
; FALLBACK30-NEXT:    retl
;
; FALLBACK31-LABEL: shl_32bytes:
; FALLBACK31:       # %bb.0:
; FALLBACK31-NEXT:    pushl %ebp
; FALLBACK31-NEXT:    pushl %ebx
; FALLBACK31-NEXT:    pushl %edi
; FALLBACK31-NEXT:    pushl %esi
; FALLBACK31-NEXT:    subl $92, %esp
; FALLBACK31-NEXT:    movl {{[0-9]+}}(%esp), %eax
; FALLBACK31-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; FALLBACK31-NEXT:    vmovups (%ecx), %ymm0
; FALLBACK31-NEXT:    movzbl (%eax), %eax
; FALLBACK31-NEXT:    movl %eax, %ecx
; FALLBACK31-NEXT:    shlb $3, %cl
; FALLBACK31-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; FALLBACK31-NEXT:    vmovups %ymm1, {{[0-9]+}}(%esp)
; FALLBACK31-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
; FALLBACK31-NEXT:    andb $28, %al
; FALLBACK31-NEXT:    negb %al
; FALLBACK31-NEXT:    movsbl %al, %ebx
; FALLBACK31-NEXT:    movl 64(%esp,%ebx), %eax
; FALLBACK31-NEXT:    movl 68(%esp,%ebx), %edx
; FALLBACK31-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK31-NEXT:    shldl %cl, %eax, %edx
; FALLBACK31-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK31-NEXT:    movl 60(%esp,%ebx), %edx
; FALLBACK31-NEXT:    shldl %cl, %edx, %eax
; FALLBACK31-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK31-NEXT:    movl 56(%esp,%ebx), %edi
; FALLBACK31-NEXT:    shldl %cl, %edi, %edx
; FALLBACK31-NEXT:    movl %edx, (%esp) # 4-byte Spill
; FALLBACK31-NEXT:    movl 52(%esp,%ebx), %ebp
; FALLBACK31-NEXT:    shldl %cl, %ebp, %edi
; FALLBACK31-NEXT:    movl 72(%esp,%ebx), %edx
; FALLBACK31-NEXT:    movl %edx, %eax
; FALLBACK31-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; FALLBACK31-NEXT:    shldl %cl, %esi, %eax
; FALLBACK31-NEXT:    movl 48(%esp,%ebx), %esi
; FALLBACK31-NEXT:    movl 76(%esp,%ebx), %ebx
; FALLBACK31-NEXT:    shldl %cl, %edx, %ebx
; FALLBACK31-NEXT:    movl {{[0-9]+}}(%esp), %edx
; FALLBACK31-NEXT:    movl %ebx, 28(%edx)
; FALLBACK31-NEXT:    movl %eax, 24(%edx)
; FALLBACK31-NEXT:    shlxl %ecx, %esi, %eax
; FALLBACK31-NEXT:    # kill: def $cl killed $cl killed $ecx
; FALLBACK31-NEXT:    shldl %cl, %esi, %ebp
; FALLBACK31-NEXT:    movl %ebp, 4(%edx)
; FALLBACK31-NEXT:    movl %edi, 8(%edx)
; FALLBACK31-NEXT:    movl (%esp), %ecx # 4-byte Reload
; FALLBACK31-NEXT:    movl %ecx, 12(%edx)
; FALLBACK31-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK31-NEXT:    movl %ecx, 16(%edx)
; FALLBACK31-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK31-NEXT:    movl %ecx, 20(%edx)
; FALLBACK31-NEXT:    movl %eax, (%edx)
; FALLBACK31-NEXT:    addl $92, %esp
; FALLBACK31-NEXT:    popl %esi
; FALLBACK31-NEXT:    popl %edi
; FALLBACK31-NEXT:    popl %ebx
; FALLBACK31-NEXT:    popl %ebp
; FALLBACK31-NEXT:    vzeroupper
; FALLBACK31-NEXT:    retl
  %src = load i256, ptr %src.ptr, align 1
  %byteOff = load i256, ptr %byteOff.ptr, align 1
  %bitOff = shl i256 %byteOff, 3
  %res = shl i256 %src, %bitOff
  store i256 %res, ptr %dst, align 1
  ret void
}

define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nounwind {
; FALLBACK0-LABEL: shl_32bytes_dwordOff:
; FALLBACK0:       # %bb.0:
; FALLBACK0-NEXT:    pushq %rbx
; FALLBACK0-NEXT:    movq (%rdi), %rcx
; FALLBACK0-NEXT:    movq 8(%rdi), %r8
; FALLBACK0-NEXT:    movq 16(%rdi), %r9
; FALLBACK0-NEXT:    movq 24(%rdi), %rdi
; FALLBACK0-NEXT:    movzbl (%rsi), %esi
; FALLBACK0-NEXT:    movl %esi, %eax
; FALLBACK0-NEXT:    shlb $5, %al
; FALLBACK0-NEXT:    xorps %xmm0, %xmm0
; FALLBACK0-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; FALLBACK0-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; FALLBACK0-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK0-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
; FALLBACK0-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
; FALLBACK0-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
; FALLBACK0-NEXT:    shlb $2, %sil
; FALLBACK0-NEXT:    andb $24, %sil
; FALLBACK0-NEXT:    negb %sil
; FALLBACK0-NEXT:    movsbq %sil, %r10
; FALLBACK0-NEXT:    movq -32(%rsp,%r10), %r8
; FALLBACK0-NEXT:    movq -24(%rsp,%r10), %rdi
; FALLBACK0-NEXT:    movq %rdi, %r11
; FALLBACK0-NEXT:    movl %eax, %ecx
; FALLBACK0-NEXT:    shlq %cl, %r11
; FALLBACK0-NEXT:    movl %eax, %esi
; FALLBACK0-NEXT:    notb %sil
; FALLBACK0-NEXT:    movq %r8, %r9
; FALLBACK0-NEXT:    shrq %r9
; FALLBACK0-NEXT:    movl %esi, %ecx
; FALLBACK0-NEXT:    shrq %cl, %r9
; FALLBACK0-NEXT:    orq %r11, %r9
; FALLBACK0-NEXT:    movq -8(%rsp,%r10), %r11
; FALLBACK0-NEXT:    movl %eax, %ecx
; FALLBACK0-NEXT:    shlq %cl, %r11
; FALLBACK0-NEXT:    movq -16(%rsp,%r10), %r10
; FALLBACK0-NEXT:    movq %r10, %rbx
; FALLBACK0-NEXT:    shrq %rbx
; FALLBACK0-NEXT:    movl %esi, %ecx
; FALLBACK0-NEXT:    shrq %cl, %rbx
; FALLBACK0-NEXT:    orq %r11, %rbx
; FALLBACK0-NEXT:    movl %eax, %ecx
; FALLBACK0-NEXT:    shlq %cl, %r10
; FALLBACK0-NEXT:    shrq %rdi
; FALLBACK0-NEXT:    movl %esi, %ecx
; FALLBACK0-NEXT:    shrq %cl, %rdi
; FALLBACK0-NEXT:    orq %r10, %rdi
; FALLBACK0-NEXT:    movl %eax, %ecx
; FALLBACK0-NEXT:    shlq %cl, %r8
; FALLBACK0-NEXT:    movq %r8, (%rdx)
; FALLBACK0-NEXT:    movq %rdi, 16(%rdx)
; FALLBACK0-NEXT:    movq %rbx, 24(%rdx)
; FALLBACK0-NEXT:    movq %r9, 8(%rdx)
; FALLBACK0-NEXT:    popq %rbx
; FALLBACK0-NEXT:    retq
;
; FALLBACK1-LABEL: shl_32bytes_dwordOff:
; FALLBACK1:       # %bb.0:
; FALLBACK1-NEXT:    movq (%rdi), %rax
; FALLBACK1-NEXT:    movq 8(%rdi), %r8
; FALLBACK1-NEXT:    movq 16(%rdi), %r9
; FALLBACK1-NEXT:    movq 24(%rdi), %rdi
; FALLBACK1-NEXT:    movzbl (%rsi), %esi
; FALLBACK1-NEXT:    movl %esi, %ecx
; FALLBACK1-NEXT:    shlb $5, %cl
; FALLBACK1-NEXT:    xorps %xmm0, %xmm0
; FALLBACK1-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; FALLBACK1-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; FALLBACK1-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK1-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
; FALLBACK1-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
; FALLBACK1-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
; FALLBACK1-NEXT:    shlb $2, %sil
; FALLBACK1-NEXT:    andb $24, %sil
; FALLBACK1-NEXT:    negb %sil
; FALLBACK1-NEXT:    movsbq %sil, %rax
; FALLBACK1-NEXT:    movq -24(%rsp,%rax), %rsi
; FALLBACK1-NEXT:    movq -16(%rsp,%rax), %rdi
; FALLBACK1-NEXT:    shldq %cl, %rsi, %rdi
; FALLBACK1-NEXT:    movq -40(%rsp,%rax), %r8
; FALLBACK1-NEXT:    movq -32(%rsp,%rax), %rax
; FALLBACK1-NEXT:    shldq %cl, %rax, %rsi
; FALLBACK1-NEXT:    shldq %cl, %r8, %rax
; FALLBACK1-NEXT:    shlq %cl, %r8
; FALLBACK1-NEXT:    movq %rsi, 16(%rdx)
; FALLBACK1-NEXT:    movq %rdi, 24(%rdx)
; FALLBACK1-NEXT:    movq %r8, (%rdx)
; FALLBACK1-NEXT:    movq %rax, 8(%rdx)
; FALLBACK1-NEXT:    retq
;
; FALLBACK2-LABEL: shl_32bytes_dwordOff:
; FALLBACK2:       # %bb.0:
; FALLBACK2-NEXT:    movq (%rdi), %rcx
; FALLBACK2-NEXT:    movq 8(%rdi), %r8
; FALLBACK2-NEXT:    movq 16(%rdi), %r9
; FALLBACK2-NEXT:    movq 24(%rdi), %rdi
; FALLBACK2-NEXT:    movzbl (%rsi), %esi
; FALLBACK2-NEXT:    movl %esi, %eax
; FALLBACK2-NEXT:    shlb $5, %al
; FALLBACK2-NEXT:    xorps %xmm0, %xmm0
; FALLBACK2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; FALLBACK2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; FALLBACK2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK2-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
; FALLBACK2-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
; FALLBACK2-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
; FALLBACK2-NEXT:    shlb $2, %sil
; FALLBACK2-NEXT:    andb $24, %sil
; FALLBACK2-NEXT:    negb %sil
; FALLBACK2-NEXT:    movsbq %sil, %rsi
; FALLBACK2-NEXT:    movq -40(%rsp,%rsi), %rdi
; FALLBACK2-NEXT:    movq -32(%rsp,%rsi), %rcx
; FALLBACK2-NEXT:    shlxq %rax, %rcx, %r8
; FALLBACK2-NEXT:    shlxq %rax, -16(%rsp,%rsi), %r9
; FALLBACK2-NEXT:    movq -24(%rsp,%rsi), %rsi
; FALLBACK2-NEXT:    shlxq %rax, %rsi, %r10
; FALLBACK2-NEXT:    shlxq %rax, %rdi, %r11
; FALLBACK2-NEXT:    # kill: def $al killed $al killed $rax def $rax
; FALLBACK2-NEXT:    notb %al
; FALLBACK2-NEXT:    shrq %rdi
; FALLBACK2-NEXT:    shrxq %rax, %rdi, %rdi
; FALLBACK2-NEXT:    orq %r8, %rdi
; FALLBACK2-NEXT:    shrq %rsi
; FALLBACK2-NEXT:    shrxq %rax, %rsi, %rsi
; FALLBACK2-NEXT:    orq %r9, %rsi
; FALLBACK2-NEXT:    shrq %rcx
; FALLBACK2-NEXT:    shrxq %rax, %rcx, %rax
; FALLBACK2-NEXT:    orq %r10, %rax
; FALLBACK2-NEXT:    movq %r11, (%rdx)
; FALLBACK2-NEXT:    movq %rax, 16(%rdx)
; FALLBACK2-NEXT:    movq %rsi, 24(%rdx)
; FALLBACK2-NEXT:    movq %rdi, 8(%rdx)
; FALLBACK2-NEXT:    retq
;
; FALLBACK3-LABEL: shl_32bytes_dwordOff:
; FALLBACK3:       # %bb.0:
; FALLBACK3-NEXT:    movq (%rdi), %rax
; FALLBACK3-NEXT:    movq 8(%rdi), %r8
; FALLBACK3-NEXT:    movq 16(%rdi), %r9
; FALLBACK3-NEXT:    movq 24(%rdi), %rdi
; FALLBACK3-NEXT:    movzbl (%rsi), %esi
; FALLBACK3-NEXT:    movl %esi, %ecx
; FALLBACK3-NEXT:    shlb $5, %cl
; FALLBACK3-NEXT:    xorps %xmm0, %xmm0
; FALLBACK3-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; FALLBACK3-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; FALLBACK3-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK3-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
; FALLBACK3-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
; FALLBACK3-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
; FALLBACK3-NEXT:    shlb $2, %sil
; FALLBACK3-NEXT:    andb $24, %sil
; FALLBACK3-NEXT:    negb %sil
; FALLBACK3-NEXT:    movsbq %sil, %rax
; FALLBACK3-NEXT:    movq -24(%rsp,%rax), %rsi
; FALLBACK3-NEXT:    movq -16(%rsp,%rax), %rdi
; FALLBACK3-NEXT:    shldq %cl, %rsi, %rdi
; FALLBACK3-NEXT:    movq -40(%rsp,%rax), %r8
; FALLBACK3-NEXT:    movq -32(%rsp,%rax), %rax
; FALLBACK3-NEXT:    shldq %cl, %rax, %rsi
; FALLBACK3-NEXT:    shldq %cl, %r8, %rax
; FALLBACK3-NEXT:    shlxq %rcx, %r8, %rcx
; FALLBACK3-NEXT:    movq %rsi, 16(%rdx)
; FALLBACK3-NEXT:    movq %rdi, 24(%rdx)
; FALLBACK3-NEXT:    movq %rcx, (%rdx)
; FALLBACK3-NEXT:    movq %rax, 8(%rdx)
; FALLBACK3-NEXT:    retq
;
; FALLBACK4-LABEL: shl_32bytes_dwordOff:
; FALLBACK4:       # %bb.0:
; FALLBACK4-NEXT:    movups (%rdi), %xmm0
; FALLBACK4-NEXT:    movups 16(%rdi), %xmm1
; FALLBACK4-NEXT:    movzbl (%rsi), %ecx
; FALLBACK4-NEXT:    movl %ecx, %eax
; FALLBACK4-NEXT:    shlb $5, %al
; FALLBACK4-NEXT:    xorps %xmm2, %xmm2
; FALLBACK4-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
; FALLBACK4-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
; FALLBACK4-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
; FALLBACK4-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; FALLBACK4-NEXT:    shlb $2, %cl
; FALLBACK4-NEXT:    andb $24, %cl
; FALLBACK4-NEXT:    negb %cl
; FALLBACK4-NEXT:    movsbq %cl, %r8
; FALLBACK4-NEXT:    movq -16(%rsp,%r8), %r9
; FALLBACK4-NEXT:    movl %eax, %ecx
; FALLBACK4-NEXT:    shlq %cl, %r9
; FALLBACK4-NEXT:    movl %eax, %esi
; FALLBACK4-NEXT:    notb %sil
; FALLBACK4-NEXT:    movq -24(%rsp,%r8), %r10
; FALLBACK4-NEXT:    movq %r10, %rdi
; FALLBACK4-NEXT:    shrq %rdi
; FALLBACK4-NEXT:    movl %esi, %ecx
; FALLBACK4-NEXT:    shrq %cl, %rdi
; FALLBACK4-NEXT:    orq %r9, %rdi
; FALLBACK4-NEXT:    movl %eax, %ecx
; FALLBACK4-NEXT:    shlq %cl, %r10
; FALLBACK4-NEXT:    movq -40(%rsp,%r8), %r9
; FALLBACK4-NEXT:    movq -32(%rsp,%r8), %r8
; FALLBACK4-NEXT:    movq %r8, %r11
; FALLBACK4-NEXT:    shrq %r11
; FALLBACK4-NEXT:    movl %esi, %ecx
; FALLBACK4-NEXT:    shrq %cl, %r11
; FALLBACK4-NEXT:    orq %r10, %r11
; FALLBACK4-NEXT:    movl %eax, %ecx
; FALLBACK4-NEXT:    shlq %cl, %r8
; FALLBACK4-NEXT:    movq %r9, %r10
; FALLBACK4-NEXT:    shrq %r10
; FALLBACK4-NEXT:    movl %esi, %ecx
; FALLBACK4-NEXT:    shrq %cl, %r10
; FALLBACK4-NEXT:    orq %r8, %r10
; FALLBACK4-NEXT:    movl %eax, %ecx
; FALLBACK4-NEXT:    shlq %cl, %r9
; FALLBACK4-NEXT:    movq %r9, (%rdx)
; FALLBACK4-NEXT:    movq %r10, 8(%rdx)
; FALLBACK4-NEXT:    movq %r11, 16(%rdx)
; FALLBACK4-NEXT:    movq %rdi, 24(%rdx)
; FALLBACK4-NEXT:    retq
;
; FALLBACK5-LABEL: shl_32bytes_dwordOff:
; FALLBACK5:       # %bb.0:
; FALLBACK5-NEXT:    movups (%rdi), %xmm0
; FALLBACK5-NEXT:    movups 16(%rdi), %xmm1
; FALLBACK5-NEXT:    movzbl (%rsi), %eax
; FALLBACK5-NEXT:    movl %eax, %ecx
; FALLBACK5-NEXT:    shlb $5, %cl
; FALLBACK5-NEXT:    xorps %xmm2, %xmm2
; FALLBACK5-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
; FALLBACK5-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
; FALLBACK5-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
; FALLBACK5-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; FALLBACK5-NEXT:    shlb $2, %al
; FALLBACK5-NEXT:    andb $24, %al
; FALLBACK5-NEXT:    negb %al
; FALLBACK5-NEXT:    movsbq %al, %rax
; FALLBACK5-NEXT:    movq -24(%rsp,%rax), %rsi
; FALLBACK5-NEXT:    movq -16(%rsp,%rax), %rdi
; FALLBACK5-NEXT:    shldq %cl, %rsi, %rdi
; FALLBACK5-NEXT:    movq -40(%rsp,%rax), %r8
; FALLBACK5-NEXT:    movq -32(%rsp,%rax), %rax
; FALLBACK5-NEXT:    shldq %cl, %rax, %rsi
; FALLBACK5-NEXT:    movq %r8, %r9
; FALLBACK5-NEXT:    shlq %cl, %r9
; FALLBACK5-NEXT:    shldq %cl, %r8, %rax
; FALLBACK5-NEXT:    movq %rax, 8(%rdx)
; FALLBACK5-NEXT:    movq %rsi, 16(%rdx)
; FALLBACK5-NEXT:    movq %rdi, 24(%rdx)
; FALLBACK5-NEXT:    movq %r9, (%rdx)
; FALLBACK5-NEXT:    retq
;
; FALLBACK6-LABEL: shl_32bytes_dwordOff:
; FALLBACK6:       # %bb.0:
; FALLBACK6-NEXT:    movups (%rdi), %xmm0
; FALLBACK6-NEXT:    movups 16(%rdi), %xmm1
; FALLBACK6-NEXT:    movzbl (%rsi), %ecx
; FALLBACK6-NEXT:    movl %ecx, %eax
; FALLBACK6-NEXT:    shlb $5, %al
; FALLBACK6-NEXT:    xorps %xmm2, %xmm2
; FALLBACK6-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
; FALLBACK6-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
; FALLBACK6-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
; FALLBACK6-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; FALLBACK6-NEXT:    shlb $2, %cl
; FALLBACK6-NEXT:    andb $24, %cl
; FALLBACK6-NEXT:    negb %cl
; FALLBACK6-NEXT:    movsbq %cl, %rcx
; FALLBACK6-NEXT:    shlxq %rax, -16(%rsp,%rcx), %rsi
; FALLBACK6-NEXT:    movq -24(%rsp,%rcx), %rdi
; FALLBACK6-NEXT:    shlxq %rax, %rdi, %r8
; FALLBACK6-NEXT:    movq -40(%rsp,%rcx), %r9
; FALLBACK6-NEXT:    movq -32(%rsp,%rcx), %rcx
; FALLBACK6-NEXT:    shlxq %rax, %rcx, %r10
; FALLBACK6-NEXT:    shlxq %rax, %r9, %r11
; FALLBACK6-NEXT:    # kill: def $al killed $al killed $rax def $rax
; FALLBACK6-NEXT:    notb %al
; FALLBACK6-NEXT:    shrq %rdi
; FALLBACK6-NEXT:    shrxq %rax, %rdi, %rdi
; FALLBACK6-NEXT:    orq %rsi, %rdi
; FALLBACK6-NEXT:    shrq %rcx
; FALLBACK6-NEXT:    shrxq %rax, %rcx, %rcx
; FALLBACK6-NEXT:    orq %r8, %rcx
; FALLBACK6-NEXT:    shrq %r9
; FALLBACK6-NEXT:    shrxq %rax, %r9, %rax
; FALLBACK6-NEXT:    orq %r10, %rax
; FALLBACK6-NEXT:    movq %r11, (%rdx)
; FALLBACK6-NEXT:    movq %rax, 8(%rdx)
; FALLBACK6-NEXT:    movq %rcx, 16(%rdx)
; FALLBACK6-NEXT:    movq %rdi, 24(%rdx)
; FALLBACK6-NEXT:    retq
;
; FALLBACK7-LABEL: shl_32bytes_dwordOff:
; FALLBACK7:       # %bb.0:
; FALLBACK7-NEXT:    movups (%rdi), %xmm0
; FALLBACK7-NEXT:    movups 16(%rdi), %xmm1
; FALLBACK7-NEXT:    movzbl (%rsi), %eax
; FALLBACK7-NEXT:    movl %eax, %ecx
; FALLBACK7-NEXT:    shlb $5, %cl
; FALLBACK7-NEXT:    xorps %xmm2, %xmm2
; FALLBACK7-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
; FALLBACK7-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
; FALLBACK7-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
; FALLBACK7-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; FALLBACK7-NEXT:    shlb $2, %al
; FALLBACK7-NEXT:    andb $24, %al
; FALLBACK7-NEXT:    negb %al
; FALLBACK7-NEXT:    movsbq %al, %rax
; FALLBACK7-NEXT:    movq -24(%rsp,%rax), %rsi
; FALLBACK7-NEXT:    movq -16(%rsp,%rax), %rdi
; FALLBACK7-NEXT:    shldq %cl, %rsi, %rdi
; FALLBACK7-NEXT:    movq -40(%rsp,%rax), %r8
; FALLBACK7-NEXT:    movq -32(%rsp,%rax), %rax
; FALLBACK7-NEXT:    shldq %cl, %rax, %rsi
; FALLBACK7-NEXT:    shlxq %rcx, %r8, %r9
; FALLBACK7-NEXT:    # kill: def $cl killed $cl killed $rcx
; FALLBACK7-NEXT:    shldq %cl, %r8, %rax
; FALLBACK7-NEXT:    movq %rax, 8(%rdx)
; FALLBACK7-NEXT:    movq %rsi, 16(%rdx)
; FALLBACK7-NEXT:    movq %rdi, 24(%rdx)
; FALLBACK7-NEXT:    movq %r9, (%rdx)
; FALLBACK7-NEXT:    retq
;
; FALLBACK8-LABEL: shl_32bytes_dwordOff:
; FALLBACK8:       # %bb.0:
; FALLBACK8-NEXT:    vmovups (%rdi), %ymm0
; FALLBACK8-NEXT:    movzbl (%rsi), %ecx
; FALLBACK8-NEXT:    movl %ecx, %eax
; FALLBACK8-NEXT:    shlb $5, %al
; FALLBACK8-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; FALLBACK8-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
; FALLBACK8-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
; FALLBACK8-NEXT:    shlb $2, %cl
; FALLBACK8-NEXT:    andb $24, %cl
; FALLBACK8-NEXT:    negb %cl
; FALLBACK8-NEXT:    movsbq %cl, %r8
; FALLBACK8-NEXT:    movq -16(%rsp,%r8), %r9
; FALLBACK8-NEXT:    movl %eax, %ecx
; FALLBACK8-NEXT:    shlq %cl, %r9
; FALLBACK8-NEXT:    movl %eax, %esi
; FALLBACK8-NEXT:    notb %sil
; FALLBACK8-NEXT:    movq -24(%rsp,%r8), %r10
; FALLBACK8-NEXT:    movq %r10, %rdi
; FALLBACK8-NEXT:    shrq %rdi
; FALLBACK8-NEXT:    movl %esi, %ecx
; FALLBACK8-NEXT:    shrq %cl, %rdi
; FALLBACK8-NEXT:    orq %r9, %rdi
; FALLBACK8-NEXT:    movl %eax, %ecx
; FALLBACK8-NEXT:    shlq %cl, %r10
; FALLBACK8-NEXT:    movq -40(%rsp,%r8), %r9
; FALLBACK8-NEXT:    movq -32(%rsp,%r8), %r8
; FALLBACK8-NEXT:    movq %r8, %r11
; FALLBACK8-NEXT:    shrq %r11
; FALLBACK8-NEXT:    movl %esi, %ecx
; FALLBACK8-NEXT:    shrq %cl, %r11
; FALLBACK8-NEXT:    orq %r10, %r11
; FALLBACK8-NEXT:    movl %eax, %ecx
; FALLBACK8-NEXT:    shlq %cl, %r8
; FALLBACK8-NEXT:    movq %r9, %r10
; FALLBACK8-NEXT:    shrq %r10
; FALLBACK8-NEXT:    movl %esi, %ecx
; FALLBACK8-NEXT:    shrq %cl, %r10
; FALLBACK8-NEXT:    orq %r8, %r10
; FALLBACK8-NEXT:    movl %eax, %ecx
; FALLBACK8-NEXT:    shlq %cl, %r9
; FALLBACK8-NEXT:    movq %r9, (%rdx)
; FALLBACK8-NEXT:    movq %r10, 8(%rdx)
; FALLBACK8-NEXT:    movq %r11, 16(%rdx)
; FALLBACK8-NEXT:    movq %rdi, 24(%rdx)
; FALLBACK8-NEXT:    vzeroupper
; FALLBACK8-NEXT:    retq
;
; FALLBACK9-LABEL: shl_32bytes_dwordOff:
; FALLBACK9:       # %bb.0:
; FALLBACK9-NEXT:    vmovups (%rdi), %ymm0
; FALLBACK9-NEXT:    movzbl (%rsi), %eax
; FALLBACK9-NEXT:    movl %eax, %ecx
; FALLBACK9-NEXT:    shlb $5, %cl
; FALLBACK9-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; FALLBACK9-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
; FALLBACK9-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
; FALLBACK9-NEXT:    shlb $2, %al
; FALLBACK9-NEXT:    andb $24, %al
; FALLBACK9-NEXT:    negb %al
; FALLBACK9-NEXT:    movsbq %al, %rax
; FALLBACK9-NEXT:    movq -24(%rsp,%rax), %rsi
; FALLBACK9-NEXT:    movq -16(%rsp,%rax), %rdi
; FALLBACK9-NEXT:    shldq %cl, %rsi, %rdi
; FALLBACK9-NEXT:    movq -40(%rsp,%rax), %r8
; FALLBACK9-NEXT:    movq -32(%rsp,%rax), %rax
; FALLBACK9-NEXT:    shldq %cl, %rax, %rsi
; FALLBACK9-NEXT:    movq %r8, %r9
; FALLBACK9-NEXT:    shlq %cl, %r9
; FALLBACK9-NEXT:    shldq %cl, %r8, %rax
; FALLBACK9-NEXT:    movq %rax, 8(%rdx)
; FALLBACK9-NEXT:    movq %rsi, 16(%rdx)
; FALLBACK9-NEXT:    movq %rdi, 24(%rdx)
; FALLBACK9-NEXT:    movq %r9, (%rdx)
; FALLBACK9-NEXT:    vzeroupper
; FALLBACK9-NEXT:    retq
;
; FALLBACK10-LABEL: shl_32bytes_dwordOff:
; FALLBACK10:       # %bb.0:
; FALLBACK10-NEXT:    vmovups (%rdi), %ymm0
; FALLBACK10-NEXT:    movzbl (%rsi), %ecx
; FALLBACK10-NEXT:    movl %ecx, %eax
; FALLBACK10-NEXT:    shlb $5, %al
; FALLBACK10-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; FALLBACK10-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
; FALLBACK10-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
; FALLBACK10-NEXT:    shlb $2, %cl
; FALLBACK10-NEXT:    andb $24, %cl
; FALLBACK10-NEXT:    negb %cl
; FALLBACK10-NEXT:    movsbq %cl, %rcx
; FALLBACK10-NEXT:    shlxq %rax, -16(%rsp,%rcx), %rsi
; FALLBACK10-NEXT:    movq -24(%rsp,%rcx), %rdi
; FALLBACK10-NEXT:    shlxq %rax, %rdi, %r8
; FALLBACK10-NEXT:    movq -40(%rsp,%rcx), %r9
; FALLBACK10-NEXT:    movq -32(%rsp,%rcx), %rcx
; FALLBACK10-NEXT:    shlxq %rax, %rcx, %r10
; FALLBACK10-NEXT:    shlxq %rax, %r9, %r11
; FALLBACK10-NEXT:    # kill: def $al killed $al killed $rax def $rax
; FALLBACK10-NEXT:    notb %al
; FALLBACK10-NEXT:    shrq %rdi
; FALLBACK10-NEXT:    shrxq %rax, %rdi, %rdi
; FALLBACK10-NEXT:    orq %rsi, %rdi
; FALLBACK10-NEXT:    shrq %rcx
; FALLBACK10-NEXT:    shrxq %rax, %rcx, %rcx
; FALLBACK10-NEXT:    orq %r8, %rcx
; FALLBACK10-NEXT:    shrq %r9
; FALLBACK10-NEXT:    shrxq %rax, %r9, %rax
; FALLBACK10-NEXT:    orq %r10, %rax
; FALLBACK10-NEXT:    movq %r11, (%rdx)
; FALLBACK10-NEXT:    movq %rax, 8(%rdx)
; FALLBACK10-NEXT:    movq %rcx, 16(%rdx)
; FALLBACK10-NEXT:    movq %rdi, 24(%rdx)
; FALLBACK10-NEXT:    vzeroupper
; FALLBACK10-NEXT:    retq
;
; FALLBACK11-LABEL: shl_32bytes_dwordOff:
; FALLBACK11:       # %bb.0:
; FALLBACK11-NEXT:    vmovups (%rdi), %ymm0
; FALLBACK11-NEXT:    movzbl (%rsi), %eax
; FALLBACK11-NEXT:    movl %eax, %ecx
; FALLBACK11-NEXT:    shlb $5, %cl
; FALLBACK11-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; FALLBACK11-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
; FALLBACK11-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
; FALLBACK11-NEXT:    shlb $2, %al
; FALLBACK11-NEXT:    andb $24, %al
; FALLBACK11-NEXT:    negb %al
; FALLBACK11-NEXT:    movsbq %al, %rax
; FALLBACK11-NEXT:    movq -24(%rsp,%rax), %rsi
; FALLBACK11-NEXT:    movq -16(%rsp,%rax), %rdi
; FALLBACK11-NEXT:    shldq %cl, %rsi, %rdi
; FALLBACK11-NEXT:    movq -40(%rsp,%rax), %r8
; FALLBACK11-NEXT:    movq -32(%rsp,%rax), %rax
; FALLBACK11-NEXT:    shldq %cl, %rax, %rsi
; FALLBACK11-NEXT:    shlxq %rcx, %r8, %r9
; FALLBACK11-NEXT:    # kill: def $cl killed $cl killed $rcx
; FALLBACK11-NEXT:    shldq %cl, %r8, %rax
; FALLBACK11-NEXT:    movq %rax, 8(%rdx)
; FALLBACK11-NEXT:    movq %rsi, 16(%rdx)
; FALLBACK11-NEXT:    movq %rdi, 24(%rdx)
; FALLBACK11-NEXT:    movq %r9, (%rdx)
; FALLBACK11-NEXT:    vzeroupper
; FALLBACK11-NEXT:    retq
;
; FALLBACK12-LABEL: shl_32bytes_dwordOff:
; FALLBACK12:       # %bb.0:
; FALLBACK12-NEXT:    vmovups (%rdi), %ymm0
; FALLBACK12-NEXT:    movzbl (%rsi), %ecx
; FALLBACK12-NEXT:    movl %ecx, %eax
; FALLBACK12-NEXT:    shlb $5, %al
; FALLBACK12-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; FALLBACK12-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
; FALLBACK12-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
; FALLBACK12-NEXT:    shlb $2, %cl
; FALLBACK12-NEXT:    andb $24, %cl
; FALLBACK12-NEXT:    negb %cl
; FALLBACK12-NEXT:    movsbq %cl, %r8
; FALLBACK12-NEXT:    movq -16(%rsp,%r8), %r9
; FALLBACK12-NEXT:    movl %eax, %ecx
; FALLBACK12-NEXT:    shlq %cl, %r9
; FALLBACK12-NEXT:    movl %eax, %esi
; FALLBACK12-NEXT:    notb %sil
; FALLBACK12-NEXT:    movq -24(%rsp,%r8), %r10
; FALLBACK12-NEXT:    movq %r10, %rdi
; FALLBACK12-NEXT:    shrq %rdi
; FALLBACK12-NEXT:    movl %esi, %ecx
; FALLBACK12-NEXT:    shrq %cl, %rdi
; FALLBACK12-NEXT:    orq %r9, %rdi
; FALLBACK12-NEXT:    movl %eax, %ecx
; FALLBACK12-NEXT:    shlq %cl, %r10
; FALLBACK12-NEXT:    movq -40(%rsp,%r8), %r9
; FALLBACK12-NEXT:    movq -32(%rsp,%r8), %r8
; FALLBACK12-NEXT:    movq %r8, %r11
; FALLBACK12-NEXT:    shrq %r11
; FALLBACK12-NEXT:    movl %esi, %ecx
; FALLBACK12-NEXT:    shrq %cl, %r11
; FALLBACK12-NEXT:    orq %r10, %r11
; FALLBACK12-NEXT:    movl %eax, %ecx
; FALLBACK12-NEXT:    shlq %cl, %r8
; FALLBACK12-NEXT:    movq %r9, %r10
; FALLBACK12-NEXT:    shrq %r10
; FALLBACK12-NEXT:    movl %esi, %ecx
; FALLBACK12-NEXT:    shrq %cl, %r10
; FALLBACK12-NEXT:    orq %r8, %r10
; FALLBACK12-NEXT:    movl %eax, %ecx
; FALLBACK12-NEXT:    shlq %cl, %r9
; FALLBACK12-NEXT:    movq %r9, (%rdx)
; FALLBACK12-NEXT:    movq %r10, 8(%rdx)
; FALLBACK12-NEXT:    movq %r11, 16(%rdx)
; FALLBACK12-NEXT:    movq %rdi, 24(%rdx)
; FALLBACK12-NEXT:    vzeroupper
; FALLBACK12-NEXT:    retq
;
; FALLBACK13-LABEL: shl_32bytes_dwordOff:
; FALLBACK13:       # %bb.0:
; FALLBACK13-NEXT:    vmovups (%rdi), %ymm0
; FALLBACK13-NEXT:    movzbl (%rsi), %eax
; FALLBACK13-NEXT:    movl %eax, %ecx
; FALLBACK13-NEXT:    shlb $5, %cl
; FALLBACK13-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; FALLBACK13-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
; FALLBACK13-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
; FALLBACK13-NEXT:    shlb $2, %al
; FALLBACK13-NEXT:    andb $24, %al
; FALLBACK13-NEXT:    negb %al
; FALLBACK13-NEXT:    movsbq %al, %rax
; FALLBACK13-NEXT:    movq -24(%rsp,%rax), %rsi
; FALLBACK13-NEXT:    movq -16(%rsp,%rax), %rdi
; FALLBACK13-NEXT:    shldq %cl, %rsi, %rdi
; FALLBACK13-NEXT:    movq -40(%rsp,%rax), %r8
; FALLBACK13-NEXT:    movq -32(%rsp,%rax), %rax
; FALLBACK13-NEXT:    shldq %cl, %rax, %rsi
; FALLBACK13-NEXT:    movq %r8, %r9
; FALLBACK13-NEXT:    shlq %cl, %r9
; FALLBACK13-NEXT:    shldq %cl, %r8, %rax
; FALLBACK13-NEXT:    movq %rax, 8(%rdx)
; FALLBACK13-NEXT:    movq %rsi, 16(%rdx)
; FALLBACK13-NEXT:    movq %rdi, 24(%rdx)
; FALLBACK13-NEXT:    movq %r9, (%rdx)
; FALLBACK13-NEXT:    vzeroupper
; FALLBACK13-NEXT:    retq
;
; FALLBACK14-LABEL: shl_32bytes_dwordOff:
; FALLBACK14:       # %bb.0:
; FALLBACK14-NEXT:    vmovups (%rdi), %ymm0
; FALLBACK14-NEXT:    movzbl (%rsi), %ecx
; FALLBACK14-NEXT:    movl %ecx, %eax
; FALLBACK14-NEXT:    shlb $5, %al
; FALLBACK14-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; FALLBACK14-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
; FALLBACK14-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
; FALLBACK14-NEXT:    shlb $2, %cl
; FALLBACK14-NEXT:    andb $24, %cl
; FALLBACK14-NEXT:    negb %cl
; FALLBACK14-NEXT:    movsbq %cl, %rcx
; FALLBACK14-NEXT:    shlxq %rax, -16(%rsp,%rcx), %rsi
; FALLBACK14-NEXT:    movq -24(%rsp,%rcx), %rdi
; FALLBACK14-NEXT:    shlxq %rax, %rdi, %r8
; FALLBACK14-NEXT:    movq -40(%rsp,%rcx), %r9
; FALLBACK14-NEXT:    movq -32(%rsp,%rcx), %rcx
; FALLBACK14-NEXT:    shlxq %rax, %rcx, %r10
; FALLBACK14-NEXT:    shlxq %rax, %r9, %r11
; FALLBACK14-NEXT:    # kill: def $al killed $al killed $rax def $rax
; FALLBACK14-NEXT:    notb %al
; FALLBACK14-NEXT:    shrq %rdi
; FALLBACK14-NEXT:    shrxq %rax, %rdi, %rdi
; FALLBACK14-NEXT:    orq %rsi, %rdi
; FALLBACK14-NEXT:    shrq %rcx
; FALLBACK14-NEXT:    shrxq %rax, %rcx, %rcx
; FALLBACK14-NEXT:    orq %r8, %rcx
; FALLBACK14-NEXT:    shrq %r9
; FALLBACK14-NEXT:    shrxq %rax, %r9, %rax
; FALLBACK14-NEXT:    orq %r10, %rax
; FALLBACK14-NEXT:    movq %r11, (%rdx)
; FALLBACK14-NEXT:    movq %rax, 8(%rdx)
; FALLBACK14-NEXT:    movq %rcx, 16(%rdx)
; FALLBACK14-NEXT:    movq %rdi, 24(%rdx)
; FALLBACK14-NEXT:    vzeroupper
; FALLBACK14-NEXT:    retq
;
; FALLBACK15-LABEL: shl_32bytes_dwordOff:
; FALLBACK15:       # %bb.0:
; FALLBACK15-NEXT:    vmovups (%rdi), %ymm0
; FALLBACK15-NEXT:    movzbl (%rsi), %eax
; FALLBACK15-NEXT:    movl %eax, %ecx
; FALLBACK15-NEXT:    shlb $5, %cl
; FALLBACK15-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; FALLBACK15-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
; FALLBACK15-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
; FALLBACK15-NEXT:    shlb $2, %al
; FALLBACK15-NEXT:    andb $24, %al
; FALLBACK15-NEXT:    negb %al
; FALLBACK15-NEXT:    movsbq %al, %rax
; FALLBACK15-NEXT:    movq -24(%rsp,%rax), %rsi
; FALLBACK15-NEXT:    movq -16(%rsp,%rax), %rdi
; FALLBACK15-NEXT:    shldq %cl, %rsi, %rdi
; FALLBACK15-NEXT:    movq -40(%rsp,%rax), %r8
; FALLBACK15-NEXT:    movq -32(%rsp,%rax), %rax
; FALLBACK15-NEXT:    shldq %cl, %rax, %rsi
; FALLBACK15-NEXT:    shlxq %rcx, %r8, %r9
; FALLBACK15-NEXT:    # kill: def $cl killed $cl killed $rcx
; FALLBACK15-NEXT:    shldq %cl, %r8, %rax
; FALLBACK15-NEXT:    movq %rax, 8(%rdx)
; FALLBACK15-NEXT:    movq %rsi, 16(%rdx)
; FALLBACK15-NEXT:    movq %rdi, 24(%rdx)
; FALLBACK15-NEXT:    movq %r9, (%rdx)
; FALLBACK15-NEXT:    vzeroupper
; FALLBACK15-NEXT:    retq
;
; X86-SSE2-LABEL: shl_32bytes_dwordOff:
; X86-SSE2:       # %bb.0:
; X86-SSE2-NEXT:    pushl %ebp
; X86-SSE2-NEXT:    pushl %ebx
; X86-SSE2-NEXT:    pushl %edi
; X86-SSE2-NEXT:    pushl %esi
; X86-SSE2-NEXT:    subl $92, %esp
; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ebp
; X86-SSE2-NEXT:    movl (%ebp), %eax
; X86-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-SSE2-NEXT:    movl 4(%ebp), %eax
; X86-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-SSE2-NEXT:    movl 8(%ebp), %esi
; X86-SSE2-NEXT:    movl 12(%ebp), %edi
; X86-SSE2-NEXT:    movl 16(%ebp), %ebx
; X86-SSE2-NEXT:    movzbl (%ecx), %ecx
; X86-SSE2-NEXT:    movl 20(%ebp), %edx
; X86-SSE2-NEXT:    movl 24(%ebp), %eax
; X86-SSE2-NEXT:    movl 28(%ebp), %ebp
; X86-SSE2-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT:    xorps %xmm0, %xmm0
; X86-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT:    movl %edi, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT:    movl %esi, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT:    shlb $2, %cl
; X86-SSE2-NEXT:    andb $28, %cl
; X86-SSE2-NEXT:    negb %cl
; X86-SSE2-NEXT:    movsbl %cl, %edx
; X86-SSE2-NEXT:    movl 48(%esp,%edx), %eax
; X86-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-SSE2-NEXT:    movl 52(%esp,%edx), %eax
; X86-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-SSE2-NEXT:    movl 60(%esp,%edx), %esi
; X86-SSE2-NEXT:    movl 56(%esp,%edx), %edi
; X86-SSE2-NEXT:    movl 68(%esp,%edx), %ebx
; X86-SSE2-NEXT:    movl 64(%esp,%edx), %ebp
; X86-SSE2-NEXT:    movl 76(%esp,%edx), %ecx
; X86-SSE2-NEXT:    movl 72(%esp,%edx), %edx
; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE2-NEXT:    movl %edx, 24(%eax)
; X86-SSE2-NEXT:    movl %ecx, 28(%eax)
; X86-SSE2-NEXT:    movl %ebp, 16(%eax)
; X86-SSE2-NEXT:    movl %ebx, 20(%eax)
; X86-SSE2-NEXT:    movl %edi, 8(%eax)
; X86-SSE2-NEXT:    movl %esi, 12(%eax)
; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-SSE2-NEXT:    movl %ecx, (%eax)
; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-SSE2-NEXT:    movl %ecx, 4(%eax)
; X86-SSE2-NEXT:    addl $92, %esp
; X86-SSE2-NEXT:    popl %esi
; X86-SSE2-NEXT:    popl %edi
; X86-SSE2-NEXT:    popl %ebx
; X86-SSE2-NEXT:    popl %ebp
; X86-SSE2-NEXT:    retl
;
; X86-SSE42-LABEL: shl_32bytes_dwordOff:
; X86-SSE42:       # %bb.0:
; X86-SSE42-NEXT:    subl $76, %esp
; X86-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-SSE42-NEXT:    movups (%edx), %xmm0
; X86-SSE42-NEXT:    movups 16(%edx), %xmm1
; X86-SSE42-NEXT:    movzbl (%ecx), %ecx
; X86-SSE42-NEXT:    xorps %xmm2, %xmm2
; X86-SSE42-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
; X86-SSE42-NEXT:    movaps %xmm2, (%esp)
; X86-SSE42-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
; X86-SSE42-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
; X86-SSE42-NEXT:    shlb $2, %cl
; X86-SSE42-NEXT:    andb $28, %cl
; X86-SSE42-NEXT:    negb %cl
; X86-SSE42-NEXT:    movsbl %cl, %ecx
; X86-SSE42-NEXT:    movups 32(%esp,%ecx), %xmm0
; X86-SSE42-NEXT:    movups 48(%esp,%ecx), %xmm1
; X86-SSE42-NEXT:    movups %xmm1, 16(%eax)
; X86-SSE42-NEXT:    movups %xmm0, (%eax)
; X86-SSE42-NEXT:    addl $76, %esp
; X86-SSE42-NEXT:    retl
;
; X86-AVX-LABEL: shl_32bytes_dwordOff:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    subl $76, %esp
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-AVX-NEXT:    vmovups (%edx), %ymm0
; X86-AVX-NEXT:    movzbl (%ecx), %ecx
; X86-AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X86-AVX-NEXT:    vmovups %ymm1, (%esp)
; X86-AVX-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
; X86-AVX-NEXT:    shlb $2, %cl
; X86-AVX-NEXT:    andb $28, %cl
; X86-AVX-NEXT:    negb %cl
; X86-AVX-NEXT:    movsbl %cl, %ecx
; X86-AVX-NEXT:    vmovups 32(%esp,%ecx), %xmm0
; X86-AVX-NEXT:    vmovups 48(%esp,%ecx), %xmm1
; X86-AVX-NEXT:    vmovups %xmm1, 16(%eax)
; X86-AVX-NEXT:    vmovups %xmm0, (%eax)
; X86-AVX-NEXT:    addl $76, %esp
; X86-AVX-NEXT:    vzeroupper
; X86-AVX-NEXT:    retl
  %src = load i256, ptr %src.ptr, align 1
  %dwordOff = load i256, ptr %dwordOff.ptr, align 1
  %bitOff = shl i256 %dwordOff, 5
  %res = shl i256 %src, %bitOff
  store i256 %res, ptr %dst, align 1
  ret void
}

; Left-shift a 256-bit integer by a whole number of 64-bit words.
; The value read through %qwordOff.ptr counts qwords; the IR scales it by 64
; (shl by 6) to obtain the bit amount, shifts the value read through %src.ptr
; by that amount, and writes the result through %dst.
;
; The assertions below are autogenerated by utils/update_llc_test_checks.py;
; regenerate them with that script rather than editing them by hand.
;
; In every expected sequence below the offset byte is scaled to a byte count
; (shlb $3), masked to 0, 8, 16 or 24 (andb $24), negated, sign-extended and
; then used as an index into a stack buffer that holds the source value plus
; 32 zero bytes. No variable bit-shift instruction appears in the expectations.
define void @shl_32bytes_qwordOff(ptr %src.ptr, ptr %qwordOff.ptr, ptr %dst) nounwind {
; X64-SSE2-LABEL: shl_32bytes_qwordOff:
; X64-SSE2:       # %bb.0:
; X64-SSE2-NEXT:    movq (%rdi), %rax
; X64-SSE2-NEXT:    movq 8(%rdi), %rcx
; X64-SSE2-NEXT:    movq 16(%rdi), %r8
; X64-SSE2-NEXT:    movq 24(%rdi), %rdi
; X64-SSE2-NEXT:    movzbl (%rsi), %esi
; X64-SSE2-NEXT:    xorps %xmm0, %xmm0
; X64-SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; X64-SSE2-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
; X64-SSE2-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
; X64-SSE2-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
; X64-SSE2-NEXT:    shlb $3, %sil
; X64-SSE2-NEXT:    andb $24, %sil
; X64-SSE2-NEXT:    negb %sil
; X64-SSE2-NEXT:    movsbq %sil, %rax
; X64-SSE2-NEXT:    movq -40(%rsp,%rax), %rcx
; X64-SSE2-NEXT:    movq -32(%rsp,%rax), %rsi
; X64-SSE2-NEXT:    movq -16(%rsp,%rax), %rdi
; X64-SSE2-NEXT:    movq -24(%rsp,%rax), %rax
; X64-SSE2-NEXT:    movq %rax, 16(%rdx)
; X64-SSE2-NEXT:    movq %rdi, 24(%rdx)
; X64-SSE2-NEXT:    movq %rcx, (%rdx)
; X64-SSE2-NEXT:    movq %rsi, 8(%rdx)
; X64-SSE2-NEXT:    retq
;
; X64-SSE42-LABEL: shl_32bytes_qwordOff:
; X64-SSE42:       # %bb.0:
; X64-SSE42-NEXT:    movups (%rdi), %xmm0
; X64-SSE42-NEXT:    movups 16(%rdi), %xmm1
; X64-SSE42-NEXT:    movzbl (%rsi), %eax
; X64-SSE42-NEXT:    xorps %xmm2, %xmm2
; X64-SSE42-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
; X64-SSE42-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
; X64-SSE42-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
; X64-SSE42-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-SSE42-NEXT:    shlb $3, %al
; X64-SSE42-NEXT:    andb $24, %al
; X64-SSE42-NEXT:    negb %al
; X64-SSE42-NEXT:    movsbq %al, %rax
; X64-SSE42-NEXT:    movups -40(%rsp,%rax), %xmm0
; X64-SSE42-NEXT:    movups -24(%rsp,%rax), %xmm1
; X64-SSE42-NEXT:    movups %xmm1, 16(%rdx)
; X64-SSE42-NEXT:    movups %xmm0, (%rdx)
; X64-SSE42-NEXT:    retq
;
; X64-AVX-LABEL: shl_32bytes_qwordOff:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vmovups (%rdi), %ymm0
; X64-AVX-NEXT:    movzbl (%rsi), %eax
; X64-AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X64-AVX-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
; X64-AVX-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
; X64-AVX-NEXT:    shlb $3, %al
; X64-AVX-NEXT:    andb $24, %al
; X64-AVX-NEXT:    negb %al
; X64-AVX-NEXT:    movsbq %al, %rax
; X64-AVX-NEXT:    vmovups -40(%rsp,%rax), %xmm0
; X64-AVX-NEXT:    vmovups -24(%rsp,%rax), %xmm1
; X64-AVX-NEXT:    vmovups %xmm1, 16(%rdx)
; X64-AVX-NEXT:    vmovups %xmm0, (%rdx)
; X64-AVX-NEXT:    vzeroupper
; X64-AVX-NEXT:    retq
;
; X86-SSE2-LABEL: shl_32bytes_qwordOff:
; X86-SSE2:       # %bb.0:
; X86-SSE2-NEXT:    pushl %ebp
; X86-SSE2-NEXT:    pushl %ebx
; X86-SSE2-NEXT:    pushl %edi
; X86-SSE2-NEXT:    pushl %esi
; X86-SSE2-NEXT:    subl $92, %esp
; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ebp
; X86-SSE2-NEXT:    movl (%ebp), %eax
; X86-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-SSE2-NEXT:    movl 4(%ebp), %eax
; X86-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-SSE2-NEXT:    movl 8(%ebp), %esi
; X86-SSE2-NEXT:    movl 12(%ebp), %edi
; X86-SSE2-NEXT:    movl 16(%ebp), %ebx
; X86-SSE2-NEXT:    movzbl (%ecx), %ecx
; X86-SSE2-NEXT:    movl 20(%ebp), %edx
; X86-SSE2-NEXT:    movl 24(%ebp), %eax
; X86-SSE2-NEXT:    movl 28(%ebp), %ebp
; X86-SSE2-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT:    xorps %xmm0, %xmm0
; X86-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT:    movl %edi, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT:    movl %esi, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT:    shlb $3, %cl
; X86-SSE2-NEXT:    andb $24, %cl
; X86-SSE2-NEXT:    negb %cl
; X86-SSE2-NEXT:    movsbl %cl, %edx
; X86-SSE2-NEXT:    movl 48(%esp,%edx), %eax
; X86-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-SSE2-NEXT:    movl 52(%esp,%edx), %eax
; X86-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-SSE2-NEXT:    movl 60(%esp,%edx), %esi
; X86-SSE2-NEXT:    movl 56(%esp,%edx), %edi
; X86-SSE2-NEXT:    movl 68(%esp,%edx), %ebx
; X86-SSE2-NEXT:    movl 64(%esp,%edx), %ebp
; X86-SSE2-NEXT:    movl 76(%esp,%edx), %ecx
; X86-SSE2-NEXT:    movl 72(%esp,%edx), %edx
; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE2-NEXT:    movl %edx, 24(%eax)
; X86-SSE2-NEXT:    movl %ecx, 28(%eax)
; X86-SSE2-NEXT:    movl %ebp, 16(%eax)
; X86-SSE2-NEXT:    movl %ebx, 20(%eax)
; X86-SSE2-NEXT:    movl %edi, 8(%eax)
; X86-SSE2-NEXT:    movl %esi, 12(%eax)
; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-SSE2-NEXT:    movl %ecx, (%eax)
; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-SSE2-NEXT:    movl %ecx, 4(%eax)
; X86-SSE2-NEXT:    addl $92, %esp
; X86-SSE2-NEXT:    popl %esi
; X86-SSE2-NEXT:    popl %edi
; X86-SSE2-NEXT:    popl %ebx
; X86-SSE2-NEXT:    popl %ebp
; X86-SSE2-NEXT:    retl
;
; X86-SSE42-LABEL: shl_32bytes_qwordOff:
; X86-SSE42:       # %bb.0:
; X86-SSE42-NEXT:    subl $76, %esp
; X86-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-SSE42-NEXT:    movups (%edx), %xmm0
; X86-SSE42-NEXT:    movups 16(%edx), %xmm1
; X86-SSE42-NEXT:    movzbl (%ecx), %ecx
; X86-SSE42-NEXT:    xorps %xmm2, %xmm2
; X86-SSE42-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
; X86-SSE42-NEXT:    movaps %xmm2, (%esp)
; X86-SSE42-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
; X86-SSE42-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
; X86-SSE42-NEXT:    shlb $3, %cl
; X86-SSE42-NEXT:    andb $24, %cl
; X86-SSE42-NEXT:    negb %cl
; X86-SSE42-NEXT:    movsbl %cl, %ecx
; X86-SSE42-NEXT:    movups 32(%esp,%ecx), %xmm0
; X86-SSE42-NEXT:    movups 48(%esp,%ecx), %xmm1
; X86-SSE42-NEXT:    movups %xmm1, 16(%eax)
; X86-SSE42-NEXT:    movups %xmm0, (%eax)
; X86-SSE42-NEXT:    addl $76, %esp
; X86-SSE42-NEXT:    retl
;
; X86-AVX-LABEL: shl_32bytes_qwordOff:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    subl $76, %esp
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-AVX-NEXT:    vmovups (%edx), %ymm0
; X86-AVX-NEXT:    movzbl (%ecx), %ecx
; X86-AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X86-AVX-NEXT:    vmovups %ymm1, (%esp)
; X86-AVX-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
; X86-AVX-NEXT:    shlb $3, %cl
; X86-AVX-NEXT:    andb $24, %cl
; X86-AVX-NEXT:    negb %cl
; X86-AVX-NEXT:    movsbl %cl, %ecx
; X86-AVX-NEXT:    vmovups 32(%esp,%ecx), %xmm0
; X86-AVX-NEXT:    vmovups 48(%esp,%ecx), %xmm1
; X86-AVX-NEXT:    vmovups %xmm1, 16(%eax)
; X86-AVX-NEXT:    vmovups %xmm0, (%eax)
; X86-AVX-NEXT:    addl $76, %esp
; X86-AVX-NEXT:    vzeroupper
; X86-AVX-NEXT:    retl
  ; Both operands are read as full i256 values with byte alignment.
  %src = load i256, ptr %src.ptr, align 1
  %qwordOff = load i256, ptr %qwordOff.ptr, align 1
  ; Convert the qword count into a bit count (multiply by 64).
  %bitOff = shl i256 %qwordOff, 6
  ; The shift amount is therefore always a multiple of 64 bits.
  %res = shl i256 %src, %bitOff
  ; The result is written back with byte alignment as well.
  store i256 %res, ptr %dst, align 1
  ret void
}

define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; FALLBACK0-LABEL: ashr_32bytes:
; FALLBACK0:       # %bb.0:
; FALLBACK0-NEXT:    pushq %rbx
; FALLBACK0-NEXT:    movq (%rdi), %rcx
; FALLBACK0-NEXT:    movq 8(%rdi), %r8
; FALLBACK0-NEXT:    movq 16(%rdi), %r9
; FALLBACK0-NEXT:    movq 24(%rdi), %rdi
; FALLBACK0-NEXT:    movzbl (%rsi), %esi
; FALLBACK0-NEXT:    leal (,%rsi,8), %eax
; FALLBACK0-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK0-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
; FALLBACK0-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
; FALLBACK0-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
; FALLBACK0-NEXT:    sarq $63, %rdi
; FALLBACK0-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK0-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK0-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK0-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK0-NEXT:    andb $24, %sil
; FALLBACK0-NEXT:    movzbl %sil, %r9d
; FALLBACK0-NEXT:    movq -64(%rsp,%r9), %r10
; FALLBACK0-NEXT:    movq -56(%rsp,%r9), %rdi
; FALLBACK0-NEXT:    movq %rdi, %r11
; FALLBACK0-NEXT:    movl %eax, %ecx
; FALLBACK0-NEXT:    shrq %cl, %r11
; FALLBACK0-NEXT:    movl %eax, %esi
; FALLBACK0-NEXT:    notb %sil
; FALLBACK0-NEXT:    movq -48(%rsp,%r9), %rbx
; FALLBACK0-NEXT:    leaq (%rbx,%rbx), %r8
; FALLBACK0-NEXT:    movl %esi, %ecx
; FALLBACK0-NEXT:    shlq %cl, %r8
; FALLBACK0-NEXT:    orq %r11, %r8
; FALLBACK0-NEXT:    movl %eax, %ecx
; FALLBACK0-NEXT:    shrq %cl, %r10
; FALLBACK0-NEXT:    addq %rdi, %rdi
; FALLBACK0-NEXT:    movl %esi, %ecx
; FALLBACK0-NEXT:    shlq %cl, %rdi
; FALLBACK0-NEXT:    orq %r10, %rdi
; FALLBACK0-NEXT:    movl %eax, %ecx
; FALLBACK0-NEXT:    shrq %cl, %rbx
; FALLBACK0-NEXT:    movq -40(%rsp,%r9), %r9
; FALLBACK0-NEXT:    leaq (%r9,%r9), %r10
; FALLBACK0-NEXT:    movl %esi, %ecx
; FALLBACK0-NEXT:    shlq %cl, %r10
; FALLBACK0-NEXT:    orq %rbx, %r10
; FALLBACK0-NEXT:    movl %eax, %ecx
; FALLBACK0-NEXT:    sarq %cl, %r9
; FALLBACK0-NEXT:    movq %r9, 24(%rdx)
; FALLBACK0-NEXT:    movq %r10, 16(%rdx)
; FALLBACK0-NEXT:    movq %rdi, (%rdx)
; FALLBACK0-NEXT:    movq %r8, 8(%rdx)
; FALLBACK0-NEXT:    popq %rbx
; FALLBACK0-NEXT:    retq
;
; FALLBACK1-LABEL: ashr_32bytes:
; FALLBACK1:       # %bb.0:
; FALLBACK1-NEXT:    movq (%rdi), %rax
; FALLBACK1-NEXT:    movq 8(%rdi), %r8
; FALLBACK1-NEXT:    movq 16(%rdi), %r9
; FALLBACK1-NEXT:    movq 24(%rdi), %rdi
; FALLBACK1-NEXT:    movzbl (%rsi), %esi
; FALLBACK1-NEXT:    leal (,%rsi,8), %ecx
; FALLBACK1-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK1-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
; FALLBACK1-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
; FALLBACK1-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
; FALLBACK1-NEXT:    sarq $63, %rdi
; FALLBACK1-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK1-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK1-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK1-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK1-NEXT:    andb $24, %sil
; FALLBACK1-NEXT:    movzbl %sil, %eax
; FALLBACK1-NEXT:    movq -56(%rsp,%rax), %rsi
; FALLBACK1-NEXT:    movq -72(%rsp,%rax), %rdi
; FALLBACK1-NEXT:    movq -64(%rsp,%rax), %r8
; FALLBACK1-NEXT:    movq %r8, %r9
; FALLBACK1-NEXT:    shrdq %cl, %rsi, %r9
; FALLBACK1-NEXT:    movq -48(%rsp,%rax), %rax
; FALLBACK1-NEXT:    shrdq %cl, %rax, %rsi
; FALLBACK1-NEXT:    shrdq %cl, %r8, %rdi
; FALLBACK1-NEXT:    # kill: def $cl killed $cl killed $ecx
; FALLBACK1-NEXT:    sarq %cl, %rax
; FALLBACK1-NEXT:    movq %rsi, 16(%rdx)
; FALLBACK1-NEXT:    movq %rax, 24(%rdx)
; FALLBACK1-NEXT:    movq %rdi, (%rdx)
; FALLBACK1-NEXT:    movq %r9, 8(%rdx)
; FALLBACK1-NEXT:    retq
;
; FALLBACK2-LABEL: ashr_32bytes:
; FALLBACK2:       # %bb.0:
; FALLBACK2-NEXT:    movq (%rdi), %rcx
; FALLBACK2-NEXT:    movq 8(%rdi), %r8
; FALLBACK2-NEXT:    movq 16(%rdi), %r9
; FALLBACK2-NEXT:    movq 24(%rdi), %rdi
; FALLBACK2-NEXT:    movzbl (%rsi), %esi
; FALLBACK2-NEXT:    leal (,%rsi,8), %eax
; FALLBACK2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK2-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
; FALLBACK2-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
; FALLBACK2-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
; FALLBACK2-NEXT:    sarq $63, %rdi
; FALLBACK2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK2-NEXT:    andb $24, %sil
; FALLBACK2-NEXT:    movzbl %sil, %ecx
; FALLBACK2-NEXT:    movq -64(%rsp,%rcx), %rsi
; FALLBACK2-NEXT:    movq -56(%rsp,%rcx), %rdi
; FALLBACK2-NEXT:    shrxq %rax, %rsi, %r8
; FALLBACK2-NEXT:    shrxq %rax, -72(%rsp,%rcx), %r9
; FALLBACK2-NEXT:    shrxq %rax, %rdi, %r10
; FALLBACK2-NEXT:    movq -48(%rsp,%rcx), %rcx
; FALLBACK2-NEXT:    sarxq %rax, %rcx, %r11
; FALLBACK2-NEXT:    # kill: def $al killed $al killed $rax def $rax
; FALLBACK2-NEXT:    notb %al
; FALLBACK2-NEXT:    addq %rdi, %rdi
; FALLBACK2-NEXT:    shlxq %rax, %rdi, %rdi
; FALLBACK2-NEXT:    orq %r8, %rdi
; FALLBACK2-NEXT:    addq %rsi, %rsi
; FALLBACK2-NEXT:    shlxq %rax, %rsi, %rsi
; FALLBACK2-NEXT:    orq %r9, %rsi
; FALLBACK2-NEXT:    addq %rcx, %rcx
; FALLBACK2-NEXT:    shlxq %rax, %rcx, %rax
; FALLBACK2-NEXT:    orq %r10, %rax
; FALLBACK2-NEXT:    movq %r11, 24(%rdx)
; FALLBACK2-NEXT:    movq %rax, 16(%rdx)
; FALLBACK2-NEXT:    movq %rsi, (%rdx)
; FALLBACK2-NEXT:    movq %rdi, 8(%rdx)
; FALLBACK2-NEXT:    retq
;
; FALLBACK3-LABEL: ashr_32bytes:
; FALLBACK3:       # %bb.0:
; FALLBACK3-NEXT:    movq (%rdi), %rax
; FALLBACK3-NEXT:    movq 8(%rdi), %r8
; FALLBACK3-NEXT:    movq 16(%rdi), %r9
; FALLBACK3-NEXT:    movq 24(%rdi), %rdi
; FALLBACK3-NEXT:    movzbl (%rsi), %esi
; FALLBACK3-NEXT:    leal (,%rsi,8), %ecx
; FALLBACK3-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK3-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
; FALLBACK3-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
; FALLBACK3-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
; FALLBACK3-NEXT:    sarq $63, %rdi
; FALLBACK3-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK3-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK3-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK3-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK3-NEXT:    andb $24, %sil
; FALLBACK3-NEXT:    movzbl %sil, %eax
; FALLBACK3-NEXT:    movq -56(%rsp,%rax), %rsi
; FALLBACK3-NEXT:    movq -72(%rsp,%rax), %rdi
; FALLBACK3-NEXT:    movq -64(%rsp,%rax), %r8
; FALLBACK3-NEXT:    movq %r8, %r9
; FALLBACK3-NEXT:    shrdq %cl, %rsi, %r9
; FALLBACK3-NEXT:    movq -48(%rsp,%rax), %rax
; FALLBACK3-NEXT:    shrdq %cl, %rax, %rsi
; FALLBACK3-NEXT:    shrdq %cl, %r8, %rdi
; FALLBACK3-NEXT:    sarxq %rcx, %rax, %rax
; FALLBACK3-NEXT:    movq %rsi, 16(%rdx)
; FALLBACK3-NEXT:    movq %rax, 24(%rdx)
; FALLBACK3-NEXT:    movq %rdi, (%rdx)
; FALLBACK3-NEXT:    movq %r9, 8(%rdx)
; FALLBACK3-NEXT:    retq
;
; FALLBACK4-LABEL: ashr_32bytes:
; FALLBACK4:       # %bb.0:
; FALLBACK4-NEXT:    pushq %rbx
; FALLBACK4-NEXT:    movups (%rdi), %xmm0
; FALLBACK4-NEXT:    movq 16(%rdi), %rcx
; FALLBACK4-NEXT:    movq 24(%rdi), %rdi
; FALLBACK4-NEXT:    movzbl (%rsi), %esi
; FALLBACK4-NEXT:    leal (,%rsi,8), %eax
; FALLBACK4-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK4-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
; FALLBACK4-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; FALLBACK4-NEXT:    sarq $63, %rdi
; FALLBACK4-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK4-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK4-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK4-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK4-NEXT:    andb $24, %sil
; FALLBACK4-NEXT:    movzbl %sil, %r9d
; FALLBACK4-NEXT:    movq -64(%rsp,%r9), %r10
; FALLBACK4-NEXT:    movq -56(%rsp,%r9), %r8
; FALLBACK4-NEXT:    movl %eax, %ecx
; FALLBACK4-NEXT:    shrq %cl, %r10
; FALLBACK4-NEXT:    movl %eax, %esi
; FALLBACK4-NEXT:    notb %sil
; FALLBACK4-NEXT:    leaq (%r8,%r8), %rdi
; FALLBACK4-NEXT:    movl %esi, %ecx
; FALLBACK4-NEXT:    shlq %cl, %rdi
; FALLBACK4-NEXT:    orq %r10, %rdi
; FALLBACK4-NEXT:    movq -48(%rsp,%r9), %r10
; FALLBACK4-NEXT:    movq %r10, %r11
; FALLBACK4-NEXT:    movl %eax, %ecx
; FALLBACK4-NEXT:    shrq %cl, %r11
; FALLBACK4-NEXT:    movq -40(%rsp,%r9), %r9
; FALLBACK4-NEXT:    leaq (%r9,%r9), %rbx
; FALLBACK4-NEXT:    movl %esi, %ecx
; FALLBACK4-NEXT:    shlq %cl, %rbx
; FALLBACK4-NEXT:    orq %r11, %rbx
; FALLBACK4-NEXT:    movl %eax, %ecx
; FALLBACK4-NEXT:    shrq %cl, %r8
; FALLBACK4-NEXT:    addq %r10, %r10
; FALLBACK4-NEXT:    movl %esi, %ecx
; FALLBACK4-NEXT:    shlq %cl, %r10
; FALLBACK4-NEXT:    orq %r8, %r10
; FALLBACK4-NEXT:    movl %eax, %ecx
; FALLBACK4-NEXT:    sarq %cl, %r9
; FALLBACK4-NEXT:    movq %r9, 24(%rdx)
; FALLBACK4-NEXT:    movq %r10, 8(%rdx)
; FALLBACK4-NEXT:    movq %rbx, 16(%rdx)
; FALLBACK4-NEXT:    movq %rdi, (%rdx)
; FALLBACK4-NEXT:    popq %rbx
; FALLBACK4-NEXT:    retq
;
; FALLBACK5-LABEL: ashr_32bytes:
; FALLBACK5:       # %bb.0:
; FALLBACK5-NEXT:    movups (%rdi), %xmm0
; FALLBACK5-NEXT:    movq 16(%rdi), %rax
; FALLBACK5-NEXT:    movq 24(%rdi), %rdi
; FALLBACK5-NEXT:    movzbl (%rsi), %esi
; FALLBACK5-NEXT:    leal (,%rsi,8), %ecx
; FALLBACK5-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK5-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
; FALLBACK5-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; FALLBACK5-NEXT:    sarq $63, %rdi
; FALLBACK5-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK5-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK5-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK5-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK5-NEXT:    andb $24, %sil
; FALLBACK5-NEXT:    movzbl %sil, %eax
; FALLBACK5-NEXT:    movq -48(%rsp,%rax), %rsi
; FALLBACK5-NEXT:    movq -56(%rsp,%rax), %rdi
; FALLBACK5-NEXT:    movq %rdi, %r8
; FALLBACK5-NEXT:    shrdq %cl, %rsi, %r8
; FALLBACK5-NEXT:    movq -72(%rsp,%rax), %r9
; FALLBACK5-NEXT:    movq -64(%rsp,%rax), %rax
; FALLBACK5-NEXT:    movq %rax, %r10
; FALLBACK5-NEXT:    shrdq %cl, %rdi, %r10
; FALLBACK5-NEXT:    shrdq %cl, %rax, %r9
; FALLBACK5-NEXT:    # kill: def $cl killed $cl killed $ecx
; FALLBACK5-NEXT:    sarq %cl, %rsi
; FALLBACK5-NEXT:    movq %r10, 8(%rdx)
; FALLBACK5-NEXT:    movq %r8, 16(%rdx)
; FALLBACK5-NEXT:    movq %rsi, 24(%rdx)
; FALLBACK5-NEXT:    movq %r9, (%rdx)
; FALLBACK5-NEXT:    retq
;
; FALLBACK6-LABEL: ashr_32bytes:
; FALLBACK6:       # %bb.0:
; FALLBACK6-NEXT:    movups (%rdi), %xmm0
; FALLBACK6-NEXT:    movq 16(%rdi), %rcx
; FALLBACK6-NEXT:    movq 24(%rdi), %rdi
; FALLBACK6-NEXT:    movzbl (%rsi), %esi
; FALLBACK6-NEXT:    leal (,%rsi,8), %eax
; FALLBACK6-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK6-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
; FALLBACK6-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; FALLBACK6-NEXT:    sarq $63, %rdi
; FALLBACK6-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK6-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK6-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK6-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK6-NEXT:    andb $24, %sil
; FALLBACK6-NEXT:    movzbl %sil, %ecx
; FALLBACK6-NEXT:    shrxq %rax, -72(%rsp,%rcx), %rsi
; FALLBACK6-NEXT:    movq -64(%rsp,%rcx), %rdi
; FALLBACK6-NEXT:    movq -56(%rsp,%rcx), %r8
; FALLBACK6-NEXT:    shrxq %rax, %r8, %r9
; FALLBACK6-NEXT:    movq -48(%rsp,%rcx), %rcx
; FALLBACK6-NEXT:    shrxq %rax, %rdi, %r10
; FALLBACK6-NEXT:    sarxq %rax, %rcx, %r11
; FALLBACK6-NEXT:    # kill: def $al killed $al killed $rax def $rax
; FALLBACK6-NEXT:    notb %al
; FALLBACK6-NEXT:    addq %rdi, %rdi
; FALLBACK6-NEXT:    shlxq %rax, %rdi, %rdi
; FALLBACK6-NEXT:    orq %rsi, %rdi
; FALLBACK6-NEXT:    addq %rcx, %rcx
; FALLBACK6-NEXT:    shlxq %rax, %rcx, %rcx
; FALLBACK6-NEXT:    orq %r9, %rcx
; FALLBACK6-NEXT:    addq %r8, %r8
; FALLBACK6-NEXT:    shlxq %rax, %r8, %rax
; FALLBACK6-NEXT:    orq %r10, %rax
; FALLBACK6-NEXT:    movq %r11, 24(%rdx)
; FALLBACK6-NEXT:    movq %rax, 8(%rdx)
; FALLBACK6-NEXT:    movq %rcx, 16(%rdx)
; FALLBACK6-NEXT:    movq %rdi, (%rdx)
; FALLBACK6-NEXT:    retq
;
; FALLBACK7-LABEL: ashr_32bytes:
; FALLBACK7:       # %bb.0:
; FALLBACK7-NEXT:    movups (%rdi), %xmm0
; FALLBACK7-NEXT:    movq 16(%rdi), %rax
; FALLBACK7-NEXT:    movq 24(%rdi), %rdi
; FALLBACK7-NEXT:    movzbl (%rsi), %esi
; FALLBACK7-NEXT:    leal (,%rsi,8), %ecx
; FALLBACK7-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK7-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
; FALLBACK7-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; FALLBACK7-NEXT:    sarq $63, %rdi
; FALLBACK7-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK7-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK7-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK7-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK7-NEXT:    andb $24, %sil
; FALLBACK7-NEXT:    movzbl %sil, %eax
; FALLBACK7-NEXT:    movq -48(%rsp,%rax), %rsi
; FALLBACK7-NEXT:    movq -56(%rsp,%rax), %rdi
; FALLBACK7-NEXT:    movq %rdi, %r8
; FALLBACK7-NEXT:    shrdq %cl, %rsi, %r8
; FALLBACK7-NEXT:    movq -72(%rsp,%rax), %r9
; FALLBACK7-NEXT:    movq -64(%rsp,%rax), %rax
; FALLBACK7-NEXT:    movq %rax, %r10
; FALLBACK7-NEXT:    shrdq %cl, %rdi, %r10
; FALLBACK7-NEXT:    shrdq %cl, %rax, %r9
; FALLBACK7-NEXT:    sarxq %rcx, %rsi, %rax
; FALLBACK7-NEXT:    movq %r10, 8(%rdx)
; FALLBACK7-NEXT:    movq %r8, 16(%rdx)
; FALLBACK7-NEXT:    movq %rax, 24(%rdx)
; FALLBACK7-NEXT:    movq %r9, (%rdx)
; FALLBACK7-NEXT:    retq
;
; FALLBACK8-LABEL: ashr_32bytes:
; FALLBACK8:       # %bb.0:
; FALLBACK8-NEXT:    pushq %rbx
; FALLBACK8-NEXT:    vmovups (%rdi), %xmm0
; FALLBACK8-NEXT:    movq 16(%rdi), %rcx
; FALLBACK8-NEXT:    movq 24(%rdi), %rdi
; FALLBACK8-NEXT:    movzbl (%rsi), %esi
; FALLBACK8-NEXT:    leal (,%rsi,8), %eax
; FALLBACK8-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK8-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
; FALLBACK8-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; FALLBACK8-NEXT:    sarq $63, %rdi
; FALLBACK8-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK8-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK8-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK8-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK8-NEXT:    andb $24, %sil
; FALLBACK8-NEXT:    movzbl %sil, %r9d
; FALLBACK8-NEXT:    movq -64(%rsp,%r9), %r10
; FALLBACK8-NEXT:    movq -56(%rsp,%r9), %r8
; FALLBACK8-NEXT:    movl %eax, %ecx
; FALLBACK8-NEXT:    shrq %cl, %r10
; FALLBACK8-NEXT:    movl %eax, %esi
; FALLBACK8-NEXT:    notb %sil
; FALLBACK8-NEXT:    leaq (%r8,%r8), %rdi
; FALLBACK8-NEXT:    movl %esi, %ecx
; FALLBACK8-NEXT:    shlq %cl, %rdi
; FALLBACK8-NEXT:    orq %r10, %rdi
; FALLBACK8-NEXT:    movq -48(%rsp,%r9), %r10
; FALLBACK8-NEXT:    movq %r10, %r11
; FALLBACK8-NEXT:    movl %eax, %ecx
; FALLBACK8-NEXT:    shrq %cl, %r11
; FALLBACK8-NEXT:    movq -40(%rsp,%r9), %r9
; FALLBACK8-NEXT:    leaq (%r9,%r9), %rbx
; FALLBACK8-NEXT:    movl %esi, %ecx
; FALLBACK8-NEXT:    shlq %cl, %rbx
; FALLBACK8-NEXT:    orq %r11, %rbx
; FALLBACK8-NEXT:    movl %eax, %ecx
; FALLBACK8-NEXT:    shrq %cl, %r8
; FALLBACK8-NEXT:    addq %r10, %r10
; FALLBACK8-NEXT:    movl %esi, %ecx
; FALLBACK8-NEXT:    shlq %cl, %r10
; FALLBACK8-NEXT:    orq %r8, %r10
; FALLBACK8-NEXT:    movl %eax, %ecx
; FALLBACK8-NEXT:    sarq %cl, %r9
; FALLBACK8-NEXT:    movq %r9, 24(%rdx)
; FALLBACK8-NEXT:    movq %r10, 8(%rdx)
; FALLBACK8-NEXT:    movq %rbx, 16(%rdx)
; FALLBACK8-NEXT:    movq %rdi, (%rdx)
; FALLBACK8-NEXT:    popq %rbx
; FALLBACK8-NEXT:    retq
;
; FALLBACK9-LABEL: ashr_32bytes:
; FALLBACK9:       # %bb.0:
; FALLBACK9-NEXT:    vmovups (%rdi), %xmm0
; FALLBACK9-NEXT:    movq 16(%rdi), %rax
; FALLBACK9-NEXT:    movq 24(%rdi), %rdi
; FALLBACK9-NEXT:    movzbl (%rsi), %esi
; FALLBACK9-NEXT:    leal (,%rsi,8), %ecx
; FALLBACK9-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK9-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
; FALLBACK9-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; FALLBACK9-NEXT:    sarq $63, %rdi
; FALLBACK9-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK9-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK9-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK9-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK9-NEXT:    andb $24, %sil
; FALLBACK9-NEXT:    movzbl %sil, %eax
; FALLBACK9-NEXT:    movq -48(%rsp,%rax), %rsi
; FALLBACK9-NEXT:    movq -56(%rsp,%rax), %rdi
; FALLBACK9-NEXT:    movq %rdi, %r8
; FALLBACK9-NEXT:    shrdq %cl, %rsi, %r8
; FALLBACK9-NEXT:    movq -72(%rsp,%rax), %r9
; FALLBACK9-NEXT:    movq -64(%rsp,%rax), %rax
; FALLBACK9-NEXT:    movq %rax, %r10
; FALLBACK9-NEXT:    shrdq %cl, %rdi, %r10
; FALLBACK9-NEXT:    shrdq %cl, %rax, %r9
; FALLBACK9-NEXT:    # kill: def $cl killed $cl killed $ecx
; FALLBACK9-NEXT:    sarq %cl, %rsi
; FALLBACK9-NEXT:    movq %r10, 8(%rdx)
; FALLBACK9-NEXT:    movq %r8, 16(%rdx)
; FALLBACK9-NEXT:    movq %rsi, 24(%rdx)
; FALLBACK9-NEXT:    movq %r9, (%rdx)
; FALLBACK9-NEXT:    retq
;
; FALLBACK10-LABEL: ashr_32bytes:
; FALLBACK10:       # %bb.0:
; FALLBACK10-NEXT:    vmovups (%rdi), %xmm0
; FALLBACK10-NEXT:    movq 16(%rdi), %rcx
; FALLBACK10-NEXT:    movq 24(%rdi), %rdi
; FALLBACK10-NEXT:    movzbl (%rsi), %esi
; FALLBACK10-NEXT:    leal (,%rsi,8), %eax
; FALLBACK10-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK10-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
; FALLBACK10-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; FALLBACK10-NEXT:    sarq $63, %rdi
; FALLBACK10-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK10-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK10-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK10-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK10-NEXT:    andb $24, %sil
; FALLBACK10-NEXT:    movzbl %sil, %ecx
; FALLBACK10-NEXT:    shrxq %rax, -72(%rsp,%rcx), %rsi
; FALLBACK10-NEXT:    movq -64(%rsp,%rcx), %rdi
; FALLBACK10-NEXT:    movq -56(%rsp,%rcx), %r8
; FALLBACK10-NEXT:    shrxq %rax, %r8, %r9
; FALLBACK10-NEXT:    movq -48(%rsp,%rcx), %rcx
; FALLBACK10-NEXT:    shrxq %rax, %rdi, %r10
; FALLBACK10-NEXT:    sarxq %rax, %rcx, %r11
; FALLBACK10-NEXT:    # kill: def $al killed $al killed $rax def $rax
; FALLBACK10-NEXT:    notb %al
; FALLBACK10-NEXT:    addq %rdi, %rdi
; FALLBACK10-NEXT:    shlxq %rax, %rdi, %rdi
; FALLBACK10-NEXT:    orq %rsi, %rdi
; FALLBACK10-NEXT:    addq %rcx, %rcx
; FALLBACK10-NEXT:    shlxq %rax, %rcx, %rcx
; FALLBACK10-NEXT:    orq %r9, %rcx
; FALLBACK10-NEXT:    addq %r8, %r8
; FALLBACK10-NEXT:    shlxq %rax, %r8, %rax
; FALLBACK10-NEXT:    orq %r10, %rax
; FALLBACK10-NEXT:    movq %r11, 24(%rdx)
; FALLBACK10-NEXT:    movq %rax, 8(%rdx)
; FALLBACK10-NEXT:    movq %rcx, 16(%rdx)
; FALLBACK10-NEXT:    movq %rdi, (%rdx)
; FALLBACK10-NEXT:    retq
;
; FALLBACK11-LABEL: ashr_32bytes:
; FALLBACK11:       # %bb.0:
; FALLBACK11-NEXT:    vmovups (%rdi), %xmm0
; FALLBACK11-NEXT:    movq 16(%rdi), %rax
; FALLBACK11-NEXT:    movq 24(%rdi), %rdi
; FALLBACK11-NEXT:    movzbl (%rsi), %esi
; FALLBACK11-NEXT:    leal (,%rsi,8), %ecx
; FALLBACK11-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK11-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
; FALLBACK11-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; FALLBACK11-NEXT:    sarq $63, %rdi
; FALLBACK11-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK11-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK11-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK11-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK11-NEXT:    andb $24, %sil
; FALLBACK11-NEXT:    movzbl %sil, %eax
; FALLBACK11-NEXT:    movq -48(%rsp,%rax), %rsi
; FALLBACK11-NEXT:    movq -56(%rsp,%rax), %rdi
; FALLBACK11-NEXT:    movq %rdi, %r8
; FALLBACK11-NEXT:    shrdq %cl, %rsi, %r8
; FALLBACK11-NEXT:    movq -72(%rsp,%rax), %r9
; FALLBACK11-NEXT:    movq -64(%rsp,%rax), %rax
; FALLBACK11-NEXT:    movq %rax, %r10
; FALLBACK11-NEXT:    shrdq %cl, %rdi, %r10
; FALLBACK11-NEXT:    shrdq %cl, %rax, %r9
; FALLBACK11-NEXT:    sarxq %rcx, %rsi, %rax
; FALLBACK11-NEXT:    movq %r10, 8(%rdx)
; FALLBACK11-NEXT:    movq %r8, 16(%rdx)
; FALLBACK11-NEXT:    movq %rax, 24(%rdx)
; FALLBACK11-NEXT:    movq %r9, (%rdx)
; FALLBACK11-NEXT:    retq
;
; FALLBACK12-LABEL: ashr_32bytes:
; FALLBACK12:       # %bb.0:
; FALLBACK12-NEXT:    pushq %rbx
; FALLBACK12-NEXT:    vmovups (%rdi), %xmm0
; FALLBACK12-NEXT:    movq 16(%rdi), %rcx
; FALLBACK12-NEXT:    movq 24(%rdi), %rdi
; FALLBACK12-NEXT:    movzbl (%rsi), %esi
; FALLBACK12-NEXT:    leal (,%rsi,8), %eax
; FALLBACK12-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK12-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
; FALLBACK12-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; FALLBACK12-NEXT:    sarq $63, %rdi
; FALLBACK12-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK12-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK12-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK12-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK12-NEXT:    andb $24, %sil
; FALLBACK12-NEXT:    movzbl %sil, %r9d
; FALLBACK12-NEXT:    movq -64(%rsp,%r9), %r10
; FALLBACK12-NEXT:    movq -56(%rsp,%r9), %r8
; FALLBACK12-NEXT:    movl %eax, %ecx
; FALLBACK12-NEXT:    shrq %cl, %r10
; FALLBACK12-NEXT:    movl %eax, %esi
; FALLBACK12-NEXT:    notb %sil
; FALLBACK12-NEXT:    leaq (%r8,%r8), %rdi
; FALLBACK12-NEXT:    movl %esi, %ecx
; FALLBACK12-NEXT:    shlq %cl, %rdi
; FALLBACK12-NEXT:    orq %r10, %rdi
; FALLBACK12-NEXT:    movq -48(%rsp,%r9), %r10
; FALLBACK12-NEXT:    movq %r10, %r11
; FALLBACK12-NEXT:    movl %eax, %ecx
; FALLBACK12-NEXT:    shrq %cl, %r11
; FALLBACK12-NEXT:    movq -40(%rsp,%r9), %r9
; FALLBACK12-NEXT:    leaq (%r9,%r9), %rbx
; FALLBACK12-NEXT:    movl %esi, %ecx
; FALLBACK12-NEXT:    shlq %cl, %rbx
; FALLBACK12-NEXT:    orq %r11, %rbx
; FALLBACK12-NEXT:    movl %eax, %ecx
; FALLBACK12-NEXT:    shrq %cl, %r8
; FALLBACK12-NEXT:    addq %r10, %r10
; FALLBACK12-NEXT:    movl %esi, %ecx
; FALLBACK12-NEXT:    shlq %cl, %r10
; FALLBACK12-NEXT:    orq %r8, %r10
; FALLBACK12-NEXT:    movl %eax, %ecx
; FALLBACK12-NEXT:    sarq %cl, %r9
; FALLBACK12-NEXT:    movq %r9, 24(%rdx)
; FALLBACK12-NEXT:    movq %r10, 8(%rdx)
; FALLBACK12-NEXT:    movq %rbx, 16(%rdx)
; FALLBACK12-NEXT:    movq %rdi, (%rdx)
; FALLBACK12-NEXT:    popq %rbx
; FALLBACK12-NEXT:    retq
;
; FALLBACK13-LABEL: ashr_32bytes:
; FALLBACK13:       # %bb.0:
; FALLBACK13-NEXT:    vmovups (%rdi), %xmm0
; FALLBACK13-NEXT:    movq 16(%rdi), %rax
; FALLBACK13-NEXT:    movq 24(%rdi), %rdi
; FALLBACK13-NEXT:    movzbl (%rsi), %esi
; FALLBACK13-NEXT:    leal (,%rsi,8), %ecx
; FALLBACK13-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK13-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
; FALLBACK13-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; FALLBACK13-NEXT:    sarq $63, %rdi
; FALLBACK13-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK13-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK13-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK13-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK13-NEXT:    andb $24, %sil
; FALLBACK13-NEXT:    movzbl %sil, %eax
; FALLBACK13-NEXT:    movq -48(%rsp,%rax), %rsi
; FALLBACK13-NEXT:    movq -56(%rsp,%rax), %rdi
; FALLBACK13-NEXT:    movq %rdi, %r8
; FALLBACK13-NEXT:    shrdq %cl, %rsi, %r8
; FALLBACK13-NEXT:    movq -72(%rsp,%rax), %r9
; FALLBACK13-NEXT:    movq -64(%rsp,%rax), %rax
; FALLBACK13-NEXT:    movq %rax, %r10
; FALLBACK13-NEXT:    shrdq %cl, %rdi, %r10
; FALLBACK13-NEXT:    shrdq %cl, %rax, %r9
; FALLBACK13-NEXT:    # kill: def $cl killed $cl killed $ecx
; FALLBACK13-NEXT:    sarq %cl, %rsi
; FALLBACK13-NEXT:    movq %r10, 8(%rdx)
; FALLBACK13-NEXT:    movq %r8, 16(%rdx)
; FALLBACK13-NEXT:    movq %rsi, 24(%rdx)
; FALLBACK13-NEXT:    movq %r9, (%rdx)
; FALLBACK13-NEXT:    retq
;
; FALLBACK14-LABEL: ashr_32bytes:
; FALLBACK14:       # %bb.0:
; FALLBACK14-NEXT:    vmovups (%rdi), %xmm0
; FALLBACK14-NEXT:    movq 16(%rdi), %rcx
; FALLBACK14-NEXT:    movq 24(%rdi), %rdi
; FALLBACK14-NEXT:    movzbl (%rsi), %esi
; FALLBACK14-NEXT:    leal (,%rsi,8), %eax
; FALLBACK14-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK14-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
; FALLBACK14-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; FALLBACK14-NEXT:    sarq $63, %rdi
; FALLBACK14-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK14-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK14-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK14-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK14-NEXT:    andb $24, %sil
; FALLBACK14-NEXT:    movzbl %sil, %ecx
; FALLBACK14-NEXT:    shrxq %rax, -72(%rsp,%rcx), %rsi
; FALLBACK14-NEXT:    movq -64(%rsp,%rcx), %rdi
; FALLBACK14-NEXT:    movq -56(%rsp,%rcx), %r8
; FALLBACK14-NEXT:    shrxq %rax, %r8, %r9
; FALLBACK14-NEXT:    movq -48(%rsp,%rcx), %rcx
; FALLBACK14-NEXT:    shrxq %rax, %rdi, %r10
; FALLBACK14-NEXT:    sarxq %rax, %rcx, %r11
; FALLBACK14-NEXT:    # kill: def $al killed $al killed $rax def $rax
; FALLBACK14-NEXT:    notb %al
; FALLBACK14-NEXT:    addq %rdi, %rdi
; FALLBACK14-NEXT:    shlxq %rax, %rdi, %rdi
; FALLBACK14-NEXT:    orq %rsi, %rdi
; FALLBACK14-NEXT:    addq %rcx, %rcx
; FALLBACK14-NEXT:    shlxq %rax, %rcx, %rcx
; FALLBACK14-NEXT:    orq %r9, %rcx
; FALLBACK14-NEXT:    addq %r8, %r8
; FALLBACK14-NEXT:    shlxq %rax, %r8, %rax
; FALLBACK14-NEXT:    orq %r10, %rax
; FALLBACK14-NEXT:    movq %r11, 24(%rdx)
; FALLBACK14-NEXT:    movq %rax, 8(%rdx)
; FALLBACK14-NEXT:    movq %rcx, 16(%rdx)
; FALLBACK14-NEXT:    movq %rdi, (%rdx)
; FALLBACK14-NEXT:    retq
;
; FALLBACK15-LABEL: ashr_32bytes:
; FALLBACK15:       # %bb.0:
; FALLBACK15-NEXT:    vmovups (%rdi), %xmm0
; FALLBACK15-NEXT:    movq 16(%rdi), %rax
; FALLBACK15-NEXT:    movq 24(%rdi), %rdi
; FALLBACK15-NEXT:    movzbl (%rsi), %esi
; FALLBACK15-NEXT:    leal (,%rsi,8), %ecx
; FALLBACK15-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK15-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
; FALLBACK15-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; FALLBACK15-NEXT:    sarq $63, %rdi
; FALLBACK15-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK15-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK15-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK15-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK15-NEXT:    andb $24, %sil
; FALLBACK15-NEXT:    movzbl %sil, %eax
; FALLBACK15-NEXT:    movq -48(%rsp,%rax), %rsi
; FALLBACK15-NEXT:    movq -56(%rsp,%rax), %rdi
; FALLBACK15-NEXT:    movq %rdi, %r8
; FALLBACK15-NEXT:    shrdq %cl, %rsi, %r8
; FALLBACK15-NEXT:    movq -72(%rsp,%rax), %r9
; FALLBACK15-NEXT:    movq -64(%rsp,%rax), %rax
; FALLBACK15-NEXT:    movq %rax, %r10
; FALLBACK15-NEXT:    shrdq %cl, %rdi, %r10
; FALLBACK15-NEXT:    shrdq %cl, %rax, %r9
; FALLBACK15-NEXT:    sarxq %rcx, %rsi, %rax
; FALLBACK15-NEXT:    movq %r10, 8(%rdx)
; FALLBACK15-NEXT:    movq %r8, 16(%rdx)
; FALLBACK15-NEXT:    movq %rax, 24(%rdx)
; FALLBACK15-NEXT:    movq %r9, (%rdx)
; FALLBACK15-NEXT:    retq
;
; FALLBACK16-LABEL: ashr_32bytes:
; FALLBACK16:       # %bb.0:
; FALLBACK16-NEXT:    pushl %ebp
; FALLBACK16-NEXT:    pushl %ebx
; FALLBACK16-NEXT:    pushl %edi
; FALLBACK16-NEXT:    pushl %esi
; FALLBACK16-NEXT:    subl $108, %esp
; FALLBACK16-NEXT:    movl {{[0-9]+}}(%esp), %eax
; FALLBACK16-NEXT:    movl {{[0-9]+}}(%esp), %esi
; FALLBACK16-NEXT:    movl (%esi), %ecx
; FALLBACK16-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK16-NEXT:    movl 4(%esi), %ecx
; FALLBACK16-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK16-NEXT:    movl 8(%esi), %ebx
; FALLBACK16-NEXT:    movl 12(%esi), %ebp
; FALLBACK16-NEXT:    movl 16(%esi), %edi
; FALLBACK16-NEXT:    movzbl (%eax), %ecx
; FALLBACK16-NEXT:    movl 20(%esi), %edx
; FALLBACK16-NEXT:    movl 24(%esi), %eax
; FALLBACK16-NEXT:    movl 28(%esi), %esi
; FALLBACK16-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK16-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK16-NEXT:    movl %ecx, %edx
; FALLBACK16-NEXT:    shlb $3, %dl
; FALLBACK16-NEXT:    movl %esi, {{[0-9]+}}(%esp)
; FALLBACK16-NEXT:    movl %edi, {{[0-9]+}}(%esp)
; FALLBACK16-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
; FALLBACK16-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK16-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK16-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK16-NEXT:    sarl $31, %esi
; FALLBACK16-NEXT:    movl %esi, {{[0-9]+}}(%esp)
; FALLBACK16-NEXT:    movl %esi, {{[0-9]+}}(%esp)
; FALLBACK16-NEXT:    movl %esi, {{[0-9]+}}(%esp)
; FALLBACK16-NEXT:    movl %esi, {{[0-9]+}}(%esp)
; FALLBACK16-NEXT:    movl %esi, {{[0-9]+}}(%esp)
; FALLBACK16-NEXT:    movl %esi, {{[0-9]+}}(%esp)
; FALLBACK16-NEXT:    movl %esi, {{[0-9]+}}(%esp)
; FALLBACK16-NEXT:    movl %esi, {{[0-9]+}}(%esp)
; FALLBACK16-NEXT:    andb $28, %cl
; FALLBACK16-NEXT:    movzbl %cl, %edi
; FALLBACK16-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK16-NEXT:    movl 32(%esp,%edi), %esi
; FALLBACK16-NEXT:    movl 36(%esp,%edi), %eax
; FALLBACK16-NEXT:    movl %eax, %ebx
; FALLBACK16-NEXT:    movl %edx, %ecx
; FALLBACK16-NEXT:    shrl %cl, %ebx
; FALLBACK16-NEXT:    movb %dl, %ch
; FALLBACK16-NEXT:    notb %ch
; FALLBACK16-NEXT:    movl 40(%esp,%edi), %edi
; FALLBACK16-NEXT:    leal (%edi,%edi), %ebp
; FALLBACK16-NEXT:    movb %ch, %cl
; FALLBACK16-NEXT:    shll %cl, %ebp
; FALLBACK16-NEXT:    orl %ebx, %ebp
; FALLBACK16-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK16-NEXT:    movb %dl, %cl
; FALLBACK16-NEXT:    shrl %cl, %esi
; FALLBACK16-NEXT:    addl %eax, %eax
; FALLBACK16-NEXT:    movb %ch, %cl
; FALLBACK16-NEXT:    shll %cl, %eax
; FALLBACK16-NEXT:    orl %esi, %eax
; FALLBACK16-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK16-NEXT:    movl 44(%esp,%eax), %ebp
; FALLBACK16-NEXT:    movl %ebp, %esi
; FALLBACK16-NEXT:    movb %dl, %cl
; FALLBACK16-NEXT:    movl %edx, %ebx
; FALLBACK16-NEXT:    shrl %cl, %esi
; FALLBACK16-NEXT:    movl 48(%esp,%eax), %edx
; FALLBACK16-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK16-NEXT:    leal (%edx,%edx), %eax
; FALLBACK16-NEXT:    movb %ch, %cl
; FALLBACK16-NEXT:    shll %cl, %eax
; FALLBACK16-NEXT:    orl %esi, %eax
; FALLBACK16-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK16-NEXT:    movl %ebx, %edx
; FALLBACK16-NEXT:    movb %bl, %cl
; FALLBACK16-NEXT:    shrl %cl, %edi
; FALLBACK16-NEXT:    addl %ebp, %ebp
; FALLBACK16-NEXT:    movb %ch, %cl
; FALLBACK16-NEXT:    shll %cl, %ebp
; FALLBACK16-NEXT:    orl %edi, %ebp
; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; FALLBACK16-NEXT:    movl 52(%esp,%esi), %edi
; FALLBACK16-NEXT:    movl %edi, %eax
; FALLBACK16-NEXT:    movb %bl, %cl
; FALLBACK16-NEXT:    shrl %cl, %eax
; FALLBACK16-NEXT:    movl 56(%esp,%esi), %ebx
; FALLBACK16-NEXT:    leal (%ebx,%ebx), %esi
; FALLBACK16-NEXT:    movb %ch, %cl
; FALLBACK16-NEXT:    shll %cl, %esi
; FALLBACK16-NEXT:    orl %eax, %esi
; FALLBACK16-NEXT:    movb %dl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
; FALLBACK16-NEXT:    movb %dl, %cl
; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK16-NEXT:    shrl %cl, %eax
; FALLBACK16-NEXT:    addl %edi, %edi
; FALLBACK16-NEXT:    movb %ch, %cl
; FALLBACK16-NEXT:    shll %cl, %edi
; FALLBACK16-NEXT:    orl %eax, %edi
; FALLBACK16-NEXT:    movb %dl, %cl
; FALLBACK16-NEXT:    shrl %cl, %ebx
; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK16-NEXT:    movl 60(%esp,%eax), %eax
; FALLBACK16-NEXT:    leal (%eax,%eax), %edx
; FALLBACK16-NEXT:    movb %ch, %cl
; FALLBACK16-NEXT:    shll %cl, %edx
; FALLBACK16-NEXT:    orl %ebx, %edx
; FALLBACK16-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
; FALLBACK16-NEXT:    sarl %cl, %eax
; FALLBACK16-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; FALLBACK16-NEXT:    movl %eax, 28(%ecx)
; FALLBACK16-NEXT:    movl %edx, 24(%ecx)
; FALLBACK16-NEXT:    movl %edi, 16(%ecx)
; FALLBACK16-NEXT:    movl %esi, 20(%ecx)
; FALLBACK16-NEXT:    movl %ebp, 8(%ecx)
; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK16-NEXT:    movl %eax, 12(%ecx)
; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK16-NEXT:    movl %eax, (%ecx)
; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK16-NEXT:    movl %eax, 4(%ecx)
; FALLBACK16-NEXT:    addl $108, %esp
; FALLBACK16-NEXT:    popl %esi
; FALLBACK16-NEXT:    popl %edi
; FALLBACK16-NEXT:    popl %ebx
; FALLBACK16-NEXT:    popl %ebp
; FALLBACK16-NEXT:    retl
;
; FALLBACK17-LABEL: ashr_32bytes:
; FALLBACK17:       # %bb.0:
; FALLBACK17-NEXT:    pushl %ebp
; FALLBACK17-NEXT:    pushl %ebx
; FALLBACK17-NEXT:    pushl %edi
; FALLBACK17-NEXT:    pushl %esi
; FALLBACK17-NEXT:    subl $92, %esp
; FALLBACK17-NEXT:    movl {{[0-9]+}}(%esp), %eax
; FALLBACK17-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; FALLBACK17-NEXT:    movl (%ecx), %edx
; FALLBACK17-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK17-NEXT:    movl 4(%ecx), %edx
; FALLBACK17-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK17-NEXT:    movl 8(%ecx), %edx
; FALLBACK17-NEXT:    movl %edx, (%esp) # 4-byte Spill
; FALLBACK17-NEXT:    movl 12(%ecx), %ebp
; FALLBACK17-NEXT:    movl 16(%ecx), %ebx
; FALLBACK17-NEXT:    movzbl (%eax), %eax
; FALLBACK17-NEXT:    movl 20(%ecx), %edi
; FALLBACK17-NEXT:    movl 24(%ecx), %edx
; FALLBACK17-NEXT:    movl 28(%ecx), %esi
; FALLBACK17-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK17-NEXT:    movl %edi, {{[0-9]+}}(%esp)
; FALLBACK17-NEXT:    movl %eax, %ecx
; FALLBACK17-NEXT:    shlb $3, %cl
; FALLBACK17-NEXT:    movl %esi, {{[0-9]+}}(%esp)
; FALLBACK17-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
; FALLBACK17-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
; FALLBACK17-NEXT:    movl (%esp), %edx # 4-byte Reload
; FALLBACK17-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; FALLBACK17-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; FALLBACK17-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK17-NEXT:    sarl $31, %esi
; FALLBACK17-NEXT:    movl %esi, {{[0-9]+}}(%esp)
; FALLBACK17-NEXT:    movl %esi, {{[0-9]+}}(%esp)
; FALLBACK17-NEXT:    movl %esi, {{[0-9]+}}(%esp)
; FALLBACK17-NEXT:    movl %esi, {{[0-9]+}}(%esp)
; FALLBACK17-NEXT:    movl %esi, {{[0-9]+}}(%esp)
; FALLBACK17-NEXT:    movl %esi, {{[0-9]+}}(%esp)
; FALLBACK17-NEXT:    movl %esi, {{[0-9]+}}(%esp)
; FALLBACK17-NEXT:    movl %esi, {{[0-9]+}}(%esp)
; FALLBACK17-NEXT:    andb $28, %al
; FALLBACK17-NEXT:    movzbl %al, %ebp
; FALLBACK17-NEXT:    movl 24(%esp,%ebp), %edx
; FALLBACK17-NEXT:    movl 20(%esp,%ebp), %eax
; FALLBACK17-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK17-NEXT:    shrdl %cl, %edx, %eax
; FALLBACK17-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK17-NEXT:    movl 32(%esp,%ebp), %ebx
; FALLBACK17-NEXT:    movl 28(%esp,%ebp), %eax
; FALLBACK17-NEXT:    movl %eax, %esi
; FALLBACK17-NEXT:    shrdl %cl, %ebx, %esi
; FALLBACK17-NEXT:    movl %esi, (%esp) # 4-byte Spill
; FALLBACK17-NEXT:    shrdl %cl, %eax, %edx
; FALLBACK17-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK17-NEXT:    movl 40(%esp,%ebp), %edx
; FALLBACK17-NEXT:    movl 36(%esp,%ebp), %eax
; FALLBACK17-NEXT:    movl %eax, %edi
; FALLBACK17-NEXT:    shrdl %cl, %edx, %edi
; FALLBACK17-NEXT:    shrdl %cl, %eax, %ebx
; FALLBACK17-NEXT:    movl 16(%esp,%ebp), %esi
; FALLBACK17-NEXT:    movl 44(%esp,%ebp), %eax
; FALLBACK17-NEXT:    shrdl %cl, %eax, %edx
; FALLBACK17-NEXT:    movl {{[0-9]+}}(%esp), %ebp
; FALLBACK17-NEXT:    movl %edx, 24(%ebp)
; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; FALLBACK17-NEXT:    shrdl %cl, %edx, %esi
; FALLBACK17-NEXT:    sarl %cl, %eax
; FALLBACK17-NEXT:    movl %eax, 28(%ebp)
; FALLBACK17-NEXT:    movl %ebx, 16(%ebp)
; FALLBACK17-NEXT:    movl %edi, 20(%ebp)
; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK17-NEXT:    movl %eax, 8(%ebp)
; FALLBACK17-NEXT:    movl (%esp), %eax # 4-byte Reload
; FALLBACK17-NEXT:    movl %eax, 12(%ebp)
; FALLBACK17-NEXT:    movl %esi, (%ebp)
; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK17-NEXT:    movl %eax, 4(%ebp)
; FALLBACK17-NEXT:    addl $92, %esp
; FALLBACK17-NEXT:    popl %esi
; FALLBACK17-NEXT:    popl %edi
; FALLBACK17-NEXT:    popl %ebx
; FALLBACK17-NEXT:    popl %ebp
; FALLBACK17-NEXT:    retl
;
; FALLBACK18-LABEL: ashr_32bytes:
; FALLBACK18:       # %bb.0:
; FALLBACK18-NEXT:    pushl %ebp
; FALLBACK18-NEXT:    pushl %ebx
; FALLBACK18-NEXT:    pushl %edi
; FALLBACK18-NEXT:    pushl %esi
; FALLBACK18-NEXT:    subl $108, %esp
; FALLBACK18-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; FALLBACK18-NEXT:    movl {{[0-9]+}}(%esp), %esi
; FALLBACK18-NEXT:    movl (%esi), %eax
; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK18-NEXT:    movl 4(%esi), %eax
; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK18-NEXT:    movl 8(%esi), %ebx
; FALLBACK18-NEXT:    movl 12(%esi), %ebp
; FALLBACK18-NEXT:    movl 16(%esi), %edi
; FALLBACK18-NEXT:    movzbl (%ecx), %ecx
; FALLBACK18-NEXT:    movl 20(%esi), %edx
; FALLBACK18-NEXT:    movl 24(%esi), %eax
; FALLBACK18-NEXT:    movl 28(%esi), %esi
; FALLBACK18-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK18-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK18-NEXT:    movl %edi, {{[0-9]+}}(%esp)
; FALLBACK18-NEXT:    movl %ecx, %eax
; FALLBACK18-NEXT:    shlb $3, %al
; FALLBACK18-NEXT:    movl %esi, {{[0-9]+}}(%esp)
; FALLBACK18-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
; FALLBACK18-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; FALLBACK18-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; FALLBACK18-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK18-NEXT:    sarl $31, %esi
; FALLBACK18-NEXT:    movl %esi, {{[0-9]+}}(%esp)
; FALLBACK18-NEXT:    movl %esi, {{[0-9]+}}(%esp)
; FALLBACK18-NEXT:    movl %esi, {{[0-9]+}}(%esp)
; FALLBACK18-NEXT:    movl %esi, {{[0-9]+}}(%esp)
; FALLBACK18-NEXT:    movl %esi, {{[0-9]+}}(%esp)
; FALLBACK18-NEXT:    movl %esi, {{[0-9]+}}(%esp)
; FALLBACK18-NEXT:    movl %esi, {{[0-9]+}}(%esp)
; FALLBACK18-NEXT:    movl %esi, {{[0-9]+}}(%esp)
; FALLBACK18-NEXT:    andb $28, %cl
; FALLBACK18-NEXT:    movzbl %cl, %edi
; FALLBACK18-NEXT:    movl 36(%esp,%edi), %esi
; FALLBACK18-NEXT:    movl 40(%esp,%edi), %ecx
; FALLBACK18-NEXT:    shrxl %eax, %esi, %ebx
; FALLBACK18-NEXT:    movl %eax, %edx
; FALLBACK18-NEXT:    notb %dl
; FALLBACK18-NEXT:    leal (%ecx,%ecx), %ebp
; FALLBACK18-NEXT:    shlxl %edx, %ebp, %ebp
; FALLBACK18-NEXT:    orl %ebx, %ebp
; FALLBACK18-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK18-NEXT:    shrxl %eax, 32(%esp,%edi), %ebx
; FALLBACK18-NEXT:    addl %esi, %esi
; FALLBACK18-NEXT:    shlxl %edx, %esi, %esi
; FALLBACK18-NEXT:    orl %ebx, %esi
; FALLBACK18-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK18-NEXT:    movl 48(%esp,%edi), %esi
; FALLBACK18-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK18-NEXT:    leal (%esi,%esi), %ebx
; FALLBACK18-NEXT:    shlxl %edx, %ebx, %esi
; FALLBACK18-NEXT:    movl 44(%esp,%edi), %ebp
; FALLBACK18-NEXT:    shrxl %eax, %ebp, %ebx
; FALLBACK18-NEXT:    orl %ebx, %esi
; FALLBACK18-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK18-NEXT:    shrxl %eax, %ecx, %ecx
; FALLBACK18-NEXT:    movl %eax, %ebx
; FALLBACK18-NEXT:    addl %ebp, %ebp
; FALLBACK18-NEXT:    shlxl %edx, %ebp, %eax
; FALLBACK18-NEXT:    orl %ecx, %eax
; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK18-NEXT:    movl 56(%esp,%edi), %ebp
; FALLBACK18-NEXT:    leal (%ebp,%ebp), %ecx
; FALLBACK18-NEXT:    shlxl %edx, %ecx, %ecx
; FALLBACK18-NEXT:    movl 52(%esp,%edi), %eax
; FALLBACK18-NEXT:    shrxl %ebx, %eax, %esi
; FALLBACK18-NEXT:    orl %esi, %ecx
; FALLBACK18-NEXT:    shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
; FALLBACK18-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK18-NEXT:    addl %eax, %eax
; FALLBACK18-NEXT:    shlxl %edx, %eax, %esi
; FALLBACK18-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
; FALLBACK18-NEXT:    shrxl %ebx, %ebp, %eax
; FALLBACK18-NEXT:    movl 60(%esp,%edi), %edi
; FALLBACK18-NEXT:    sarxl %ebx, %edi, %ebx
; FALLBACK18-NEXT:    addl %edi, %edi
; FALLBACK18-NEXT:    shlxl %edx, %edi, %edx
; FALLBACK18-NEXT:    orl %eax, %edx
; FALLBACK18-NEXT:    movl {{[0-9]+}}(%esp), %eax
; FALLBACK18-NEXT:    movl %ebx, 28(%eax)
; FALLBACK18-NEXT:    movl %edx, 24(%eax)
; FALLBACK18-NEXT:    movl %esi, 16(%eax)
; FALLBACK18-NEXT:    movl %ecx, 20(%eax)
; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK18-NEXT:    movl %ecx, 8(%eax)
; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK18-NEXT:    movl %ecx, 12(%eax)
; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK18-NEXT:    movl %ecx, (%eax)
; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK18-NEXT:    movl %ecx, 4(%eax)
; FALLBACK18-NEXT:    addl $108, %esp
; FALLBACK18-NEXT:    popl %esi
; FALLBACK18-NEXT:    popl %edi
; FALLBACK18-NEXT:    popl %ebx
; FALLBACK18-NEXT:    popl %ebp
; FALLBACK18-NEXT:    retl
;
; Expected 32-bit x86 output for ashr_32bytes under this check prefix (the
; matching RUN line is outside this excerpt). Visible shape: the eight source
; dwords are loaded with scalar moves and written to the stack together with
; eight copies of the sign word; a dword-aligned byte offset then selects the
; window that is read back, adjacent dwords are combined with shrdl, and the
; top dword is produced with sarxl. Offsets written as {{...}} are wildcards.
; FALLBACK19-LABEL: ashr_32bytes:
; FALLBACK19:       # %bb.0:
; FALLBACK19-NEXT:    pushl %ebp
; FALLBACK19-NEXT:    pushl %ebx
; FALLBACK19-NEXT:    pushl %edi
; FALLBACK19-NEXT:    pushl %esi
; FALLBACK19-NEXT:    subl $92, %esp
; Two pointers come from the stack: %ecx is read as the 32-byte source, %eax
; as the location of the one-byte shift amount.
; FALLBACK19-NEXT:    movl {{[0-9]+}}(%esp), %eax
; FALLBACK19-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; FALLBACK19-NEXT:    movl (%ecx), %edx
; FALLBACK19-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK19-NEXT:    movl 4(%ecx), %edx
; FALLBACK19-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK19-NEXT:    movl 8(%ecx), %edx
; FALLBACK19-NEXT:    movl %edx, (%esp) # 4-byte Spill
; FALLBACK19-NEXT:    movl 12(%ecx), %ebp
; FALLBACK19-NEXT:    movl 16(%ecx), %ebx
; FALLBACK19-NEXT:    movzbl (%eax), %eax
; FALLBACK19-NEXT:    movl 20(%ecx), %edi
; FALLBACK19-NEXT:    movl 24(%ecx), %edx
; FALLBACK19-NEXT:    movl 28(%ecx), %esi
; FALLBACK19-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK19-NEXT:    movl %edi, {{[0-9]+}}(%esp)
; FALLBACK19-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
; %cl = shift byte * 8 (shlb $3); %al keeps the unscaled byte for the offset
; computed further down.
; FALLBACK19-NEXT:    movl %eax, %ecx
; FALLBACK19-NEXT:    shlb $3, %cl
; FALLBACK19-NEXT:    movl %esi, {{[0-9]+}}(%esp)
; FALLBACK19-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
; FALLBACK19-NEXT:    movl (%esp), %edx # 4-byte Reload
; FALLBACK19-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; FALLBACK19-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; FALLBACK19-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; %esi holds the top source dword (28(%ecx)); sarl $31 turns it into the sign
; word, which is stored to eight stack slots.
; FALLBACK19-NEXT:    sarl $31, %esi
; FALLBACK19-NEXT:    movl %esi, {{[0-9]+}}(%esp)
; FALLBACK19-NEXT:    movl %esi, {{[0-9]+}}(%esp)
; FALLBACK19-NEXT:    movl %esi, {{[0-9]+}}(%esp)
; FALLBACK19-NEXT:    movl %esi, {{[0-9]+}}(%esp)
; FALLBACK19-NEXT:    movl %esi, {{[0-9]+}}(%esp)
; FALLBACK19-NEXT:    movl %esi, {{[0-9]+}}(%esp)
; FALLBACK19-NEXT:    movl %esi, {{[0-9]+}}(%esp)
; FALLBACK19-NEXT:    movl %esi, {{[0-9]+}}(%esp)
; %ebp = shift byte & 28, a multiple of four used to index the eight dwords
; read from 16(%esp,%ebp) through 44(%esp,%ebp).
; FALLBACK19-NEXT:    andb $28, %al
; FALLBACK19-NEXT:    movzbl %al, %ebp
; FALLBACK19-NEXT:    movl 24(%esp,%ebp), %esi
; FALLBACK19-NEXT:    movl 20(%esp,%ebp), %eax
; FALLBACK19-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK19-NEXT:    shrdl %cl, %esi, %eax
; FALLBACK19-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK19-NEXT:    movl 32(%esp,%ebp), %ebx
; FALLBACK19-NEXT:    movl 28(%esp,%ebp), %eax
; FALLBACK19-NEXT:    movl %eax, %edx
; FALLBACK19-NEXT:    shrdl %cl, %ebx, %edx
; FALLBACK19-NEXT:    movl %edx, (%esp) # 4-byte Spill
; FALLBACK19-NEXT:    shrdl %cl, %eax, %esi
; FALLBACK19-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK19-NEXT:    movl 40(%esp,%ebp), %eax
; FALLBACK19-NEXT:    movl 36(%esp,%ebp), %edx
; FALLBACK19-NEXT:    movl %edx, %esi
; FALLBACK19-NEXT:    shrdl %cl, %eax, %esi
; FALLBACK19-NEXT:    shrdl %cl, %edx, %ebx
; FALLBACK19-NEXT:    movl 16(%esp,%ebp), %edx
; FALLBACK19-NEXT:    movl 44(%esp,%ebp), %edi
; FALLBACK19-NEXT:    shrdl %cl, %edi, %eax
; %ebp is reloaded with the pointer the eight result dwords are stored
; through; the top dword (offset 28) comes from sarxl.
; FALLBACK19-NEXT:    movl {{[0-9]+}}(%esp), %ebp
; FALLBACK19-NEXT:    movl %eax, 24(%ebp)
; FALLBACK19-NEXT:    sarxl %ecx, %edi, %eax
; FALLBACK19-NEXT:    movl %eax, 28(%ebp)
; FALLBACK19-NEXT:    movl %ebx, 16(%ebp)
; FALLBACK19-NEXT:    movl %esi, 20(%ebp)
; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK19-NEXT:    movl %eax, 8(%ebp)
; FALLBACK19-NEXT:    movl (%esp), %eax # 4-byte Reload
; FALLBACK19-NEXT:    movl %eax, 12(%ebp)
; FALLBACK19-NEXT:    # kill: def $cl killed $cl killed $ecx
; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK19-NEXT:    shrdl %cl, %eax, %edx
; FALLBACK19-NEXT:    movl %edx, (%ebp)
; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK19-NEXT:    movl %eax, 4(%ebp)
; FALLBACK19-NEXT:    addl $92, %esp
; FALLBACK19-NEXT:    popl %esi
; FALLBACK19-NEXT:    popl %edi
; FALLBACK19-NEXT:    popl %ebx
; FALLBACK19-NEXT:    popl %ebp
; FALLBACK19-NEXT:    retl
;
; Expected 32-bit x86 output for ashr_32bytes under this check prefix (the
; matching RUN line is outside this excerpt). Visible shape: the low 16 source
; bytes travel through %xmm0 (movups load, movaps store) and the upper four
; dwords through GPRs; the value and eight copies of its sign word are written
; to the stack, and a dword-aligned byte offset selects the window read back.
; No double-shift instruction is used here: every result dword below the top
; one is (lo >> count) | ((hi + hi) << ~count). x86 masks 32-bit shift counts
; to five bits, so the second term equals hi << (32 - count) and is zero when
; count is zero. Offsets written as {{...}} are wildcards.
; FALLBACK20-LABEL: ashr_32bytes:
; FALLBACK20:       # %bb.0:
; FALLBACK20-NEXT:    pushl %ebp
; FALLBACK20-NEXT:    pushl %ebx
; FALLBACK20-NEXT:    pushl %edi
; FALLBACK20-NEXT:    pushl %esi
; FALLBACK20-NEXT:    subl $108, %esp
; Two pointers come from the stack: %ecx is read as the 32-byte source, %eax
; as the location of the one-byte shift amount.
; FALLBACK20-NEXT:    movl {{[0-9]+}}(%esp), %eax
; FALLBACK20-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; FALLBACK20-NEXT:    movups (%ecx), %xmm0
; FALLBACK20-NEXT:    movl 16(%ecx), %esi
; FALLBACK20-NEXT:    movl 20(%ecx), %edi
; FALLBACK20-NEXT:    movl 24(%ecx), %ebx
; FALLBACK20-NEXT:    movl 28(%ecx), %edx
; FALLBACK20-NEXT:    movzbl (%eax), %eax
; %cl = shift byte * 8 (shlb $3); %al keeps the unscaled byte for the offset
; computed further down.
; FALLBACK20-NEXT:    movl %eax, %ecx
; FALLBACK20-NEXT:    shlb $3, %cl
; FALLBACK20-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK20-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
; FALLBACK20-NEXT:    movl %edi, {{[0-9]+}}(%esp)
; FALLBACK20-NEXT:    movl %esi, {{[0-9]+}}(%esp)
; FALLBACK20-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
; %edx holds the top source dword (28(%ecx)); sarl $31 turns it into the sign
; word, which is stored to eight stack slots.
; FALLBACK20-NEXT:    sarl $31, %edx
; FALLBACK20-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK20-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK20-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK20-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK20-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK20-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK20-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK20-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; %edi = shift byte & 28, a multiple of four used to index the eight dwords
; read from 32(%esp,%edi) through 60(%esp,%edi).
; FALLBACK20-NEXT:    andb $28, %al
; FALLBACK20-NEXT:    movzbl %al, %edi
; FALLBACK20-NEXT:    movl 32(%esp,%edi), %eax
; FALLBACK20-NEXT:    movl 36(%esp,%edi), %esi
; FALLBACK20-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK20-NEXT:    shrl %cl, %eax
; After the next three instructions %dh holds the bit count and %dl its
; bitwise complement; each is copied into %cl right before the shift using it.
; FALLBACK20-NEXT:    movl %ecx, %edx
; FALLBACK20-NEXT:    movb %cl, %dh
; FALLBACK20-NEXT:    notb %dl
; FALLBACK20-NEXT:    addl %esi, %esi
; FALLBACK20-NEXT:    movl %edx, %ecx
; FALLBACK20-NEXT:    shll %cl, %esi
; FALLBACK20-NEXT:    orl %eax, %esi
; FALLBACK20-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK20-NEXT:    movl 44(%esp,%edi), %ebx
; FALLBACK20-NEXT:    movl %ebx, %eax
; FALLBACK20-NEXT:    movb %dh, %cl
; FALLBACK20-NEXT:    shrl %cl, %eax
; FALLBACK20-NEXT:    movl 48(%esp,%edi), %esi
; FALLBACK20-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK20-NEXT:    addl %esi, %esi
; FALLBACK20-NEXT:    movl %edx, %ecx
; FALLBACK20-NEXT:    shll %cl, %esi
; FALLBACK20-NEXT:    orl %eax, %esi
; FALLBACK20-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK20-NEXT:    movl 40(%esp,%edi), %esi
; FALLBACK20-NEXT:    movl %esi, %eax
; FALLBACK20-NEXT:    movb %dh, %cl
; FALLBACK20-NEXT:    shrl %cl, %eax
; FALLBACK20-NEXT:    addl %ebx, %ebx
; FALLBACK20-NEXT:    movl %edx, %ecx
; FALLBACK20-NEXT:    shll %cl, %ebx
; FALLBACK20-NEXT:    orl %eax, %ebx
; FALLBACK20-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK20-NEXT:    movl 52(%esp,%edi), %ebp
; FALLBACK20-NEXT:    movl %ebp, %eax
; FALLBACK20-NEXT:    movb %dh, %cl
; FALLBACK20-NEXT:    shrl %cl, %eax
; FALLBACK20-NEXT:    movl 56(%esp,%edi), %ecx
; FALLBACK20-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK20-NEXT:    leal (%ecx,%ecx), %ebx
; FALLBACK20-NEXT:    movl %edx, %ecx
; FALLBACK20-NEXT:    shll %cl, %ebx
; FALLBACK20-NEXT:    orl %eax, %ebx
; FALLBACK20-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK20-NEXT:    movb %dh, %cl
; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK20-NEXT:    shrl %cl, %eax
; FALLBACK20-NEXT:    addl %ebp, %ebp
; FALLBACK20-NEXT:    movl %edx, %ecx
; FALLBACK20-NEXT:    shll %cl, %ebp
; FALLBACK20-NEXT:    orl %eax, %ebp
; FALLBACK20-NEXT:    movb %dh, %cl
; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
; FALLBACK20-NEXT:    shrl %cl, %ebx
; FALLBACK20-NEXT:    movl 60(%esp,%edi), %eax
; FALLBACK20-NEXT:    leal (%eax,%eax), %edi
; FALLBACK20-NEXT:    movl %edx, %ecx
; FALLBACK20-NEXT:    shll %cl, %edi
; FALLBACK20-NEXT:    orl %ebx, %edi
; FALLBACK20-NEXT:    movb %dh, %cl
; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
; FALLBACK20-NEXT:    shrl %cl, %ebx
; FALLBACK20-NEXT:    addl %esi, %esi
; FALLBACK20-NEXT:    movl %edx, %ecx
; FALLBACK20-NEXT:    shll %cl, %esi
; FALLBACK20-NEXT:    orl %ebx, %esi
; The top dword (loaded from 60(%esp,%edi)) gets an arithmetic shift, then
; %ecx is reloaded with the pointer the eight result dwords are stored through.
; FALLBACK20-NEXT:    movb %dh, %cl
; FALLBACK20-NEXT:    sarl %cl, %eax
; FALLBACK20-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; FALLBACK20-NEXT:    movl %eax, 28(%ecx)
; FALLBACK20-NEXT:    movl %esi, 4(%ecx)
; FALLBACK20-NEXT:    movl %edi, 24(%ecx)
; FALLBACK20-NEXT:    movl %ebp, 16(%ecx)
; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK20-NEXT:    movl %eax, 20(%ecx)
; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK20-NEXT:    movl %eax, 8(%ecx)
; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK20-NEXT:    movl %eax, 12(%ecx)
; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK20-NEXT:    movl %eax, (%ecx)
; FALLBACK20-NEXT:    addl $108, %esp
; FALLBACK20-NEXT:    popl %esi
; FALLBACK20-NEXT:    popl %edi
; FALLBACK20-NEXT:    popl %ebx
; FALLBACK20-NEXT:    popl %ebp
; FALLBACK20-NEXT:    retl
;
; Expected 32-bit x86 output for ashr_32bytes under this check prefix (the
; matching RUN line is outside this excerpt). Visible shape: the low 16 source
; bytes travel through %xmm0 (movups load, movaps store) and the upper four
; dwords through GPRs; the value and eight copies of its sign word are written
; to the stack, a dword-aligned byte offset selects the window read back,
; adjacent dwords are combined with shrdl, and the top dword is shifted with
; sarl. Offsets written as {{...}} are wildcards.
; FALLBACK21-LABEL: ashr_32bytes:
; FALLBACK21:       # %bb.0:
; FALLBACK21-NEXT:    pushl %ebp
; FALLBACK21-NEXT:    pushl %ebx
; FALLBACK21-NEXT:    pushl %edi
; FALLBACK21-NEXT:    pushl %esi
; FALLBACK21-NEXT:    subl $108, %esp
; Two pointers come from the stack: %ecx is read as the 32-byte source, %eax
; as the location of the one-byte shift amount.
; FALLBACK21-NEXT:    movl {{[0-9]+}}(%esp), %eax
; FALLBACK21-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; FALLBACK21-NEXT:    movups (%ecx), %xmm0
; FALLBACK21-NEXT:    movl 16(%ecx), %esi
; FALLBACK21-NEXT:    movl 20(%ecx), %edi
; FALLBACK21-NEXT:    movl 24(%ecx), %ebx
; FALLBACK21-NEXT:    movl 28(%ecx), %edx
; FALLBACK21-NEXT:    movzbl (%eax), %eax
; %cl = shift byte * 8 (shlb $3); %al keeps the unscaled byte for the offset
; computed further down.
; FALLBACK21-NEXT:    movl %eax, %ecx
; FALLBACK21-NEXT:    shlb $3, %cl
; FALLBACK21-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK21-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
; FALLBACK21-NEXT:    movl %edi, {{[0-9]+}}(%esp)
; FALLBACK21-NEXT:    movl %esi, {{[0-9]+}}(%esp)
; FALLBACK21-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
; %edx holds the top source dword (28(%ecx)); sarl $31 turns it into the sign
; word, which is stored to eight stack slots.
; FALLBACK21-NEXT:    sarl $31, %edx
; FALLBACK21-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK21-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK21-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK21-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK21-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK21-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK21-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK21-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; %ebp = shift byte & 28, a multiple of four used to index the eight dwords
; read from 32(%esp,%ebp) through 60(%esp,%ebp).
; FALLBACK21-NEXT:    andb $28, %al
; FALLBACK21-NEXT:    movzbl %al, %ebp
; FALLBACK21-NEXT:    movl 48(%esp,%ebp), %esi
; FALLBACK21-NEXT:    movl 44(%esp,%ebp), %eax
; FALLBACK21-NEXT:    movl %eax, %edx
; FALLBACK21-NEXT:    shrdl %cl, %esi, %edx
; FALLBACK21-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK21-NEXT:    movl 40(%esp,%ebp), %edx
; FALLBACK21-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK21-NEXT:    shrdl %cl, %eax, %edx
; FALLBACK21-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK21-NEXT:    movl 56(%esp,%ebp), %ebx
; FALLBACK21-NEXT:    movl 52(%esp,%ebp), %eax
; FALLBACK21-NEXT:    movl %eax, %edx
; FALLBACK21-NEXT:    shrdl %cl, %ebx, %edx
; FALLBACK21-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK21-NEXT:    shrdl %cl, %eax, %esi
; FALLBACK21-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK21-NEXT:    movl 60(%esp,%ebp), %eax
; FALLBACK21-NEXT:    shrdl %cl, %eax, %ebx
; FALLBACK21-NEXT:    movl 32(%esp,%ebp), %edx
; FALLBACK21-NEXT:    movl 36(%esp,%ebp), %edi
; FALLBACK21-NEXT:    movl %edi, %esi
; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
; FALLBACK21-NEXT:    shrdl %cl, %ebp, %esi
; %ebp is reloaded with the pointer the eight result dwords are stored
; through; the top dword (offset 28) comes from sarl.
; FALLBACK21-NEXT:    movl {{[0-9]+}}(%esp), %ebp
; FALLBACK21-NEXT:    movl %esi, 4(%ebp)
; FALLBACK21-NEXT:    movl %ebx, 24(%ebp)
; FALLBACK21-NEXT:    shrdl %cl, %edi, %edx
; FALLBACK21-NEXT:    sarl %cl, %eax
; FALLBACK21-NEXT:    movl %eax, 28(%ebp)
; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK21-NEXT:    movl %eax, 16(%ebp)
; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK21-NEXT:    movl %eax, 20(%ebp)
; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK21-NEXT:    movl %eax, 8(%ebp)
; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK21-NEXT:    movl %eax, 12(%ebp)
; FALLBACK21-NEXT:    movl %edx, (%ebp)
; FALLBACK21-NEXT:    addl $108, %esp
; FALLBACK21-NEXT:    popl %esi
; FALLBACK21-NEXT:    popl %edi
; FALLBACK21-NEXT:    popl %ebx
; FALLBACK21-NEXT:    popl %ebp
; FALLBACK21-NEXT:    retl
;
; Expected 32-bit x86 output for ashr_32bytes under this check prefix (the
; matching RUN line is outside this excerpt). Visible shape: the low 16 source
; bytes travel through %xmm0 (movups load, movaps store) and the upper four
; dwords through GPRs; the value and eight copies of its sign word are written
; to the stack, and a dword-aligned byte offset selects the window read back.
; Shifts use the three-operand shrxl/shlxl/sarxl forms: every result dword
; below the top one is (lo >> count) | ((hi + hi) << ~count), and the top
; dword comes from sarxl. Offsets written as {{...}} are wildcards.
; FALLBACK22-LABEL: ashr_32bytes:
; FALLBACK22:       # %bb.0:
; FALLBACK22-NEXT:    pushl %ebp
; FALLBACK22-NEXT:    pushl %ebx
; FALLBACK22-NEXT:    pushl %edi
; FALLBACK22-NEXT:    pushl %esi
; FALLBACK22-NEXT:    subl $108, %esp
; Two pointers come from the stack: %ecx is read as the 32-byte source, %eax
; as the location of the one-byte shift amount.
; FALLBACK22-NEXT:    movl {{[0-9]+}}(%esp), %eax
; FALLBACK22-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; FALLBACK22-NEXT:    movups (%ecx), %xmm0
; FALLBACK22-NEXT:    movl 16(%ecx), %esi
; FALLBACK22-NEXT:    movl 20(%ecx), %edi
; FALLBACK22-NEXT:    movl 24(%ecx), %ebx
; FALLBACK22-NEXT:    movl 28(%ecx), %edx
; Here the roles are swapped relative to the %cl-based variants: %al becomes
; shift byte * 8 (shlb $3) and %cl keeps the unscaled byte for the offset.
; FALLBACK22-NEXT:    movzbl (%eax), %ecx
; FALLBACK22-NEXT:    movl %ecx, %eax
; FALLBACK22-NEXT:    shlb $3, %al
; FALLBACK22-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK22-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
; FALLBACK22-NEXT:    movl %edi, {{[0-9]+}}(%esp)
; FALLBACK22-NEXT:    movl %esi, {{[0-9]+}}(%esp)
; FALLBACK22-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
; %edx holds the top source dword (28(%ecx)); sarl $31 turns it into the sign
; word, which is stored to eight stack slots.
; FALLBACK22-NEXT:    sarl $31, %edx
; FALLBACK22-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK22-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK22-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK22-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK22-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK22-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK22-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK22-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; %edi = shift byte & 28, a multiple of four used to index the eight dwords
; read from 32(%esp,%edi) through 60(%esp,%edi).
; FALLBACK22-NEXT:    andb $28, %cl
; FALLBACK22-NEXT:    movzbl %cl, %edi
; FALLBACK22-NEXT:    shrxl %eax, 32(%esp,%edi), %ecx
; %edx = copy of the bit count with its low byte complemented (notb %dl); it
; is the count operand of every shlxl below.
; FALLBACK22-NEXT:    movl %eax, %edx
; FALLBACK22-NEXT:    notb %dl
; FALLBACK22-NEXT:    movl 36(%esp,%edi), %esi
; FALLBACK22-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK22-NEXT:    addl %esi, %esi
; FALLBACK22-NEXT:    shlxl %edx, %esi, %esi
; FALLBACK22-NEXT:    orl %ecx, %esi
; FALLBACK22-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK22-NEXT:    movl 48(%esp,%edi), %ecx
; FALLBACK22-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK22-NEXT:    addl %ecx, %ecx
; FALLBACK22-NEXT:    shlxl %edx, %ecx, %esi
; FALLBACK22-NEXT:    movl 44(%esp,%edi), %ecx
; FALLBACK22-NEXT:    shrxl %eax, %ecx, %ebx
; FALLBACK22-NEXT:    orl %ebx, %esi
; FALLBACK22-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK22-NEXT:    addl %ecx, %ecx
; FALLBACK22-NEXT:    shlxl %edx, %ecx, %esi
; FALLBACK22-NEXT:    movl 40(%esp,%edi), %ecx
; FALLBACK22-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK22-NEXT:    shrxl %eax, %ecx, %ebx
; The bit count is copied into %ecx because %eax is reused as a temporary
; below; it is copied back into %eax a few instructions later.
; FALLBACK22-NEXT:    movl %eax, %ecx
; FALLBACK22-NEXT:    orl %ebx, %esi
; FALLBACK22-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK22-NEXT:    movl 56(%esp,%edi), %esi
; FALLBACK22-NEXT:    leal (%esi,%esi), %ebx
; FALLBACK22-NEXT:    shlxl %edx, %ebx, %eax
; FALLBACK22-NEXT:    movl 52(%esp,%edi), %ebx
; FALLBACK22-NEXT:    shrxl %ecx, %ebx, %ebp
; FALLBACK22-NEXT:    orl %ebp, %eax
; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK22-NEXT:    movl %ecx, %eax
; FALLBACK22-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
; FALLBACK22-NEXT:    addl %ebx, %ebx
; FALLBACK22-NEXT:    shlxl %edx, %ebx, %ebx
; FALLBACK22-NEXT:    orl %ebp, %ebx
; FALLBACK22-NEXT:    shrxl %ecx, %esi, %ecx
; FALLBACK22-NEXT:    shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
; The top dword (60(%esp,%edi)) is shifted arithmetically with sarxl before
; its doubled copy feeds the dword below it.
; FALLBACK22-NEXT:    movl 60(%esp,%edi), %edi
; FALLBACK22-NEXT:    sarxl %eax, %edi, %eax
; FALLBACK22-NEXT:    addl %edi, %edi
; FALLBACK22-NEXT:    shlxl %edx, %edi, %edi
; FALLBACK22-NEXT:    orl %ecx, %edi
; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK22-NEXT:    addl %ecx, %ecx
; FALLBACK22-NEXT:    shlxl %edx, %ecx, %ecx
; FALLBACK22-NEXT:    orl %esi, %ecx
; %edx is reloaded with the pointer the eight result dwords are stored through.
; FALLBACK22-NEXT:    movl {{[0-9]+}}(%esp), %edx
; FALLBACK22-NEXT:    movl %eax, 28(%edx)
; FALLBACK22-NEXT:    movl %ecx, 4(%edx)
; FALLBACK22-NEXT:    movl %edi, 24(%edx)
; FALLBACK22-NEXT:    movl %ebx, 16(%edx)
; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK22-NEXT:    movl %eax, 20(%edx)
; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK22-NEXT:    movl %eax, 8(%edx)
; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK22-NEXT:    movl %eax, 12(%edx)
; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK22-NEXT:    movl %eax, (%edx)
; FALLBACK22-NEXT:    addl $108, %esp
; FALLBACK22-NEXT:    popl %esi
; FALLBACK22-NEXT:    popl %edi
; FALLBACK22-NEXT:    popl %ebx
; FALLBACK22-NEXT:    popl %ebp
; FALLBACK22-NEXT:    retl
;
; Expected 32-bit x86 output for ashr_32bytes under this check prefix (the
; matching RUN line is outside this excerpt). Visible shape: the low 16 source
; bytes travel through %xmm0 (movups load, movaps store) and the upper four
; dwords through GPRs; the value and eight copies of its sign word are written
; to the stack, a dword-aligned byte offset selects the window read back,
; adjacent dwords are combined with shrdl, and the top dword is produced by a
; sarxl whose source is a folded stack reload. Offsets written as {{...}} are
; wildcards.
; FALLBACK23-LABEL: ashr_32bytes:
; FALLBACK23:       # %bb.0:
; FALLBACK23-NEXT:    pushl %ebp
; FALLBACK23-NEXT:    pushl %ebx
; FALLBACK23-NEXT:    pushl %edi
; FALLBACK23-NEXT:    pushl %esi
; FALLBACK23-NEXT:    subl $108, %esp
; Two pointers come from the stack: %ecx is read as the 32-byte source, %eax
; as the location of the one-byte shift amount.
; FALLBACK23-NEXT:    movl {{[0-9]+}}(%esp), %eax
; FALLBACK23-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; FALLBACK23-NEXT:    movups (%ecx), %xmm0
; FALLBACK23-NEXT:    movl 16(%ecx), %esi
; FALLBACK23-NEXT:    movl 20(%ecx), %edi
; FALLBACK23-NEXT:    movl 24(%ecx), %ebx
; FALLBACK23-NEXT:    movl 28(%ecx), %edx
; FALLBACK23-NEXT:    movzbl (%eax), %eax
; %cl = shift byte * 8 (shlb $3); %al keeps the unscaled byte for the offset
; computed further down.
; FALLBACK23-NEXT:    movl %eax, %ecx
; FALLBACK23-NEXT:    shlb $3, %cl
; FALLBACK23-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK23-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
; FALLBACK23-NEXT:    movl %edi, {{[0-9]+}}(%esp)
; FALLBACK23-NEXT:    movl %esi, {{[0-9]+}}(%esp)
; FALLBACK23-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
; %edx holds the top source dword (28(%ecx)); sarl $31 turns it into the sign
; word, which is stored to eight stack slots.
; FALLBACK23-NEXT:    sarl $31, %edx
; FALLBACK23-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK23-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK23-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK23-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK23-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK23-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK23-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK23-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; %ebx = shift byte & 28, a multiple of four used to index the eight dwords
; read from 32(%esp,%ebx) through 60(%esp,%ebx).
; FALLBACK23-NEXT:    andb $28, %al
; FALLBACK23-NEXT:    movzbl %al, %ebx
; FALLBACK23-NEXT:    movl 48(%esp,%ebx), %esi
; FALLBACK23-NEXT:    movl 44(%esp,%ebx), %eax
; FALLBACK23-NEXT:    movl %eax, %edx
; FALLBACK23-NEXT:    shrdl %cl, %esi, %edx
; FALLBACK23-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK23-NEXT:    movl 40(%esp,%ebx), %edx
; FALLBACK23-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK23-NEXT:    shrdl %cl, %eax, %edx
; FALLBACK23-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK23-NEXT:    movl 56(%esp,%ebx), %ebp
; FALLBACK23-NEXT:    movl 52(%esp,%ebx), %eax
; FALLBACK23-NEXT:    movl %eax, %edi
; FALLBACK23-NEXT:    shrdl %cl, %ebp, %edi
; FALLBACK23-NEXT:    shrdl %cl, %eax, %esi
; FALLBACK23-NEXT:    movl 60(%esp,%ebx), %eax
; FALLBACK23-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK23-NEXT:    shrdl %cl, %eax, %ebp
; FALLBACK23-NEXT:    movl 32(%esp,%ebx), %edx
; FALLBACK23-NEXT:    movl 36(%esp,%ebx), %ebx
; FALLBACK23-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK23-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK23-NEXT:    shrdl %cl, %eax, %ebx
; %eax is reloaded with the pointer the eight result dwords are stored
; through; the top dword (offset 28) comes from sarxl.
; FALLBACK23-NEXT:    movl {{[0-9]+}}(%esp), %eax
; FALLBACK23-NEXT:    movl %ebx, 4(%eax)
; FALLBACK23-NEXT:    movl %ebp, 24(%eax)
; FALLBACK23-NEXT:    sarxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
; FALLBACK23-NEXT:    movl %ebx, 28(%eax)
; FALLBACK23-NEXT:    movl %esi, 16(%eax)
; FALLBACK23-NEXT:    movl %edi, 20(%eax)
; FALLBACK23-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; FALLBACK23-NEXT:    movl %esi, 8(%eax)
; FALLBACK23-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; FALLBACK23-NEXT:    movl %esi, 12(%eax)
; FALLBACK23-NEXT:    # kill: def $cl killed $cl killed $ecx
; FALLBACK23-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; FALLBACK23-NEXT:    shrdl %cl, %esi, %edx
; FALLBACK23-NEXT:    movl %edx, (%eax)
; FALLBACK23-NEXT:    addl $108, %esp
; FALLBACK23-NEXT:    popl %esi
; FALLBACK23-NEXT:    popl %edi
; FALLBACK23-NEXT:    popl %ebx
; FALLBACK23-NEXT:    popl %ebp
; FALLBACK23-NEXT:    retl
;
; Expected 32-bit x86 output for ashr_32bytes under this check prefix (the
; matching RUN line is outside this excerpt). Visible shape: the low 16 source
; bytes travel through %xmm0 using VEX-encoded moves (vmovups load, vmovaps
; store) and the upper four dwords through GPRs; the value and eight copies of
; its sign word are written to the stack, and a dword-aligned byte offset
; selects the window read back. No double-shift instruction is used here:
; every result dword below the top one is (lo >> count) | ((hi + hi) << ~count).
; x86 masks 32-bit shift counts to five bits, so the second term equals
; hi << (32 - count) and is zero when count is zero. Offsets written as
; {{...}} are wildcards.
; FALLBACK24-LABEL: ashr_32bytes:
; FALLBACK24:       # %bb.0:
; FALLBACK24-NEXT:    pushl %ebp
; FALLBACK24-NEXT:    pushl %ebx
; FALLBACK24-NEXT:    pushl %edi
; FALLBACK24-NEXT:    pushl %esi
; FALLBACK24-NEXT:    subl $108, %esp
; Two pointers come from the stack: %ecx is read as the 32-byte source, %eax
; as the location of the one-byte shift amount.
; FALLBACK24-NEXT:    movl {{[0-9]+}}(%esp), %eax
; FALLBACK24-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; FALLBACK24-NEXT:    vmovups (%ecx), %xmm0
; FALLBACK24-NEXT:    movl 16(%ecx), %esi
; FALLBACK24-NEXT:    movl 20(%ecx), %edi
; FALLBACK24-NEXT:    movl 24(%ecx), %ebx
; FALLBACK24-NEXT:    movl 28(%ecx), %edx
; FALLBACK24-NEXT:    movzbl (%eax), %eax
; %cl = shift byte * 8 (shlb $3); %al keeps the unscaled byte for the offset
; computed further down.
; FALLBACK24-NEXT:    movl %eax, %ecx
; FALLBACK24-NEXT:    shlb $3, %cl
; FALLBACK24-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK24-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
; FALLBACK24-NEXT:    movl %edi, {{[0-9]+}}(%esp)
; FALLBACK24-NEXT:    movl %esi, {{[0-9]+}}(%esp)
; FALLBACK24-NEXT:    vmovaps %xmm0, {{[0-9]+}}(%esp)
; %edx holds the top source dword (28(%ecx)); sarl $31 turns it into the sign
; word, which is stored to eight stack slots.
; FALLBACK24-NEXT:    sarl $31, %edx
; FALLBACK24-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK24-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK24-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK24-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK24-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK24-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK24-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK24-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; %edi = shift byte & 28, a multiple of four used to index the eight dwords
; read from 32(%esp,%edi) through 60(%esp,%edi).
; FALLBACK24-NEXT:    andb $28, %al
; FALLBACK24-NEXT:    movzbl %al, %edi
; FALLBACK24-NEXT:    movl 32(%esp,%edi), %eax
; FALLBACK24-NEXT:    movl 36(%esp,%edi), %esi
; FALLBACK24-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK24-NEXT:    shrl %cl, %eax
; After the next three instructions %dh holds the bit count and %dl its
; bitwise complement; each is copied into %cl right before the shift using it.
; FALLBACK24-NEXT:    movl %ecx, %edx
; FALLBACK24-NEXT:    movb %cl, %dh
; FALLBACK24-NEXT:    notb %dl
; FALLBACK24-NEXT:    addl %esi, %esi
; FALLBACK24-NEXT:    movl %edx, %ecx
; FALLBACK24-NEXT:    shll %cl, %esi
; FALLBACK24-NEXT:    orl %eax, %esi
; FALLBACK24-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK24-NEXT:    movl 44(%esp,%edi), %ebx
; FALLBACK24-NEXT:    movl %ebx, %eax
; FALLBACK24-NEXT:    movb %dh, %cl
; FALLBACK24-NEXT:    shrl %cl, %eax
; FALLBACK24-NEXT:    movl 48(%esp,%edi), %esi
; FALLBACK24-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK24-NEXT:    addl %esi, %esi
; FALLBACK24-NEXT:    movl %edx, %ecx
; FALLBACK24-NEXT:    shll %cl, %esi
; FALLBACK24-NEXT:    orl %eax, %esi
; FALLBACK24-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK24-NEXT:    movl 40(%esp,%edi), %esi
; FALLBACK24-NEXT:    movl %esi, %eax
; FALLBACK24-NEXT:    movb %dh, %cl
; FALLBACK24-NEXT:    shrl %cl, %eax
; FALLBACK24-NEXT:    addl %ebx, %ebx
; FALLBACK24-NEXT:    movl %edx, %ecx
; FALLBACK24-NEXT:    shll %cl, %ebx
; FALLBACK24-NEXT:    orl %eax, %ebx
; FALLBACK24-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK24-NEXT:    movl 52(%esp,%edi), %ebp
; FALLBACK24-NEXT:    movl %ebp, %eax
; FALLBACK24-NEXT:    movb %dh, %cl
; FALLBACK24-NEXT:    shrl %cl, %eax
; FALLBACK24-NEXT:    movl 56(%esp,%edi), %ecx
; FALLBACK24-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK24-NEXT:    leal (%ecx,%ecx), %ebx
; FALLBACK24-NEXT:    movl %edx, %ecx
; FALLBACK24-NEXT:    shll %cl, %ebx
; FALLBACK24-NEXT:    orl %eax, %ebx
; FALLBACK24-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK24-NEXT:    movb %dh, %cl
; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK24-NEXT:    shrl %cl, %eax
; FALLBACK24-NEXT:    addl %ebp, %ebp
; FALLBACK24-NEXT:    movl %edx, %ecx
; FALLBACK24-NEXT:    shll %cl, %ebp
; FALLBACK24-NEXT:    orl %eax, %ebp
; FALLBACK24-NEXT:    movb %dh, %cl
; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
; FALLBACK24-NEXT:    shrl %cl, %ebx
; FALLBACK24-NEXT:    movl 60(%esp,%edi), %eax
; FALLBACK24-NEXT:    leal (%eax,%eax), %edi
; FALLBACK24-NEXT:    movl %edx, %ecx
; FALLBACK24-NEXT:    shll %cl, %edi
; FALLBACK24-NEXT:    orl %ebx, %edi
; FALLBACK24-NEXT:    movb %dh, %cl
; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
; FALLBACK24-NEXT:    shrl %cl, %ebx
; FALLBACK24-NEXT:    addl %esi, %esi
; FALLBACK24-NEXT:    movl %edx, %ecx
; FALLBACK24-NEXT:    shll %cl, %esi
; FALLBACK24-NEXT:    orl %ebx, %esi
; The top dword (loaded from 60(%esp,%edi)) gets an arithmetic shift, then
; %ecx is reloaded with the pointer the eight result dwords are stored through.
; FALLBACK24-NEXT:    movb %dh, %cl
; FALLBACK24-NEXT:    sarl %cl, %eax
; FALLBACK24-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; FALLBACK24-NEXT:    movl %eax, 28(%ecx)
; FALLBACK24-NEXT:    movl %esi, 4(%ecx)
; FALLBACK24-NEXT:    movl %edi, 24(%ecx)
; FALLBACK24-NEXT:    movl %ebp, 16(%ecx)
; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK24-NEXT:    movl %eax, 20(%ecx)
; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK24-NEXT:    movl %eax, 8(%ecx)
; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK24-NEXT:    movl %eax, 12(%ecx)
; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK24-NEXT:    movl %eax, (%ecx)
; FALLBACK24-NEXT:    addl $108, %esp
; FALLBACK24-NEXT:    popl %esi
; FALLBACK24-NEXT:    popl %edi
; FALLBACK24-NEXT:    popl %ebx
; FALLBACK24-NEXT:    popl %ebp
; FALLBACK24-NEXT:    retl
;
; Expected 32-bit x86 output for ashr_32bytes under this check prefix (the
; matching RUN line is outside this excerpt). Visible shape: the low 16 source
; bytes travel through %xmm0 using VEX-encoded moves (vmovups load, vmovaps
; store) and the upper four dwords through GPRs; the value and eight copies of
; its sign word are written to the stack, a dword-aligned byte offset selects
; the window read back, adjacent dwords are combined with shrdl, and the top
; dword is shifted with sarl. Offsets written as {{...}} are wildcards.
; FALLBACK25-LABEL: ashr_32bytes:
; FALLBACK25:       # %bb.0:
; FALLBACK25-NEXT:    pushl %ebp
; FALLBACK25-NEXT:    pushl %ebx
; FALLBACK25-NEXT:    pushl %edi
; FALLBACK25-NEXT:    pushl %esi
; FALLBACK25-NEXT:    subl $108, %esp
; Two pointers come from the stack: %ecx is read as the 32-byte source, %eax
; as the location of the one-byte shift amount.
; FALLBACK25-NEXT:    movl {{[0-9]+}}(%esp), %eax
; FALLBACK25-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; FALLBACK25-NEXT:    vmovups (%ecx), %xmm0
; FALLBACK25-NEXT:    movl 16(%ecx), %esi
; FALLBACK25-NEXT:    movl 20(%ecx), %edi
; FALLBACK25-NEXT:    movl 24(%ecx), %ebx
; FALLBACK25-NEXT:    movl 28(%ecx), %edx
; FALLBACK25-NEXT:    movzbl (%eax), %eax
; %cl = shift byte * 8 (shlb $3); %al keeps the unscaled byte for the offset
; computed further down.
; FALLBACK25-NEXT:    movl %eax, %ecx
; FALLBACK25-NEXT:    shlb $3, %cl
; FALLBACK25-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK25-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
; FALLBACK25-NEXT:    movl %edi, {{[0-9]+}}(%esp)
; FALLBACK25-NEXT:    movl %esi, {{[0-9]+}}(%esp)
; FALLBACK25-NEXT:    vmovaps %xmm0, {{[0-9]+}}(%esp)
; %edx holds the top source dword (28(%ecx)); sarl $31 turns it into the sign
; word, which is stored to eight stack slots.
; FALLBACK25-NEXT:    sarl $31, %edx
; FALLBACK25-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK25-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK25-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK25-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK25-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK25-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK25-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK25-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; %ebp = shift byte & 28, a multiple of four used to index the eight dwords
; read from 32(%esp,%ebp) through 60(%esp,%ebp).
; FALLBACK25-NEXT:    andb $28, %al
; FALLBACK25-NEXT:    movzbl %al, %ebp
; FALLBACK25-NEXT:    movl 48(%esp,%ebp), %esi
; FALLBACK25-NEXT:    movl 44(%esp,%ebp), %eax
; FALLBACK25-NEXT:    movl %eax, %edx
; FALLBACK25-NEXT:    shrdl %cl, %esi, %edx
; FALLBACK25-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK25-NEXT:    movl 40(%esp,%ebp), %edx
; FALLBACK25-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK25-NEXT:    shrdl %cl, %eax, %edx
; FALLBACK25-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK25-NEXT:    movl 56(%esp,%ebp), %ebx
; FALLBACK25-NEXT:    movl 52(%esp,%ebp), %eax
; FALLBACK25-NEXT:    movl %eax, %edx
; FALLBACK25-NEXT:    shrdl %cl, %ebx, %edx
; FALLBACK25-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK25-NEXT:    shrdl %cl, %eax, %esi
; FALLBACK25-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK25-NEXT:    movl 60(%esp,%ebp), %eax
; FALLBACK25-NEXT:    shrdl %cl, %eax, %ebx
; FALLBACK25-NEXT:    movl 32(%esp,%ebp), %edx
; FALLBACK25-NEXT:    movl 36(%esp,%ebp), %edi
; FALLBACK25-NEXT:    movl %edi, %esi
; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
; FALLBACK25-NEXT:    shrdl %cl, %ebp, %esi
; %ebp is reloaded with the pointer the eight result dwords are stored
; through; the top dword (offset 28) comes from sarl.
; FALLBACK25-NEXT:    movl {{[0-9]+}}(%esp), %ebp
; FALLBACK25-NEXT:    movl %esi, 4(%ebp)
; FALLBACK25-NEXT:    movl %ebx, 24(%ebp)
; FALLBACK25-NEXT:    shrdl %cl, %edi, %edx
; FALLBACK25-NEXT:    sarl %cl, %eax
; FALLBACK25-NEXT:    movl %eax, 28(%ebp)
; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK25-NEXT:    movl %eax, 16(%ebp)
; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK25-NEXT:    movl %eax, 20(%ebp)
; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK25-NEXT:    movl %eax, 8(%ebp)
; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK25-NEXT:    movl %eax, 12(%ebp)
; FALLBACK25-NEXT:    movl %edx, (%ebp)
; FALLBACK25-NEXT:    addl $108, %esp
; FALLBACK25-NEXT:    popl %esi
; FALLBACK25-NEXT:    popl %edi
; FALLBACK25-NEXT:    popl %ebx
; FALLBACK25-NEXT:    popl %ebp
; FALLBACK25-NEXT:    retl
;
; Expected 32-bit x86 output for ashr_32bytes under this check prefix (the
; matching RUN line is outside this excerpt). Visible shape: the low 16 source
; bytes travel through %xmm0 using VEX-encoded moves (vmovups load, vmovaps
; store) and the upper four dwords through GPRs; the value and eight copies of
; its sign word are written to the stack, and a dword-aligned byte offset
; selects the window read back. Shifts use the three-operand
; shrxl/shlxl/sarxl forms: every result dword below the top one is
; (lo >> count) | ((hi + hi) << ~count), and the top dword comes from sarxl.
; Offsets written as {{...}} are wildcards.
; FALLBACK26-LABEL: ashr_32bytes:
; FALLBACK26:       # %bb.0:
; FALLBACK26-NEXT:    pushl %ebp
; FALLBACK26-NEXT:    pushl %ebx
; FALLBACK26-NEXT:    pushl %edi
; FALLBACK26-NEXT:    pushl %esi
; FALLBACK26-NEXT:    subl $108, %esp
; Two pointers come from the stack: %ecx is read as the 32-byte source, %eax
; as the location of the one-byte shift amount.
; FALLBACK26-NEXT:    movl {{[0-9]+}}(%esp), %eax
; FALLBACK26-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; FALLBACK26-NEXT:    vmovups (%ecx), %xmm0
; FALLBACK26-NEXT:    movl 16(%ecx), %esi
; FALLBACK26-NEXT:    movl 20(%ecx), %edi
; FALLBACK26-NEXT:    movl 24(%ecx), %ebx
; FALLBACK26-NEXT:    movl 28(%ecx), %edx
; Here the roles are swapped relative to the %cl-based variants: %al becomes
; shift byte * 8 (shlb $3) and %cl keeps the unscaled byte for the offset.
; FALLBACK26-NEXT:    movzbl (%eax), %ecx
; FALLBACK26-NEXT:    movl %ecx, %eax
; FALLBACK26-NEXT:    shlb $3, %al
; FALLBACK26-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK26-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
; FALLBACK26-NEXT:    movl %edi, {{[0-9]+}}(%esp)
; FALLBACK26-NEXT:    movl %esi, {{[0-9]+}}(%esp)
; FALLBACK26-NEXT:    vmovaps %xmm0, {{[0-9]+}}(%esp)
; %edx holds the top source dword (28(%ecx)); sarl $31 turns it into the sign
; word, which is stored to eight stack slots.
; FALLBACK26-NEXT:    sarl $31, %edx
; FALLBACK26-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK26-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK26-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK26-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK26-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK26-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK26-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK26-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; %edi = shift byte & 28, a multiple of four used to index the eight dwords
; read from 32(%esp,%edi) through 60(%esp,%edi).
; FALLBACK26-NEXT:    andb $28, %cl
; FALLBACK26-NEXT:    movzbl %cl, %edi
; FALLBACK26-NEXT:    shrxl %eax, 32(%esp,%edi), %ecx
; %edx = copy of the bit count with its low byte complemented (notb %dl); it
; is the count operand of every shlxl below.
; FALLBACK26-NEXT:    movl %eax, %edx
; FALLBACK26-NEXT:    notb %dl
; FALLBACK26-NEXT:    movl 36(%esp,%edi), %esi
; FALLBACK26-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK26-NEXT:    addl %esi, %esi
; FALLBACK26-NEXT:    shlxl %edx, %esi, %esi
; FALLBACK26-NEXT:    orl %ecx, %esi
; FALLBACK26-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK26-NEXT:    movl 48(%esp,%edi), %ecx
; FALLBACK26-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK26-NEXT:    addl %ecx, %ecx
; FALLBACK26-NEXT:    shlxl %edx, %ecx, %esi
; FALLBACK26-NEXT:    movl 44(%esp,%edi), %ecx
; FALLBACK26-NEXT:    shrxl %eax, %ecx, %ebx
; FALLBACK26-NEXT:    orl %ebx, %esi
; FALLBACK26-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK26-NEXT:    addl %ecx, %ecx
; FALLBACK26-NEXT:    shlxl %edx, %ecx, %esi
; FALLBACK26-NEXT:    movl 40(%esp,%edi), %ecx
; FALLBACK26-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK26-NEXT:    shrxl %eax, %ecx, %ebx
; The bit count is copied into %ecx because %eax is reused as a temporary
; below; it is copied back into %eax a few instructions later.
; FALLBACK26-NEXT:    movl %eax, %ecx
; FALLBACK26-NEXT:    orl %ebx, %esi
; FALLBACK26-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK26-NEXT:    movl 56(%esp,%edi), %esi
; FALLBACK26-NEXT:    leal (%esi,%esi), %ebx
; FALLBACK26-NEXT:    shlxl %edx, %ebx, %eax
; FALLBACK26-NEXT:    movl 52(%esp,%edi), %ebx
; FALLBACK26-NEXT:    shrxl %ecx, %ebx, %ebp
; FALLBACK26-NEXT:    orl %ebp, %eax
; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK26-NEXT:    movl %ecx, %eax
; FALLBACK26-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
; FALLBACK26-NEXT:    addl %ebx, %ebx
; FALLBACK26-NEXT:    shlxl %edx, %ebx, %ebx
; FALLBACK26-NEXT:    orl %ebp, %ebx
; FALLBACK26-NEXT:    shrxl %ecx, %esi, %ecx
; FALLBACK26-NEXT:    shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
; The top dword (60(%esp,%edi)) is shifted arithmetically with sarxl before
; its doubled copy feeds the dword below it.
; FALLBACK26-NEXT:    movl 60(%esp,%edi), %edi
; FALLBACK26-NEXT:    sarxl %eax, %edi, %eax
; FALLBACK26-NEXT:    addl %edi, %edi
; FALLBACK26-NEXT:    shlxl %edx, %edi, %edi
; FALLBACK26-NEXT:    orl %ecx, %edi
; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK26-NEXT:    addl %ecx, %ecx
; FALLBACK26-NEXT:    shlxl %edx, %ecx, %ecx
; FALLBACK26-NEXT:    orl %esi, %ecx
; %edx is reloaded with the pointer the eight result dwords are stored through.
; FALLBACK26-NEXT:    movl {{[0-9]+}}(%esp), %edx
; FALLBACK26-NEXT:    movl %eax, 28(%edx)
; FALLBACK26-NEXT:    movl %ecx, 4(%edx)
; FALLBACK26-NEXT:    movl %edi, 24(%edx)
; FALLBACK26-NEXT:    movl %ebx, 16(%edx)
; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK26-NEXT:    movl %eax, 20(%edx)
; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK26-NEXT:    movl %eax, 8(%edx)
; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK26-NEXT:    movl %eax, 12(%edx)
; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK26-NEXT:    movl %eax, (%edx)
; FALLBACK26-NEXT:    addl $108, %esp
; FALLBACK26-NEXT:    popl %esi
; FALLBACK26-NEXT:    popl %edi
; FALLBACK26-NEXT:    popl %ebx
; FALLBACK26-NEXT:    popl %ebp
; FALLBACK26-NEXT:    retl
;
; FALLBACK27-LABEL: ashr_32bytes:
; FALLBACK27:       # %bb.0:
; FALLBACK27-NEXT:    pushl %ebp
; FALLBACK27-NEXT:    pushl %ebx
; FALLBACK27-NEXT:    pushl %edi
; FALLBACK27-NEXT:    pushl %esi
; FALLBACK27-NEXT:    subl $108, %esp
; FALLBACK27-NEXT:    movl {{[0-9]+}}(%esp), %eax
; FALLBACK27-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; FALLBACK27-NEXT:    vmovups (%ecx), %xmm0
; FALLBACK27-NEXT:    movl 16(%ecx), %esi
; FALLBACK27-NEXT:    movl 20(%ecx), %edi
; FALLBACK27-NEXT:    movl 24(%ecx), %ebx
; FALLBACK27-NEXT:    movl 28(%ecx), %edx
; FALLBACK27-NEXT:    movzbl (%eax), %eax
; FALLBACK27-NEXT:    movl %eax, %ecx
; FALLBACK27-NEXT:    shlb $3, %cl
; FALLBACK27-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK27-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
; FALLBACK27-NEXT:    movl %edi, {{[0-9]+}}(%esp)
; FALLBACK27-NEXT:    movl %esi, {{[0-9]+}}(%esp)
; FALLBACK27-NEXT:    vmovaps %xmm0, {{[0-9]+}}(%esp)
; FALLBACK27-NEXT:    sarl $31, %edx
; FALLBACK27-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK27-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK27-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK27-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK27-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK27-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK27-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK27-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK27-NEXT:    andb $28, %al
; FALLBACK27-NEXT:    movzbl %al, %ebx
; FALLBACK27-NEXT:    movl 48(%esp,%ebx), %esi
; FALLBACK27-NEXT:    movl 44(%esp,%ebx), %eax
; FALLBACK27-NEXT:    movl %eax, %edx
; FALLBACK27-NEXT:    shrdl %cl, %esi, %edx
; FALLBACK27-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK27-NEXT:    movl 40(%esp,%ebx), %edx
; FALLBACK27-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK27-NEXT:    shrdl %cl, %eax, %edx
; FALLBACK27-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK27-NEXT:    movl 56(%esp,%ebx), %ebp
; FALLBACK27-NEXT:    movl 52(%esp,%ebx), %eax
; FALLBACK27-NEXT:    movl %eax, %edi
; FALLBACK27-NEXT:    shrdl %cl, %ebp, %edi
; FALLBACK27-NEXT:    shrdl %cl, %eax, %esi
; FALLBACK27-NEXT:    movl 60(%esp,%ebx), %eax
; FALLBACK27-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK27-NEXT:    shrdl %cl, %eax, %ebp
; FALLBACK27-NEXT:    movl 32(%esp,%ebx), %edx
; FALLBACK27-NEXT:    movl 36(%esp,%ebx), %ebx
; FALLBACK27-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK27-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK27-NEXT:    shrdl %cl, %eax, %ebx
; FALLBACK27-NEXT:    movl {{[0-9]+}}(%esp), %eax
; FALLBACK27-NEXT:    movl %ebx, 4(%eax)
; FALLBACK27-NEXT:    movl %ebp, 24(%eax)
; FALLBACK27-NEXT:    sarxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
; FALLBACK27-NEXT:    movl %ebx, 28(%eax)
; FALLBACK27-NEXT:    movl %esi, 16(%eax)
; FALLBACK27-NEXT:    movl %edi, 20(%eax)
; FALLBACK27-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; FALLBACK27-NEXT:    movl %esi, 8(%eax)
; FALLBACK27-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; FALLBACK27-NEXT:    movl %esi, 12(%eax)
; FALLBACK27-NEXT:    # kill: def $cl killed $cl killed $ecx
; FALLBACK27-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; FALLBACK27-NEXT:    shrdl %cl, %esi, %edx
; FALLBACK27-NEXT:    movl %edx, (%eax)
; FALLBACK27-NEXT:    addl $108, %esp
; FALLBACK27-NEXT:    popl %esi
; FALLBACK27-NEXT:    popl %edi
; FALLBACK27-NEXT:    popl %ebx
; FALLBACK27-NEXT:    popl %ebp
; FALLBACK27-NEXT:    retl
;
; FALLBACK28-LABEL: ashr_32bytes:
; FALLBACK28:       # %bb.0:
; FALLBACK28-NEXT:    pushl %ebp
; FALLBACK28-NEXT:    pushl %ebx
; FALLBACK28-NEXT:    pushl %edi
; FALLBACK28-NEXT:    pushl %esi
; FALLBACK28-NEXT:    subl $108, %esp
; FALLBACK28-NEXT:    movl {{[0-9]+}}(%esp), %eax
; FALLBACK28-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; FALLBACK28-NEXT:    vmovups (%ecx), %xmm0
; FALLBACK28-NEXT:    movl 16(%ecx), %esi
; FALLBACK28-NEXT:    movl 20(%ecx), %edi
; FALLBACK28-NEXT:    movl 24(%ecx), %ebx
; FALLBACK28-NEXT:    movl 28(%ecx), %edx
; FALLBACK28-NEXT:    movzbl (%eax), %eax
; FALLBACK28-NEXT:    movl %eax, %ecx
; FALLBACK28-NEXT:    shlb $3, %cl
; FALLBACK28-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK28-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
; FALLBACK28-NEXT:    movl %edi, {{[0-9]+}}(%esp)
; FALLBACK28-NEXT:    movl %esi, {{[0-9]+}}(%esp)
; FALLBACK28-NEXT:    vmovaps %xmm0, {{[0-9]+}}(%esp)
; FALLBACK28-NEXT:    sarl $31, %edx
; FALLBACK28-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK28-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK28-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK28-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK28-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK28-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK28-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK28-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK28-NEXT:    andb $28, %al
; FALLBACK28-NEXT:    movzbl %al, %edi
; FALLBACK28-NEXT:    movl 32(%esp,%edi), %eax
; FALLBACK28-NEXT:    movl 36(%esp,%edi), %esi
; FALLBACK28-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK28-NEXT:    shrl %cl, %eax
; FALLBACK28-NEXT:    movl %ecx, %edx
; FALLBACK28-NEXT:    movb %cl, %dh
; FALLBACK28-NEXT:    notb %dl
; FALLBACK28-NEXT:    addl %esi, %esi
; FALLBACK28-NEXT:    movl %edx, %ecx
; FALLBACK28-NEXT:    shll %cl, %esi
; FALLBACK28-NEXT:    orl %eax, %esi
; FALLBACK28-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK28-NEXT:    movl 44(%esp,%edi), %ebx
; FALLBACK28-NEXT:    movl %ebx, %eax
; FALLBACK28-NEXT:    movb %dh, %cl
; FALLBACK28-NEXT:    shrl %cl, %eax
; FALLBACK28-NEXT:    movl 48(%esp,%edi), %esi
; FALLBACK28-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK28-NEXT:    addl %esi, %esi
; FALLBACK28-NEXT:    movl %edx, %ecx
; FALLBACK28-NEXT:    shll %cl, %esi
; FALLBACK28-NEXT:    orl %eax, %esi
; FALLBACK28-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK28-NEXT:    movl 40(%esp,%edi), %esi
; FALLBACK28-NEXT:    movl %esi, %eax
; FALLBACK28-NEXT:    movb %dh, %cl
; FALLBACK28-NEXT:    shrl %cl, %eax
; FALLBACK28-NEXT:    addl %ebx, %ebx
; FALLBACK28-NEXT:    movl %edx, %ecx
; FALLBACK28-NEXT:    shll %cl, %ebx
; FALLBACK28-NEXT:    orl %eax, %ebx
; FALLBACK28-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK28-NEXT:    movl 52(%esp,%edi), %ebp
; FALLBACK28-NEXT:    movl %ebp, %eax
; FALLBACK28-NEXT:    movb %dh, %cl
; FALLBACK28-NEXT:    shrl %cl, %eax
; FALLBACK28-NEXT:    movl 56(%esp,%edi), %ecx
; FALLBACK28-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK28-NEXT:    leal (%ecx,%ecx), %ebx
; FALLBACK28-NEXT:    movl %edx, %ecx
; FALLBACK28-NEXT:    shll %cl, %ebx
; FALLBACK28-NEXT:    orl %eax, %ebx
; FALLBACK28-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK28-NEXT:    movb %dh, %cl
; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK28-NEXT:    shrl %cl, %eax
; FALLBACK28-NEXT:    addl %ebp, %ebp
; FALLBACK28-NEXT:    movl %edx, %ecx
; FALLBACK28-NEXT:    shll %cl, %ebp
; FALLBACK28-NEXT:    orl %eax, %ebp
; FALLBACK28-NEXT:    movb %dh, %cl
; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
; FALLBACK28-NEXT:    shrl %cl, %ebx
; FALLBACK28-NEXT:    movl 60(%esp,%edi), %eax
; FALLBACK28-NEXT:    leal (%eax,%eax), %edi
; FALLBACK28-NEXT:    movl %edx, %ecx
; FALLBACK28-NEXT:    shll %cl, %edi
; FALLBACK28-NEXT:    orl %ebx, %edi
; FALLBACK28-NEXT:    movb %dh, %cl
; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
; FALLBACK28-NEXT:    shrl %cl, %ebx
; FALLBACK28-NEXT:    addl %esi, %esi
; FALLBACK28-NEXT:    movl %edx, %ecx
; FALLBACK28-NEXT:    shll %cl, %esi
; FALLBACK28-NEXT:    orl %ebx, %esi
; FALLBACK28-NEXT:    movb %dh, %cl
; FALLBACK28-NEXT:    sarl %cl, %eax
; FALLBACK28-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; FALLBACK28-NEXT:    movl %eax, 28(%ecx)
; FALLBACK28-NEXT:    movl %esi, 4(%ecx)
; FALLBACK28-NEXT:    movl %edi, 24(%ecx)
; FALLBACK28-NEXT:    movl %ebp, 16(%ecx)
; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK28-NEXT:    movl %eax, 20(%ecx)
; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK28-NEXT:    movl %eax, 8(%ecx)
; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK28-NEXT:    movl %eax, 12(%ecx)
; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK28-NEXT:    movl %eax, (%ecx)
; FALLBACK28-NEXT:    addl $108, %esp
; FALLBACK28-NEXT:    popl %esi
; FALLBACK28-NEXT:    popl %edi
; FALLBACK28-NEXT:    popl %ebx
; FALLBACK28-NEXT:    popl %ebp
; FALLBACK28-NEXT:    retl
;
; FALLBACK29-LABEL: ashr_32bytes:
; FALLBACK29:       # %bb.0:
; FALLBACK29-NEXT:    pushl %ebp
; FALLBACK29-NEXT:    pushl %ebx
; FALLBACK29-NEXT:    pushl %edi
; FALLBACK29-NEXT:    pushl %esi
; FALLBACK29-NEXT:    subl $108, %esp
; FALLBACK29-NEXT:    movl {{[0-9]+}}(%esp), %eax
; FALLBACK29-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; FALLBACK29-NEXT:    vmovups (%ecx), %xmm0
; FALLBACK29-NEXT:    movl 16(%ecx), %esi
; FALLBACK29-NEXT:    movl 20(%ecx), %edi
; FALLBACK29-NEXT:    movl 24(%ecx), %ebx
; FALLBACK29-NEXT:    movl 28(%ecx), %edx
; FALLBACK29-NEXT:    movzbl (%eax), %eax
; FALLBACK29-NEXT:    movl %eax, %ecx
; FALLBACK29-NEXT:    shlb $3, %cl
; FALLBACK29-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK29-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
; FALLBACK29-NEXT:    movl %edi, {{[0-9]+}}(%esp)
; FALLBACK29-NEXT:    movl %esi, {{[0-9]+}}(%esp)
; FALLBACK29-NEXT:    vmovaps %xmm0, {{[0-9]+}}(%esp)
; FALLBACK29-NEXT:    sarl $31, %edx
; FALLBACK29-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK29-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK29-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK29-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK29-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK29-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK29-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK29-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK29-NEXT:    andb $28, %al
; FALLBACK29-NEXT:    movzbl %al, %ebp
; FALLBACK29-NEXT:    movl 48(%esp,%ebp), %esi
; FALLBACK29-NEXT:    movl 44(%esp,%ebp), %eax
; FALLBACK29-NEXT:    movl %eax, %edx
; FALLBACK29-NEXT:    shrdl %cl, %esi, %edx
; FALLBACK29-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK29-NEXT:    movl 40(%esp,%ebp), %edx
; FALLBACK29-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK29-NEXT:    shrdl %cl, %eax, %edx
; FALLBACK29-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK29-NEXT:    movl 56(%esp,%ebp), %ebx
; FALLBACK29-NEXT:    movl 52(%esp,%ebp), %eax
; FALLBACK29-NEXT:    movl %eax, %edx
; FALLBACK29-NEXT:    shrdl %cl, %ebx, %edx
; FALLBACK29-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK29-NEXT:    shrdl %cl, %eax, %esi
; FALLBACK29-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK29-NEXT:    movl 60(%esp,%ebp), %eax
; FALLBACK29-NEXT:    shrdl %cl, %eax, %ebx
; FALLBACK29-NEXT:    movl 32(%esp,%ebp), %edx
; FALLBACK29-NEXT:    movl 36(%esp,%ebp), %edi
; FALLBACK29-NEXT:    movl %edi, %esi
; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
; FALLBACK29-NEXT:    shrdl %cl, %ebp, %esi
; FALLBACK29-NEXT:    movl {{[0-9]+}}(%esp), %ebp
; FALLBACK29-NEXT:    movl %esi, 4(%ebp)
; FALLBACK29-NEXT:    movl %ebx, 24(%ebp)
; FALLBACK29-NEXT:    shrdl %cl, %edi, %edx
; FALLBACK29-NEXT:    sarl %cl, %eax
; FALLBACK29-NEXT:    movl %eax, 28(%ebp)
; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK29-NEXT:    movl %eax, 16(%ebp)
; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK29-NEXT:    movl %eax, 20(%ebp)
; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK29-NEXT:    movl %eax, 8(%ebp)
; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK29-NEXT:    movl %eax, 12(%ebp)
; FALLBACK29-NEXT:    movl %edx, (%ebp)
; FALLBACK29-NEXT:    addl $108, %esp
; FALLBACK29-NEXT:    popl %esi
; FALLBACK29-NEXT:    popl %edi
; FALLBACK29-NEXT:    popl %ebx
; FALLBACK29-NEXT:    popl %ebp
; FALLBACK29-NEXT:    retl
;
; FALLBACK30-LABEL: ashr_32bytes:
; FALLBACK30:       # %bb.0:
; FALLBACK30-NEXT:    pushl %ebp
; FALLBACK30-NEXT:    pushl %ebx
; FALLBACK30-NEXT:    pushl %edi
; FALLBACK30-NEXT:    pushl %esi
; FALLBACK30-NEXT:    subl $108, %esp
; FALLBACK30-NEXT:    movl {{[0-9]+}}(%esp), %eax
; FALLBACK30-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; FALLBACK30-NEXT:    vmovups (%ecx), %xmm0
; FALLBACK30-NEXT:    movl 16(%ecx), %esi
; FALLBACK30-NEXT:    movl 20(%ecx), %edi
; FALLBACK30-NEXT:    movl 24(%ecx), %ebx
; FALLBACK30-NEXT:    movl 28(%ecx), %edx
; FALLBACK30-NEXT:    movzbl (%eax), %ecx
; FALLBACK30-NEXT:    movl %ecx, %eax
; FALLBACK30-NEXT:    shlb $3, %al
; FALLBACK30-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK30-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
; FALLBACK30-NEXT:    movl %edi, {{[0-9]+}}(%esp)
; FALLBACK30-NEXT:    movl %esi, {{[0-9]+}}(%esp)
; FALLBACK30-NEXT:    vmovaps %xmm0, {{[0-9]+}}(%esp)
; FALLBACK30-NEXT:    sarl $31, %edx
; FALLBACK30-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK30-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK30-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK30-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK30-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK30-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK30-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK30-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK30-NEXT:    andb $28, %cl
; FALLBACK30-NEXT:    movzbl %cl, %edi
; FALLBACK30-NEXT:    shrxl %eax, 32(%esp,%edi), %ecx
; FALLBACK30-NEXT:    movl %eax, %edx
; FALLBACK30-NEXT:    notb %dl
; FALLBACK30-NEXT:    movl 36(%esp,%edi), %esi
; FALLBACK30-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK30-NEXT:    addl %esi, %esi
; FALLBACK30-NEXT:    shlxl %edx, %esi, %esi
; FALLBACK30-NEXT:    orl %ecx, %esi
; FALLBACK30-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK30-NEXT:    movl 48(%esp,%edi), %ecx
; FALLBACK30-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK30-NEXT:    addl %ecx, %ecx
; FALLBACK30-NEXT:    shlxl %edx, %ecx, %esi
; FALLBACK30-NEXT:    movl 44(%esp,%edi), %ecx
; FALLBACK30-NEXT:    shrxl %eax, %ecx, %ebx
; FALLBACK30-NEXT:    orl %ebx, %esi
; FALLBACK30-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK30-NEXT:    addl %ecx, %ecx
; FALLBACK30-NEXT:    shlxl %edx, %ecx, %esi
; FALLBACK30-NEXT:    movl 40(%esp,%edi), %ecx
; FALLBACK30-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK30-NEXT:    shrxl %eax, %ecx, %ebx
; FALLBACK30-NEXT:    movl %eax, %ecx
; FALLBACK30-NEXT:    orl %ebx, %esi
; FALLBACK30-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK30-NEXT:    movl 56(%esp,%edi), %esi
; FALLBACK30-NEXT:    leal (%esi,%esi), %ebx
; FALLBACK30-NEXT:    shlxl %edx, %ebx, %eax
; FALLBACK30-NEXT:    movl 52(%esp,%edi), %ebx
; FALLBACK30-NEXT:    shrxl %ecx, %ebx, %ebp
; FALLBACK30-NEXT:    orl %ebp, %eax
; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK30-NEXT:    movl %ecx, %eax
; FALLBACK30-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
; FALLBACK30-NEXT:    addl %ebx, %ebx
; FALLBACK30-NEXT:    shlxl %edx, %ebx, %ebx
; FALLBACK30-NEXT:    orl %ebp, %ebx
; FALLBACK30-NEXT:    shrxl %ecx, %esi, %ecx
; FALLBACK30-NEXT:    shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
; FALLBACK30-NEXT:    movl 60(%esp,%edi), %edi
; FALLBACK30-NEXT:    sarxl %eax, %edi, %eax
; FALLBACK30-NEXT:    addl %edi, %edi
; FALLBACK30-NEXT:    shlxl %edx, %edi, %edi
; FALLBACK30-NEXT:    orl %ecx, %edi
; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK30-NEXT:    addl %ecx, %ecx
; FALLBACK30-NEXT:    shlxl %edx, %ecx, %ecx
; FALLBACK30-NEXT:    orl %esi, %ecx
; FALLBACK30-NEXT:    movl {{[0-9]+}}(%esp), %edx
; FALLBACK30-NEXT:    movl %eax, 28(%edx)
; FALLBACK30-NEXT:    movl %ecx, 4(%edx)
; FALLBACK30-NEXT:    movl %edi, 24(%edx)
; FALLBACK30-NEXT:    movl %ebx, 16(%edx)
; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK30-NEXT:    movl %eax, 20(%edx)
; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK30-NEXT:    movl %eax, 8(%edx)
; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK30-NEXT:    movl %eax, 12(%edx)
; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK30-NEXT:    movl %eax, (%edx)
; FALLBACK30-NEXT:    addl $108, %esp
; FALLBACK30-NEXT:    popl %esi
; FALLBACK30-NEXT:    popl %edi
; FALLBACK30-NEXT:    popl %ebx
; FALLBACK30-NEXT:    popl %ebp
; FALLBACK30-NEXT:    retl
;
; FALLBACK31-LABEL: ashr_32bytes:
; FALLBACK31:       # %bb.0:
; FALLBACK31-NEXT:    pushl %ebp
; FALLBACK31-NEXT:    pushl %ebx
; FALLBACK31-NEXT:    pushl %edi
; FALLBACK31-NEXT:    pushl %esi
; FALLBACK31-NEXT:    subl $108, %esp
; FALLBACK31-NEXT:    movl {{[0-9]+}}(%esp), %eax
; FALLBACK31-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; FALLBACK31-NEXT:    vmovups (%ecx), %xmm0
; FALLBACK31-NEXT:    movl 16(%ecx), %esi
; FALLBACK31-NEXT:    movl 20(%ecx), %edi
; FALLBACK31-NEXT:    movl 24(%ecx), %ebx
; FALLBACK31-NEXT:    movl 28(%ecx), %edx
; FALLBACK31-NEXT:    movzbl (%eax), %eax
; FALLBACK31-NEXT:    movl %eax, %ecx
; FALLBACK31-NEXT:    shlb $3, %cl
; FALLBACK31-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK31-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
; FALLBACK31-NEXT:    movl %edi, {{[0-9]+}}(%esp)
; FALLBACK31-NEXT:    movl %esi, {{[0-9]+}}(%esp)
; FALLBACK31-NEXT:    vmovaps %xmm0, {{[0-9]+}}(%esp)
; FALLBACK31-NEXT:    sarl $31, %edx
; FALLBACK31-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK31-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK31-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK31-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK31-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK31-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK31-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK31-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK31-NEXT:    andb $28, %al
; FALLBACK31-NEXT:    movzbl %al, %ebx
; FALLBACK31-NEXT:    movl 48(%esp,%ebx), %esi
; FALLBACK31-NEXT:    movl 44(%esp,%ebx), %eax
; FALLBACK31-NEXT:    movl %eax, %edx
; FALLBACK31-NEXT:    shrdl %cl, %esi, %edx
; FALLBACK31-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK31-NEXT:    movl 40(%esp,%ebx), %edx
; FALLBACK31-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK31-NEXT:    shrdl %cl, %eax, %edx
; FALLBACK31-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK31-NEXT:    movl 56(%esp,%ebx), %ebp
; FALLBACK31-NEXT:    movl 52(%esp,%ebx), %eax
; FALLBACK31-NEXT:    movl %eax, %edi
; FALLBACK31-NEXT:    shrdl %cl, %ebp, %edi
; FALLBACK31-NEXT:    shrdl %cl, %eax, %esi
; FALLBACK31-NEXT:    movl 60(%esp,%ebx), %eax
; FALLBACK31-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK31-NEXT:    shrdl %cl, %eax, %ebp
; FALLBACK31-NEXT:    movl 32(%esp,%ebx), %edx
; FALLBACK31-NEXT:    movl 36(%esp,%ebx), %ebx
; FALLBACK31-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK31-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK31-NEXT:    shrdl %cl, %eax, %ebx
; FALLBACK31-NEXT:    movl {{[0-9]+}}(%esp), %eax
; FALLBACK31-NEXT:    movl %ebx, 4(%eax)
; FALLBACK31-NEXT:    movl %ebp, 24(%eax)
; FALLBACK31-NEXT:    sarxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
; FALLBACK31-NEXT:    movl %ebx, 28(%eax)
; FALLBACK31-NEXT:    movl %esi, 16(%eax)
; FALLBACK31-NEXT:    movl %edi, 20(%eax)
; FALLBACK31-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; FALLBACK31-NEXT:    movl %esi, 8(%eax)
; FALLBACK31-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; FALLBACK31-NEXT:    movl %esi, 12(%eax)
; FALLBACK31-NEXT:    # kill: def $cl killed $cl killed $ecx
; FALLBACK31-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; FALLBACK31-NEXT:    shrdl %cl, %esi, %edx
; FALLBACK31-NEXT:    movl %edx, (%eax)
; FALLBACK31-NEXT:    addl $108, %esp
; FALLBACK31-NEXT:    popl %esi
; FALLBACK31-NEXT:    popl %edi
; FALLBACK31-NEXT:    popl %ebx
; FALLBACK31-NEXT:    popl %ebp
; FALLBACK31-NEXT:    retl
  %src = load i256, ptr %src.ptr, align 1
  %byteOff = load i256, ptr %byteOff.ptr, align 1
  %bitOff = shl i256 %byteOff, 3
  %res = ashr i256 %src, %bitOff
  store i256 %res, ptr %dst, align 1
  ret void
}

define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nounwind {
; FALLBACK0-LABEL: ashr_32bytes_dwordOff:
; FALLBACK0:       # %bb.0:
; FALLBACK0-NEXT:    pushq %rbx
; FALLBACK0-NEXT:    movq (%rdi), %rcx
; FALLBACK0-NEXT:    movq 8(%rdi), %r8
; FALLBACK0-NEXT:    movq 16(%rdi), %r9
; FALLBACK0-NEXT:    movq 24(%rdi), %rdi
; FALLBACK0-NEXT:    movzbl (%rsi), %esi
; FALLBACK0-NEXT:    movl %esi, %eax
; FALLBACK0-NEXT:    shlb $5, %al
; FALLBACK0-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK0-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
; FALLBACK0-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
; FALLBACK0-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
; FALLBACK0-NEXT:    sarq $63, %rdi
; FALLBACK0-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK0-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK0-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK0-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK0-NEXT:    andb $6, %sil
; FALLBACK0-NEXT:    movzbl %sil, %r9d
; FALLBACK0-NEXT:    movq -64(%rsp,%r9,4), %r10
; FALLBACK0-NEXT:    movq -56(%rsp,%r9,4), %rdi
; FALLBACK0-NEXT:    movq %rdi, %r11
; FALLBACK0-NEXT:    movl %eax, %ecx
; FALLBACK0-NEXT:    shrq %cl, %r11
; FALLBACK0-NEXT:    movl %eax, %esi
; FALLBACK0-NEXT:    notb %sil
; FALLBACK0-NEXT:    movq -48(%rsp,%r9,4), %rbx
; FALLBACK0-NEXT:    leaq (%rbx,%rbx), %r8
; FALLBACK0-NEXT:    movl %esi, %ecx
; FALLBACK0-NEXT:    shlq %cl, %r8
; FALLBACK0-NEXT:    orq %r11, %r8
; FALLBACK0-NEXT:    movl %eax, %ecx
; FALLBACK0-NEXT:    shrq %cl, %r10
; FALLBACK0-NEXT:    addq %rdi, %rdi
; FALLBACK0-NEXT:    movl %esi, %ecx
; FALLBACK0-NEXT:    shlq %cl, %rdi
; FALLBACK0-NEXT:    orq %r10, %rdi
; FALLBACK0-NEXT:    movl %eax, %ecx
; FALLBACK0-NEXT:    shrq %cl, %rbx
; FALLBACK0-NEXT:    movq -40(%rsp,%r9,4), %r9
; FALLBACK0-NEXT:    leaq (%r9,%r9), %r10
; FALLBACK0-NEXT:    movl %esi, %ecx
; FALLBACK0-NEXT:    shlq %cl, %r10
; FALLBACK0-NEXT:    orq %rbx, %r10
; FALLBACK0-NEXT:    movl %eax, %ecx
; FALLBACK0-NEXT:    sarq %cl, %r9
; FALLBACK0-NEXT:    movq %r9, 24(%rdx)
; FALLBACK0-NEXT:    movq %r10, 16(%rdx)
; FALLBACK0-NEXT:    movq %rdi, (%rdx)
; FALLBACK0-NEXT:    movq %r8, 8(%rdx)
; FALLBACK0-NEXT:    popq %rbx
; FALLBACK0-NEXT:    retq
;
; FALLBACK1-LABEL: ashr_32bytes_dwordOff:
; FALLBACK1:       # %bb.0:
; FALLBACK1-NEXT:    movq (%rdi), %rax
; FALLBACK1-NEXT:    movq 8(%rdi), %r8
; FALLBACK1-NEXT:    movq 16(%rdi), %r9
; FALLBACK1-NEXT:    movq 24(%rdi), %rdi
; FALLBACK1-NEXT:    movzbl (%rsi), %esi
; FALLBACK1-NEXT:    movl %esi, %ecx
; FALLBACK1-NEXT:    shlb $5, %cl
; FALLBACK1-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK1-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
; FALLBACK1-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
; FALLBACK1-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
; FALLBACK1-NEXT:    sarq $63, %rdi
; FALLBACK1-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK1-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK1-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK1-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK1-NEXT:    andb $6, %sil
; FALLBACK1-NEXT:    movzbl %sil, %eax
; FALLBACK1-NEXT:    movq -56(%rsp,%rax,4), %rsi
; FALLBACK1-NEXT:    movq -72(%rsp,%rax,4), %rdi
; FALLBACK1-NEXT:    movq -64(%rsp,%rax,4), %r8
; FALLBACK1-NEXT:    movq %r8, %r9
; FALLBACK1-NEXT:    shrdq %cl, %rsi, %r9
; FALLBACK1-NEXT:    movq -48(%rsp,%rax,4), %rax
; FALLBACK1-NEXT:    shrdq %cl, %rax, %rsi
; FALLBACK1-NEXT:    shrdq %cl, %r8, %rdi
; FALLBACK1-NEXT:    sarq %cl, %rax
; FALLBACK1-NEXT:    movq %rsi, 16(%rdx)
; FALLBACK1-NEXT:    movq %rax, 24(%rdx)
; FALLBACK1-NEXT:    movq %rdi, (%rdx)
; FALLBACK1-NEXT:    movq %r9, 8(%rdx)
; FALLBACK1-NEXT:    retq
;
; FALLBACK2-LABEL: ashr_32bytes_dwordOff:
; FALLBACK2:       # %bb.0:
; FALLBACK2-NEXT:    movq (%rdi), %rcx
; FALLBACK2-NEXT:    movq 8(%rdi), %r8
; FALLBACK2-NEXT:    movq 16(%rdi), %r9
; FALLBACK2-NEXT:    movq 24(%rdi), %rdi
; FALLBACK2-NEXT:    movzbl (%rsi), %esi
; FALLBACK2-NEXT:    movl %esi, %eax
; FALLBACK2-NEXT:    shlb $5, %al
; FALLBACK2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK2-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
; FALLBACK2-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
; FALLBACK2-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
; FALLBACK2-NEXT:    sarq $63, %rdi
; FALLBACK2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK2-NEXT:    andb $6, %sil
; FALLBACK2-NEXT:    movzbl %sil, %ecx
; FALLBACK2-NEXT:    movq -64(%rsp,%rcx,4), %rsi
; FALLBACK2-NEXT:    movq -56(%rsp,%rcx,4), %rdi
; FALLBACK2-NEXT:    shrxq %rax, %rsi, %r8
; FALLBACK2-NEXT:    shrxq %rax, -72(%rsp,%rcx,4), %r9
; FALLBACK2-NEXT:    shrxq %rax, %rdi, %r10
; FALLBACK2-NEXT:    movq -48(%rsp,%rcx,4), %rcx
; FALLBACK2-NEXT:    sarxq %rax, %rcx, %r11
; FALLBACK2-NEXT:    # kill: def $al killed $al killed $rax def $rax
; FALLBACK2-NEXT:    notb %al
; FALLBACK2-NEXT:    addq %rdi, %rdi
; FALLBACK2-NEXT:    shlxq %rax, %rdi, %rdi
; FALLBACK2-NEXT:    orq %r8, %rdi
; FALLBACK2-NEXT:    addq %rsi, %rsi
; FALLBACK2-NEXT:    shlxq %rax, %rsi, %rsi
; FALLBACK2-NEXT:    orq %r9, %rsi
; FALLBACK2-NEXT:    addq %rcx, %rcx
; FALLBACK2-NEXT:    shlxq %rax, %rcx, %rax
; FALLBACK2-NEXT:    orq %r10, %rax
; FALLBACK2-NEXT:    movq %r11, 24(%rdx)
; FALLBACK2-NEXT:    movq %rax, 16(%rdx)
; FALLBACK2-NEXT:    movq %rsi, (%rdx)
; FALLBACK2-NEXT:    movq %rdi, 8(%rdx)
; FALLBACK2-NEXT:    retq
;
; FALLBACK3-LABEL: ashr_32bytes_dwordOff:
; FALLBACK3:       # %bb.0:
; FALLBACK3-NEXT:    movq (%rdi), %rax
; FALLBACK3-NEXT:    movq 8(%rdi), %r8
; FALLBACK3-NEXT:    movq 16(%rdi), %r9
; FALLBACK3-NEXT:    movq 24(%rdi), %rdi
; FALLBACK3-NEXT:    movzbl (%rsi), %esi
; FALLBACK3-NEXT:    movl %esi, %ecx
; FALLBACK3-NEXT:    shlb $5, %cl
; FALLBACK3-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK3-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
; FALLBACK3-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
; FALLBACK3-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
; FALLBACK3-NEXT:    sarq $63, %rdi
; FALLBACK3-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK3-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK3-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK3-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK3-NEXT:    andb $6, %sil
; FALLBACK3-NEXT:    movzbl %sil, %eax
; FALLBACK3-NEXT:    movq -56(%rsp,%rax,4), %rsi
; FALLBACK3-NEXT:    movq -72(%rsp,%rax,4), %rdi
; FALLBACK3-NEXT:    movq -64(%rsp,%rax,4), %r8
; FALLBACK3-NEXT:    movq %r8, %r9
; FALLBACK3-NEXT:    shrdq %cl, %rsi, %r9
; FALLBACK3-NEXT:    movq -48(%rsp,%rax,4), %rax
; FALLBACK3-NEXT:    shrdq %cl, %rax, %rsi
; FALLBACK3-NEXT:    shrdq %cl, %r8, %rdi
; FALLBACK3-NEXT:    sarxq %rcx, %rax, %rax
; FALLBACK3-NEXT:    movq %rsi, 16(%rdx)
; FALLBACK3-NEXT:    movq %rax, 24(%rdx)
; FALLBACK3-NEXT:    movq %rdi, (%rdx)
; FALLBACK3-NEXT:    movq %r9, 8(%rdx)
; FALLBACK3-NEXT:    retq
;
; FALLBACK4-LABEL: ashr_32bytes_dwordOff:
; FALLBACK4:       # %bb.0:
; FALLBACK4-NEXT:    pushq %rbx
; FALLBACK4-NEXT:    movups (%rdi), %xmm0
; FALLBACK4-NEXT:    movq 16(%rdi), %rcx
; FALLBACK4-NEXT:    movq 24(%rdi), %rdi
; FALLBACK4-NEXT:    movzbl (%rsi), %esi
; FALLBACK4-NEXT:    movl %esi, %eax
; FALLBACK4-NEXT:    shlb $5, %al
; FALLBACK4-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK4-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
; FALLBACK4-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; FALLBACK4-NEXT:    sarq $63, %rdi
; FALLBACK4-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK4-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK4-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK4-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK4-NEXT:    andb $6, %sil
; FALLBACK4-NEXT:    movzbl %sil, %r9d
; FALLBACK4-NEXT:    movq -64(%rsp,%r9,4), %r10
; FALLBACK4-NEXT:    movq -56(%rsp,%r9,4), %r8
; FALLBACK4-NEXT:    movl %eax, %ecx
; FALLBACK4-NEXT:    shrq %cl, %r10
; FALLBACK4-NEXT:    movl %eax, %esi
; FALLBACK4-NEXT:    notb %sil
; FALLBACK4-NEXT:    leaq (%r8,%r8), %rdi
; FALLBACK4-NEXT:    movl %esi, %ecx
; FALLBACK4-NEXT:    shlq %cl, %rdi
; FALLBACK4-NEXT:    orq %r10, %rdi
; FALLBACK4-NEXT:    movq -48(%rsp,%r9,4), %r10
; FALLBACK4-NEXT:    movq %r10, %r11
; FALLBACK4-NEXT:    movl %eax, %ecx
; FALLBACK4-NEXT:    shrq %cl, %r11
; FALLBACK4-NEXT:    movq -40(%rsp,%r9,4), %r9
; FALLBACK4-NEXT:    leaq (%r9,%r9), %rbx
; FALLBACK4-NEXT:    movl %esi, %ecx
; FALLBACK4-NEXT:    shlq %cl, %rbx
; FALLBACK4-NEXT:    orq %r11, %rbx
; FALLBACK4-NEXT:    movl %eax, %ecx
; FALLBACK4-NEXT:    shrq %cl, %r8
; FALLBACK4-NEXT:    addq %r10, %r10
; FALLBACK4-NEXT:    movl %esi, %ecx
; FALLBACK4-NEXT:    shlq %cl, %r10
; FALLBACK4-NEXT:    orq %r8, %r10
; FALLBACK4-NEXT:    movl %eax, %ecx
; FALLBACK4-NEXT:    sarq %cl, %r9
; FALLBACK4-NEXT:    movq %r9, 24(%rdx)
; FALLBACK4-NEXT:    movq %r10, 8(%rdx)
; FALLBACK4-NEXT:    movq %rbx, 16(%rdx)
; FALLBACK4-NEXT:    movq %rdi, (%rdx)
; FALLBACK4-NEXT:    popq %rbx
; FALLBACK4-NEXT:    retq
;
; FALLBACK5-LABEL: ashr_32bytes_dwordOff:
; FALLBACK5:       # %bb.0:
; FALLBACK5-NEXT:    movups (%rdi), %xmm0
; FALLBACK5-NEXT:    movq 16(%rdi), %rax
; FALLBACK5-NEXT:    movq 24(%rdi), %rdi
; FALLBACK5-NEXT:    movzbl (%rsi), %esi
; FALLBACK5-NEXT:    movl %esi, %ecx
; FALLBACK5-NEXT:    shlb $5, %cl
; FALLBACK5-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK5-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
; FALLBACK5-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; FALLBACK5-NEXT:    sarq $63, %rdi
; FALLBACK5-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK5-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK5-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK5-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK5-NEXT:    andb $6, %sil
; FALLBACK5-NEXT:    movzbl %sil, %eax
; FALLBACK5-NEXT:    movq -48(%rsp,%rax,4), %rsi
; FALLBACK5-NEXT:    movq -56(%rsp,%rax,4), %rdi
; FALLBACK5-NEXT:    movq %rdi, %r8
; FALLBACK5-NEXT:    shrdq %cl, %rsi, %r8
; FALLBACK5-NEXT:    movq -72(%rsp,%rax,4), %r9
; FALLBACK5-NEXT:    movq -64(%rsp,%rax,4), %rax
; FALLBACK5-NEXT:    movq %rax, %r10
; FALLBACK5-NEXT:    shrdq %cl, %rdi, %r10
; FALLBACK5-NEXT:    shrdq %cl, %rax, %r9
; FALLBACK5-NEXT:    sarq %cl, %rsi
; FALLBACK5-NEXT:    movq %r10, 8(%rdx)
; FALLBACK5-NEXT:    movq %r8, 16(%rdx)
; FALLBACK5-NEXT:    movq %rsi, 24(%rdx)
; FALLBACK5-NEXT:    movq %r9, (%rdx)
; FALLBACK5-NEXT:    retq
;
; FALLBACK6-LABEL: ashr_32bytes_dwordOff:
; FALLBACK6:       # %bb.0:
; FALLBACK6-NEXT:    movups (%rdi), %xmm0
; FALLBACK6-NEXT:    movq 16(%rdi), %rcx
; FALLBACK6-NEXT:    movq 24(%rdi), %rdi
; FALLBACK6-NEXT:    movzbl (%rsi), %esi
; FALLBACK6-NEXT:    movl %esi, %eax
; FALLBACK6-NEXT:    shlb $5, %al
; FALLBACK6-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK6-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
; FALLBACK6-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; FALLBACK6-NEXT:    sarq $63, %rdi
; FALLBACK6-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK6-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK6-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK6-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK6-NEXT:    andb $6, %sil
; FALLBACK6-NEXT:    movzbl %sil, %ecx
; FALLBACK6-NEXT:    shrxq %rax, -72(%rsp,%rcx,4), %rsi
; FALLBACK6-NEXT:    movq -64(%rsp,%rcx,4), %rdi
; FALLBACK6-NEXT:    movq -56(%rsp,%rcx,4), %r8
; FALLBACK6-NEXT:    shrxq %rax, %r8, %r9
; FALLBACK6-NEXT:    movq -48(%rsp,%rcx,4), %rcx
; FALLBACK6-NEXT:    shrxq %rax, %rdi, %r10
; FALLBACK6-NEXT:    sarxq %rax, %rcx, %r11
; FALLBACK6-NEXT:    # kill: def $al killed $al killed $rax def $rax
; FALLBACK6-NEXT:    notb %al
; FALLBACK6-NEXT:    addq %rdi, %rdi
; FALLBACK6-NEXT:    shlxq %rax, %rdi, %rdi
; FALLBACK6-NEXT:    orq %rsi, %rdi
; FALLBACK6-NEXT:    addq %rcx, %rcx
; FALLBACK6-NEXT:    shlxq %rax, %rcx, %rcx
; FALLBACK6-NEXT:    orq %r9, %rcx
; FALLBACK6-NEXT:    addq %r8, %r8
; FALLBACK6-NEXT:    shlxq %rax, %r8, %rax
; FALLBACK6-NEXT:    orq %r10, %rax
; FALLBACK6-NEXT:    movq %r11, 24(%rdx)
; FALLBACK6-NEXT:    movq %rax, 8(%rdx)
; FALLBACK6-NEXT:    movq %rcx, 16(%rdx)
; FALLBACK6-NEXT:    movq %rdi, (%rdx)
; FALLBACK6-NEXT:    retq
;
; FALLBACK7-LABEL: ashr_32bytes_dwordOff:
; FALLBACK7:       # %bb.0:
; FALLBACK7-NEXT:    movups (%rdi), %xmm0
; FALLBACK7-NEXT:    movq 16(%rdi), %rax
; FALLBACK7-NEXT:    movq 24(%rdi), %rdi
; FALLBACK7-NEXT:    movzbl (%rsi), %esi
; FALLBACK7-NEXT:    movl %esi, %ecx
; FALLBACK7-NEXT:    shlb $5, %cl
; FALLBACK7-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK7-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
; FALLBACK7-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; FALLBACK7-NEXT:    sarq $63, %rdi
; FALLBACK7-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK7-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK7-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK7-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK7-NEXT:    andb $6, %sil
; FALLBACK7-NEXT:    movzbl %sil, %eax
; FALLBACK7-NEXT:    movq -48(%rsp,%rax,4), %rsi
; FALLBACK7-NEXT:    movq -56(%rsp,%rax,4), %rdi
; FALLBACK7-NEXT:    movq %rdi, %r8
; FALLBACK7-NEXT:    shrdq %cl, %rsi, %r8
; FALLBACK7-NEXT:    movq -72(%rsp,%rax,4), %r9
; FALLBACK7-NEXT:    movq -64(%rsp,%rax,4), %rax
; FALLBACK7-NEXT:    movq %rax, %r10
; FALLBACK7-NEXT:    shrdq %cl, %rdi, %r10
; FALLBACK7-NEXT:    shrdq %cl, %rax, %r9
; FALLBACK7-NEXT:    sarxq %rcx, %rsi, %rax
; FALLBACK7-NEXT:    movq %r10, 8(%rdx)
; FALLBACK7-NEXT:    movq %r8, 16(%rdx)
; FALLBACK7-NEXT:    movq %rax, 24(%rdx)
; FALLBACK7-NEXT:    movq %r9, (%rdx)
; FALLBACK7-NEXT:    retq
;
; FALLBACK8-LABEL: ashr_32bytes_dwordOff:
; FALLBACK8:       # %bb.0:
; FALLBACK8-NEXT:    pushq %rbx
; FALLBACK8-NEXT:    vmovups (%rdi), %xmm0
; FALLBACK8-NEXT:    movq 16(%rdi), %rcx
; FALLBACK8-NEXT:    movq 24(%rdi), %rdi
; FALLBACK8-NEXT:    movzbl (%rsi), %esi
; FALLBACK8-NEXT:    movl %esi, %eax
; FALLBACK8-NEXT:    shlb $5, %al
; FALLBACK8-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK8-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
; FALLBACK8-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; FALLBACK8-NEXT:    sarq $63, %rdi
; FALLBACK8-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK8-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK8-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK8-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK8-NEXT:    andb $6, %sil
; FALLBACK8-NEXT:    movzbl %sil, %r9d
; FALLBACK8-NEXT:    movq -64(%rsp,%r9,4), %r10
; FALLBACK8-NEXT:    movq -56(%rsp,%r9,4), %r8
; FALLBACK8-NEXT:    movl %eax, %ecx
; FALLBACK8-NEXT:    shrq %cl, %r10
; FALLBACK8-NEXT:    movl %eax, %esi
; FALLBACK8-NEXT:    notb %sil
; FALLBACK8-NEXT:    leaq (%r8,%r8), %rdi
; FALLBACK8-NEXT:    movl %esi, %ecx
; FALLBACK8-NEXT:    shlq %cl, %rdi
; FALLBACK8-NEXT:    orq %r10, %rdi
; FALLBACK8-NEXT:    movq -48(%rsp,%r9,4), %r10
; FALLBACK8-NEXT:    movq %r10, %r11
; FALLBACK8-NEXT:    movl %eax, %ecx
; FALLBACK8-NEXT:    shrq %cl, %r11
; FALLBACK8-NEXT:    movq -40(%rsp,%r9,4), %r9
; FALLBACK8-NEXT:    leaq (%r9,%r9), %rbx
; FALLBACK8-NEXT:    movl %esi, %ecx
; FALLBACK8-NEXT:    shlq %cl, %rbx
; FALLBACK8-NEXT:    orq %r11, %rbx
; FALLBACK8-NEXT:    movl %eax, %ecx
; FALLBACK8-NEXT:    shrq %cl, %r8
; FALLBACK8-NEXT:    addq %r10, %r10
; FALLBACK8-NEXT:    movl %esi, %ecx
; FALLBACK8-NEXT:    shlq %cl, %r10
; FALLBACK8-NEXT:    orq %r8, %r10
; FALLBACK8-NEXT:    movl %eax, %ecx
; FALLBACK8-NEXT:    sarq %cl, %r9
; FALLBACK8-NEXT:    movq %r9, 24(%rdx)
; FALLBACK8-NEXT:    movq %r10, 8(%rdx)
; FALLBACK8-NEXT:    movq %rbx, 16(%rdx)
; FALLBACK8-NEXT:    movq %rdi, (%rdx)
; FALLBACK8-NEXT:    popq %rbx
; FALLBACK8-NEXT:    retq
;
; FALLBACK9-LABEL: ashr_32bytes_dwordOff:
; FALLBACK9:       # %bb.0:
; FALLBACK9-NEXT:    vmovups (%rdi), %xmm0
; FALLBACK9-NEXT:    movq 16(%rdi), %rax
; FALLBACK9-NEXT:    movq 24(%rdi), %rdi
; FALLBACK9-NEXT:    movzbl (%rsi), %esi
; FALLBACK9-NEXT:    movl %esi, %ecx
; FALLBACK9-NEXT:    shlb $5, %cl
; FALLBACK9-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK9-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
; FALLBACK9-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; FALLBACK9-NEXT:    sarq $63, %rdi
; FALLBACK9-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK9-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK9-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK9-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK9-NEXT:    andb $6, %sil
; FALLBACK9-NEXT:    movzbl %sil, %eax
; FALLBACK9-NEXT:    movq -48(%rsp,%rax,4), %rsi
; FALLBACK9-NEXT:    movq -56(%rsp,%rax,4), %rdi
; FALLBACK9-NEXT:    movq %rdi, %r8
; FALLBACK9-NEXT:    shrdq %cl, %rsi, %r8
; FALLBACK9-NEXT:    movq -72(%rsp,%rax,4), %r9
; FALLBACK9-NEXT:    movq -64(%rsp,%rax,4), %rax
; FALLBACK9-NEXT:    movq %rax, %r10
; FALLBACK9-NEXT:    shrdq %cl, %rdi, %r10
; FALLBACK9-NEXT:    shrdq %cl, %rax, %r9
; FALLBACK9-NEXT:    sarq %cl, %rsi
; FALLBACK9-NEXT:    movq %r10, 8(%rdx)
; FALLBACK9-NEXT:    movq %r8, 16(%rdx)
; FALLBACK9-NEXT:    movq %rsi, 24(%rdx)
; FALLBACK9-NEXT:    movq %r9, (%rdx)
; FALLBACK9-NEXT:    retq
;
; FALLBACK10-LABEL: ashr_32bytes_dwordOff:
; FALLBACK10:       # %bb.0:
; FALLBACK10-NEXT:    vmovups (%rdi), %xmm0
; FALLBACK10-NEXT:    movq 16(%rdi), %rcx
; FALLBACK10-NEXT:    movq 24(%rdi), %rdi
; FALLBACK10-NEXT:    movzbl (%rsi), %esi
; FALLBACK10-NEXT:    movl %esi, %eax
; FALLBACK10-NEXT:    shlb $5, %al
; FALLBACK10-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK10-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
; FALLBACK10-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; FALLBACK10-NEXT:    sarq $63, %rdi
; FALLBACK10-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK10-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK10-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK10-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK10-NEXT:    andb $6, %sil
; FALLBACK10-NEXT:    movzbl %sil, %ecx
; FALLBACK10-NEXT:    shrxq %rax, -72(%rsp,%rcx,4), %rsi
; FALLBACK10-NEXT:    movq -64(%rsp,%rcx,4), %rdi
; FALLBACK10-NEXT:    movq -56(%rsp,%rcx,4), %r8
; FALLBACK10-NEXT:    shrxq %rax, %r8, %r9
; FALLBACK10-NEXT:    movq -48(%rsp,%rcx,4), %rcx
; FALLBACK10-NEXT:    shrxq %rax, %rdi, %r10
; FALLBACK10-NEXT:    sarxq %rax, %rcx, %r11
; FALLBACK10-NEXT:    # kill: def $al killed $al killed $rax def $rax
; FALLBACK10-NEXT:    notb %al
; FALLBACK10-NEXT:    addq %rdi, %rdi
; FALLBACK10-NEXT:    shlxq %rax, %rdi, %rdi
; FALLBACK10-NEXT:    orq %rsi, %rdi
; FALLBACK10-NEXT:    addq %rcx, %rcx
; FALLBACK10-NEXT:    shlxq %rax, %rcx, %rcx
; FALLBACK10-NEXT:    orq %r9, %rcx
; FALLBACK10-NEXT:    addq %r8, %r8
; FALLBACK10-NEXT:    shlxq %rax, %r8, %rax
; FALLBACK10-NEXT:    orq %r10, %rax
; FALLBACK10-NEXT:    movq %r11, 24(%rdx)
; FALLBACK10-NEXT:    movq %rax, 8(%rdx)
; FALLBACK10-NEXT:    movq %rcx, 16(%rdx)
; FALLBACK10-NEXT:    movq %rdi, (%rdx)
; FALLBACK10-NEXT:    retq
;
; FALLBACK11-LABEL: ashr_32bytes_dwordOff:
; FALLBACK11:       # %bb.0:
; FALLBACK11-NEXT:    vmovups (%rdi), %xmm0
; FALLBACK11-NEXT:    movq 16(%rdi), %rax
; FALLBACK11-NEXT:    movq 24(%rdi), %rdi
; FALLBACK11-NEXT:    movzbl (%rsi), %esi
; FALLBACK11-NEXT:    movl %esi, %ecx
; FALLBACK11-NEXT:    shlb $5, %cl
; FALLBACK11-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK11-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
; FALLBACK11-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; FALLBACK11-NEXT:    sarq $63, %rdi
; FALLBACK11-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK11-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK11-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK11-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK11-NEXT:    andb $6, %sil
; FALLBACK11-NEXT:    movzbl %sil, %eax
; FALLBACK11-NEXT:    movq -48(%rsp,%rax,4), %rsi
; FALLBACK11-NEXT:    movq -56(%rsp,%rax,4), %rdi
; FALLBACK11-NEXT:    movq %rdi, %r8
; FALLBACK11-NEXT:    shrdq %cl, %rsi, %r8
; FALLBACK11-NEXT:    movq -72(%rsp,%rax,4), %r9
; FALLBACK11-NEXT:    movq -64(%rsp,%rax,4), %rax
; FALLBACK11-NEXT:    movq %rax, %r10
; FALLBACK11-NEXT:    shrdq %cl, %rdi, %r10
; FALLBACK11-NEXT:    shrdq %cl, %rax, %r9
; FALLBACK11-NEXT:    sarxq %rcx, %rsi, %rax
; FALLBACK11-NEXT:    movq %r10, 8(%rdx)
; FALLBACK11-NEXT:    movq %r8, 16(%rdx)
; FALLBACK11-NEXT:    movq %rax, 24(%rdx)
; FALLBACK11-NEXT:    movq %r9, (%rdx)
; FALLBACK11-NEXT:    retq
;
; FALLBACK12-LABEL: ashr_32bytes_dwordOff:
; FALLBACK12:       # %bb.0:
; FALLBACK12-NEXT:    pushq %rbx
; FALLBACK12-NEXT:    vmovups (%rdi), %xmm0
; FALLBACK12-NEXT:    movq 16(%rdi), %rcx
; FALLBACK12-NEXT:    movq 24(%rdi), %rdi
; FALLBACK12-NEXT:    movzbl (%rsi), %esi
; FALLBACK12-NEXT:    movl %esi, %eax
; FALLBACK12-NEXT:    shlb $5, %al
; FALLBACK12-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK12-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
; FALLBACK12-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; FALLBACK12-NEXT:    sarq $63, %rdi
; FALLBACK12-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK12-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK12-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK12-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK12-NEXT:    andb $6, %sil
; FALLBACK12-NEXT:    movzbl %sil, %r9d
; FALLBACK12-NEXT:    movq -64(%rsp,%r9,4), %r10
; FALLBACK12-NEXT:    movq -56(%rsp,%r9,4), %r8
; FALLBACK12-NEXT:    movl %eax, %ecx
; FALLBACK12-NEXT:    shrq %cl, %r10
; FALLBACK12-NEXT:    movl %eax, %esi
; FALLBACK12-NEXT:    notb %sil
; FALLBACK12-NEXT:    leaq (%r8,%r8), %rdi
; FALLBACK12-NEXT:    movl %esi, %ecx
; FALLBACK12-NEXT:    shlq %cl, %rdi
; FALLBACK12-NEXT:    orq %r10, %rdi
; FALLBACK12-NEXT:    movq -48(%rsp,%r9,4), %r10
; FALLBACK12-NEXT:    movq %r10, %r11
; FALLBACK12-NEXT:    movl %eax, %ecx
; FALLBACK12-NEXT:    shrq %cl, %r11
; FALLBACK12-NEXT:    movq -40(%rsp,%r9,4), %r9
; FALLBACK12-NEXT:    leaq (%r9,%r9), %rbx
; FALLBACK12-NEXT:    movl %esi, %ecx
; FALLBACK12-NEXT:    shlq %cl, %rbx
; FALLBACK12-NEXT:    orq %r11, %rbx
; FALLBACK12-NEXT:    movl %eax, %ecx
; FALLBACK12-NEXT:    shrq %cl, %r8
; FALLBACK12-NEXT:    addq %r10, %r10
; FALLBACK12-NEXT:    movl %esi, %ecx
; FALLBACK12-NEXT:    shlq %cl, %r10
; FALLBACK12-NEXT:    orq %r8, %r10
; FALLBACK12-NEXT:    movl %eax, %ecx
; FALLBACK12-NEXT:    sarq %cl, %r9
; FALLBACK12-NEXT:    movq %r9, 24(%rdx)
; FALLBACK12-NEXT:    movq %r10, 8(%rdx)
; FALLBACK12-NEXT:    movq %rbx, 16(%rdx)
; FALLBACK12-NEXT:    movq %rdi, (%rdx)
; FALLBACK12-NEXT:    popq %rbx
; FALLBACK12-NEXT:    retq
;
; FALLBACK13-LABEL: ashr_32bytes_dwordOff:
; FALLBACK13:       # %bb.0:
; FALLBACK13-NEXT:    vmovups (%rdi), %xmm0
; FALLBACK13-NEXT:    movq 16(%rdi), %rax
; FALLBACK13-NEXT:    movq 24(%rdi), %rdi
; FALLBACK13-NEXT:    movzbl (%rsi), %esi
; FALLBACK13-NEXT:    movl %esi, %ecx
; FALLBACK13-NEXT:    shlb $5, %cl
; FALLBACK13-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK13-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
; FALLBACK13-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; FALLBACK13-NEXT:    sarq $63, %rdi
; FALLBACK13-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK13-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK13-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK13-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK13-NEXT:    andb $6, %sil
; FALLBACK13-NEXT:    movzbl %sil, %eax
; FALLBACK13-NEXT:    movq -48(%rsp,%rax,4), %rsi
; FALLBACK13-NEXT:    movq -56(%rsp,%rax,4), %rdi
; FALLBACK13-NEXT:    movq %rdi, %r8
; FALLBACK13-NEXT:    shrdq %cl, %rsi, %r8
; FALLBACK13-NEXT:    movq -72(%rsp,%rax,4), %r9
; FALLBACK13-NEXT:    movq -64(%rsp,%rax,4), %rax
; FALLBACK13-NEXT:    movq %rax, %r10
; FALLBACK13-NEXT:    shrdq %cl, %rdi, %r10
; FALLBACK13-NEXT:    shrdq %cl, %rax, %r9
; FALLBACK13-NEXT:    sarq %cl, %rsi
; FALLBACK13-NEXT:    movq %r10, 8(%rdx)
; FALLBACK13-NEXT:    movq %r8, 16(%rdx)
; FALLBACK13-NEXT:    movq %rsi, 24(%rdx)
; FALLBACK13-NEXT:    movq %r9, (%rdx)
; FALLBACK13-NEXT:    retq
;
; FALLBACK14-LABEL: ashr_32bytes_dwordOff:
; FALLBACK14:       # %bb.0:
; FALLBACK14-NEXT:    vmovups (%rdi), %xmm0
; FALLBACK14-NEXT:    movq 16(%rdi), %rcx
; FALLBACK14-NEXT:    movq 24(%rdi), %rdi
; FALLBACK14-NEXT:    movzbl (%rsi), %esi
; FALLBACK14-NEXT:    movl %esi, %eax
; FALLBACK14-NEXT:    shlb $5, %al
; FALLBACK14-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK14-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
; FALLBACK14-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; FALLBACK14-NEXT:    sarq $63, %rdi
; FALLBACK14-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK14-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK14-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK14-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK14-NEXT:    andb $6, %sil
; FALLBACK14-NEXT:    movzbl %sil, %ecx
; FALLBACK14-NEXT:    shrxq %rax, -72(%rsp,%rcx,4), %rsi
; FALLBACK14-NEXT:    movq -64(%rsp,%rcx,4), %rdi
; FALLBACK14-NEXT:    movq -56(%rsp,%rcx,4), %r8
; FALLBACK14-NEXT:    shrxq %rax, %r8, %r9
; FALLBACK14-NEXT:    movq -48(%rsp,%rcx,4), %rcx
; FALLBACK14-NEXT:    shrxq %rax, %rdi, %r10
; FALLBACK14-NEXT:    sarxq %rax, %rcx, %r11
; FALLBACK14-NEXT:    # kill: def $al killed $al killed $rax def $rax
; FALLBACK14-NEXT:    notb %al
; FALLBACK14-NEXT:    addq %rdi, %rdi
; FALLBACK14-NEXT:    shlxq %rax, %rdi, %rdi
; FALLBACK14-NEXT:    orq %rsi, %rdi
; FALLBACK14-NEXT:    addq %rcx, %rcx
; FALLBACK14-NEXT:    shlxq %rax, %rcx, %rcx
; FALLBACK14-NEXT:    orq %r9, %rcx
; FALLBACK14-NEXT:    addq %r8, %r8
; FALLBACK14-NEXT:    shlxq %rax, %r8, %rax
; FALLBACK14-NEXT:    orq %r10, %rax
; FALLBACK14-NEXT:    movq %r11, 24(%rdx)
; FALLBACK14-NEXT:    movq %rax, 8(%rdx)
; FALLBACK14-NEXT:    movq %rcx, 16(%rdx)
; FALLBACK14-NEXT:    movq %rdi, (%rdx)
; FALLBACK14-NEXT:    retq
;
; FALLBACK15-LABEL: ashr_32bytes_dwordOff:
; FALLBACK15:       # %bb.0:
; FALLBACK15-NEXT:    vmovups (%rdi), %xmm0
; FALLBACK15-NEXT:    movq 16(%rdi), %rax
; FALLBACK15-NEXT:    movq 24(%rdi), %rdi
; FALLBACK15-NEXT:    movzbl (%rsi), %esi
; FALLBACK15-NEXT:    movl %esi, %ecx
; FALLBACK15-NEXT:    shlb $5, %cl
; FALLBACK15-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK15-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
; FALLBACK15-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; FALLBACK15-NEXT:    sarq $63, %rdi
; FALLBACK15-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK15-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK15-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK15-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK15-NEXT:    andb $6, %sil
; FALLBACK15-NEXT:    movzbl %sil, %eax
; FALLBACK15-NEXT:    movq -48(%rsp,%rax,4), %rsi
; FALLBACK15-NEXT:    movq -56(%rsp,%rax,4), %rdi
; FALLBACK15-NEXT:    movq %rdi, %r8
; FALLBACK15-NEXT:    shrdq %cl, %rsi, %r8
; FALLBACK15-NEXT:    movq -72(%rsp,%rax,4), %r9
; FALLBACK15-NEXT:    movq -64(%rsp,%rax,4), %rax
; FALLBACK15-NEXT:    movq %rax, %r10
; FALLBACK15-NEXT:    shrdq %cl, %rdi, %r10
; FALLBACK15-NEXT:    shrdq %cl, %rax, %r9
; FALLBACK15-NEXT:    sarxq %rcx, %rsi, %rax
; FALLBACK15-NEXT:    movq %r10, 8(%rdx)
; FALLBACK15-NEXT:    movq %r8, 16(%rdx)
; FALLBACK15-NEXT:    movq %rax, 24(%rdx)
; FALLBACK15-NEXT:    movq %r9, (%rdx)
; FALLBACK15-NEXT:    retq
;
; X86-SSE2-LABEL: ashr_32bytes_dwordOff:
; X86-SSE2:       # %bb.0:
; X86-SSE2-NEXT:    pushl %ebp
; X86-SSE2-NEXT:    pushl %ebx
; X86-SSE2-NEXT:    pushl %edi
; X86-SSE2-NEXT:    pushl %esi
; X86-SSE2-NEXT:    subl $92, %esp
; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE2-NEXT:    movl (%eax), %ecx
; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-SSE2-NEXT:    movl 4(%eax), %ecx
; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-SSE2-NEXT:    movl 8(%eax), %edi
; X86-SSE2-NEXT:    movl 12(%eax), %ebx
; X86-SSE2-NEXT:    movl 16(%eax), %ebp
; X86-SSE2-NEXT:    movl 20(%eax), %esi
; X86-SSE2-NEXT:    movl 24(%eax), %edx
; X86-SSE2-NEXT:    movl 28(%eax), %ecx
; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE2-NEXT:    movzbl (%eax), %eax
; X86-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT:    movl %esi, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT:    movl %edi, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X86-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X86-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT:    sarl $31, %ecx
; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT:    andl $7, %eax
; X86-SSE2-NEXT:    movl 16(%esp,%eax,4), %ecx
; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-SSE2-NEXT:    movl 20(%esp,%eax,4), %ecx
; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-SSE2-NEXT:    movl 28(%esp,%eax,4), %esi
; X86-SSE2-NEXT:    movl 24(%esp,%eax,4), %edi
; X86-SSE2-NEXT:    movl 36(%esp,%eax,4), %ebx
; X86-SSE2-NEXT:    movl 32(%esp,%eax,4), %ebp
; X86-SSE2-NEXT:    movl 44(%esp,%eax,4), %edx
; X86-SSE2-NEXT:    movl 40(%esp,%eax,4), %ecx
; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE2-NEXT:    movl %ecx, 24(%eax)
; X86-SSE2-NEXT:    movl %edx, 28(%eax)
; X86-SSE2-NEXT:    movl %ebp, 16(%eax)
; X86-SSE2-NEXT:    movl %ebx, 20(%eax)
; X86-SSE2-NEXT:    movl %edi, 8(%eax)
; X86-SSE2-NEXT:    movl %esi, 12(%eax)
; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-SSE2-NEXT:    movl %ecx, (%eax)
; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-SSE2-NEXT:    movl %ecx, 4(%eax)
; X86-SSE2-NEXT:    addl $92, %esp
; X86-SSE2-NEXT:    popl %esi
; X86-SSE2-NEXT:    popl %edi
; X86-SSE2-NEXT:    popl %ebx
; X86-SSE2-NEXT:    popl %ebp
; X86-SSE2-NEXT:    retl
;
; X86-SSE42-LABEL: ashr_32bytes_dwordOff:
; X86-SSE42:       # %bb.0:
; X86-SSE42-NEXT:    pushl %ebx
; X86-SSE42-NEXT:    pushl %edi
; X86-SSE42-NEXT:    pushl %esi
; X86-SSE42-NEXT:    subl $64, %esp
; X86-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-SSE42-NEXT:    movups (%edx), %xmm0
; X86-SSE42-NEXT:    movl 16(%edx), %esi
; X86-SSE42-NEXT:    movl 20(%edx), %edi
; X86-SSE42-NEXT:    movl 24(%edx), %ebx
; X86-SSE42-NEXT:    movl 28(%edx), %edx
; X86-SSE42-NEXT:    movzbl (%ecx), %ecx
; X86-SSE42-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; X86-SSE42-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
; X86-SSE42-NEXT:    movl %edi, {{[0-9]+}}(%esp)
; X86-SSE42-NEXT:    movl %esi, {{[0-9]+}}(%esp)
; X86-SSE42-NEXT:    movaps %xmm0, (%esp)
; X86-SSE42-NEXT:    sarl $31, %edx
; X86-SSE42-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; X86-SSE42-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; X86-SSE42-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; X86-SSE42-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; X86-SSE42-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; X86-SSE42-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; X86-SSE42-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; X86-SSE42-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; X86-SSE42-NEXT:    andl $7, %ecx
; X86-SSE42-NEXT:    movups (%esp,%ecx,4), %xmm0
; X86-SSE42-NEXT:    movups 16(%esp,%ecx,4), %xmm1
; X86-SSE42-NEXT:    movups %xmm1, 16(%eax)
; X86-SSE42-NEXT:    movups %xmm0, (%eax)
; X86-SSE42-NEXT:    addl $64, %esp
; X86-SSE42-NEXT:    popl %esi
; X86-SSE42-NEXT:    popl %edi
; X86-SSE42-NEXT:    popl %ebx
; X86-SSE42-NEXT:    retl
;
; X86-AVX-LABEL: ashr_32bytes_dwordOff:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    pushl %ebx
; X86-AVX-NEXT:    pushl %edi
; X86-AVX-NEXT:    pushl %esi
; X86-AVX-NEXT:    subl $64, %esp
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-AVX-NEXT:    vmovups (%edx), %xmm0
; X86-AVX-NEXT:    movl 16(%edx), %esi
; X86-AVX-NEXT:    movl 20(%edx), %edi
; X86-AVX-NEXT:    movl 24(%edx), %ebx
; X86-AVX-NEXT:    movl 28(%edx), %edx
; X86-AVX-NEXT:    movzbl (%ecx), %ecx
; X86-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; X86-AVX-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
; X86-AVX-NEXT:    movl %edi, {{[0-9]+}}(%esp)
; X86-AVX-NEXT:    movl %esi, {{[0-9]+}}(%esp)
; X86-AVX-NEXT:    vmovaps %xmm0, (%esp)
; X86-AVX-NEXT:    sarl $31, %edx
; X86-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; X86-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; X86-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; X86-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; X86-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; X86-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; X86-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; X86-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; X86-AVX-NEXT:    andl $7, %ecx
; X86-AVX-NEXT:    vmovups (%esp,%ecx,4), %xmm0
; X86-AVX-NEXT:    vmovups 16(%esp,%ecx,4), %xmm1
; X86-AVX-NEXT:    vmovups %xmm1, 16(%eax)
; X86-AVX-NEXT:    vmovups %xmm0, (%eax)
; X86-AVX-NEXT:    addl $64, %esp
; X86-AVX-NEXT:    popl %esi
; X86-AVX-NEXT:    popl %edi
; X86-AVX-NEXT:    popl %ebx
; X86-AVX-NEXT:    retl
  %src = load i256, ptr %src.ptr, align 1
  %dwordOff = load i256, ptr %dwordOff.ptr, align 1
  %bitOff = shl i256 %dwordOff, 5
  %res = ashr i256 %src, %bitOff
  store i256 %res, ptr %dst, align 1
  ret void
}

; Arithmetic right shift of a 256-bit (32-byte) value by a whole number of
; qwords. The shift amount is loaded as an i256 count of qwords and turned into
; a bit count with `shl ..., 6` (multiply by 64) before it feeds the `ashr`.
; The source, the offset and the destination are all accessed with `align 1`.
;
; What the assertions below pin down, for every checked configuration:
;   * no bit-shift sequence is emitted for the wide shift itself; the source
;     words are stored to the stack together with 32 bytes of sign fill
;     (`sar $63` / `sar $31` of the most significant word),
;   * only the low byte of the offset is loaded, and it is masked with 3,
;   * the result is read back with indexed loads scaled by 8 and stored to the
;     destination.
; The check lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
define void @ashr_32bytes_qwordOff(ptr %src.ptr, ptr %qwordOff.ptr, ptr %dst) nounwind {
; X64-SSE2-LABEL: ashr_32bytes_qwordOff:
; X64-SSE2:       # %bb.0:
; X64-SSE2-NEXT:    movq (%rdi), %rax
; X64-SSE2-NEXT:    movq 8(%rdi), %rcx
; X64-SSE2-NEXT:    movq 16(%rdi), %r8
; X64-SSE2-NEXT:    movq 24(%rdi), %rdi
; X64-SSE2-NEXT:    movzbl (%rsi), %esi
; X64-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; X64-SSE2-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
; X64-SSE2-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
; X64-SSE2-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
; X64-SSE2-NEXT:    sarq $63, %rdi
; X64-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; X64-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; X64-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; X64-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; X64-SSE2-NEXT:    andl $3, %esi
; X64-SSE2-NEXT:    movq -72(%rsp,%rsi,8), %rax
; X64-SSE2-NEXT:    movq -64(%rsp,%rsi,8), %rcx
; X64-SSE2-NEXT:    movq -48(%rsp,%rsi,8), %rdi
; X64-SSE2-NEXT:    movq -56(%rsp,%rsi,8), %rsi
; X64-SSE2-NEXT:    movq %rsi, 16(%rdx)
; X64-SSE2-NEXT:    movq %rdi, 24(%rdx)
; X64-SSE2-NEXT:    movq %rax, (%rdx)
; X64-SSE2-NEXT:    movq %rcx, 8(%rdx)
; X64-SSE2-NEXT:    retq
;
; X64-SSE42-LABEL: ashr_32bytes_qwordOff:
; X64-SSE42:       # %bb.0:
; X64-SSE42-NEXT:    movups (%rdi), %xmm0
; X64-SSE42-NEXT:    movq 16(%rdi), %rax
; X64-SSE42-NEXT:    movq 24(%rdi), %rcx
; X64-SSE42-NEXT:    movzbl (%rsi), %esi
; X64-SSE42-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
; X64-SSE42-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
; X64-SSE42-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-SSE42-NEXT:    sarq $63, %rcx
; X64-SSE42-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
; X64-SSE42-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
; X64-SSE42-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
; X64-SSE42-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
; X64-SSE42-NEXT:    andl $3, %esi
; X64-SSE42-NEXT:    movups -72(%rsp,%rsi,8), %xmm0
; X64-SSE42-NEXT:    movups -56(%rsp,%rsi,8), %xmm1
; X64-SSE42-NEXT:    movups %xmm1, 16(%rdx)
; X64-SSE42-NEXT:    movups %xmm0, (%rdx)
; X64-SSE42-NEXT:    retq
;
; X64-AVX-LABEL: ashr_32bytes_qwordOff:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vmovups (%rdi), %xmm0
; X64-AVX-NEXT:    movq 16(%rdi), %rax
; X64-AVX-NEXT:    movq 24(%rdi), %rcx
; X64-AVX-NEXT:    movzbl (%rsi), %esi
; X64-AVX-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
; X64-AVX-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
; X64-AVX-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-AVX-NEXT:    sarq $63, %rcx
; X64-AVX-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
; X64-AVX-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
; X64-AVX-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
; X64-AVX-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
; X64-AVX-NEXT:    andl $3, %esi
; X64-AVX-NEXT:    vmovups -72(%rsp,%rsi,8), %xmm0
; X64-AVX-NEXT:    vmovups -56(%rsp,%rsi,8), %xmm1
; X64-AVX-NEXT:    vmovups %xmm1, 16(%rdx)
; X64-AVX-NEXT:    vmovups %xmm0, (%rdx)
; X64-AVX-NEXT:    retq
;
; X86-SSE2-LABEL: ashr_32bytes_qwordOff:
; X86-SSE2:       # %bb.0:
; X86-SSE2-NEXT:    pushl %ebp
; X86-SSE2-NEXT:    pushl %ebx
; X86-SSE2-NEXT:    pushl %edi
; X86-SSE2-NEXT:    pushl %esi
; X86-SSE2-NEXT:    subl $92, %esp
; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE2-NEXT:    movl (%eax), %ecx
; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-SSE2-NEXT:    movl 4(%eax), %ecx
; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-SSE2-NEXT:    movl 8(%eax), %edi
; X86-SSE2-NEXT:    movl 12(%eax), %ebx
; X86-SSE2-NEXT:    movl 16(%eax), %ebp
; X86-SSE2-NEXT:    movl 20(%eax), %esi
; X86-SSE2-NEXT:    movl 24(%eax), %edx
; X86-SSE2-NEXT:    movl 28(%eax), %ecx
; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE2-NEXT:    movzbl (%eax), %eax
; X86-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT:    movl %esi, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT:    movl %edi, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X86-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X86-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT:    sarl $31, %ecx
; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT:    andl $3, %eax
; X86-SSE2-NEXT:    movl 16(%esp,%eax,8), %ecx
; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-SSE2-NEXT:    movl 20(%esp,%eax,8), %ecx
; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-SSE2-NEXT:    movl 28(%esp,%eax,8), %esi
; X86-SSE2-NEXT:    movl 24(%esp,%eax,8), %edi
; X86-SSE2-NEXT:    movl 36(%esp,%eax,8), %ebx
; X86-SSE2-NEXT:    movl 32(%esp,%eax,8), %ebp
; X86-SSE2-NEXT:    movl 44(%esp,%eax,8), %edx
; X86-SSE2-NEXT:    movl 40(%esp,%eax,8), %ecx
; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE2-NEXT:    movl %ecx, 24(%eax)
; X86-SSE2-NEXT:    movl %edx, 28(%eax)
; X86-SSE2-NEXT:    movl %ebp, 16(%eax)
; X86-SSE2-NEXT:    movl %ebx, 20(%eax)
; X86-SSE2-NEXT:    movl %edi, 8(%eax)
; X86-SSE2-NEXT:    movl %esi, 12(%eax)
; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-SSE2-NEXT:    movl %ecx, (%eax)
; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-SSE2-NEXT:    movl %ecx, 4(%eax)
; X86-SSE2-NEXT:    addl $92, %esp
; X86-SSE2-NEXT:    popl %esi
; X86-SSE2-NEXT:    popl %edi
; X86-SSE2-NEXT:    popl %ebx
; X86-SSE2-NEXT:    popl %ebp
; X86-SSE2-NEXT:    retl
;
; X86-SSE42-LABEL: ashr_32bytes_qwordOff:
; X86-SSE42:       # %bb.0:
; X86-SSE42-NEXT:    pushl %ebx
; X86-SSE42-NEXT:    pushl %edi
; X86-SSE42-NEXT:    pushl %esi
; X86-SSE42-NEXT:    subl $64, %esp
; X86-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-SSE42-NEXT:    movups (%edx), %xmm0
; X86-SSE42-NEXT:    movl 16(%edx), %esi
; X86-SSE42-NEXT:    movl 20(%edx), %edi
; X86-SSE42-NEXT:    movl 24(%edx), %ebx
; X86-SSE42-NEXT:    movl 28(%edx), %edx
; X86-SSE42-NEXT:    movzbl (%ecx), %ecx
; X86-SSE42-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; X86-SSE42-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
; X86-SSE42-NEXT:    movl %edi, {{[0-9]+}}(%esp)
; X86-SSE42-NEXT:    movl %esi, {{[0-9]+}}(%esp)
; X86-SSE42-NEXT:    movaps %xmm0, (%esp)
; X86-SSE42-NEXT:    sarl $31, %edx
; X86-SSE42-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; X86-SSE42-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; X86-SSE42-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; X86-SSE42-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; X86-SSE42-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; X86-SSE42-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; X86-SSE42-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; X86-SSE42-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; X86-SSE42-NEXT:    andl $3, %ecx
; X86-SSE42-NEXT:    movups (%esp,%ecx,8), %xmm0
; X86-SSE42-NEXT:    movups 16(%esp,%ecx,8), %xmm1
; X86-SSE42-NEXT:    movups %xmm1, 16(%eax)
; X86-SSE42-NEXT:    movups %xmm0, (%eax)
; X86-SSE42-NEXT:    addl $64, %esp
; X86-SSE42-NEXT:    popl %esi
; X86-SSE42-NEXT:    popl %edi
; X86-SSE42-NEXT:    popl %ebx
; X86-SSE42-NEXT:    retl
;
; X86-AVX-LABEL: ashr_32bytes_qwordOff:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    pushl %ebx
; X86-AVX-NEXT:    pushl %edi
; X86-AVX-NEXT:    pushl %esi
; X86-AVX-NEXT:    subl $64, %esp
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-AVX-NEXT:    vmovups (%edx), %xmm0
; X86-AVX-NEXT:    movl 16(%edx), %esi
; X86-AVX-NEXT:    movl 20(%edx), %edi
; X86-AVX-NEXT:    movl 24(%edx), %ebx
; X86-AVX-NEXT:    movl 28(%edx), %edx
; X86-AVX-NEXT:    movzbl (%ecx), %ecx
; X86-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; X86-AVX-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
; X86-AVX-NEXT:    movl %edi, {{[0-9]+}}(%esp)
; X86-AVX-NEXT:    movl %esi, {{[0-9]+}}(%esp)
; X86-AVX-NEXT:    vmovaps %xmm0, (%esp)
; X86-AVX-NEXT:    sarl $31, %edx
; X86-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; X86-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; X86-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; X86-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; X86-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; X86-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; X86-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; X86-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; X86-AVX-NEXT:    andl $3, %ecx
; X86-AVX-NEXT:    vmovups (%esp,%ecx,8), %xmm0
; X86-AVX-NEXT:    vmovups 16(%esp,%ecx,8), %xmm1
; X86-AVX-NEXT:    vmovups %xmm1, 16(%eax)
; X86-AVX-NEXT:    vmovups %xmm0, (%eax)
; X86-AVX-NEXT:    addl $64, %esp
; X86-AVX-NEXT:    popl %esi
; X86-AVX-NEXT:    popl %edi
; X86-AVX-NEXT:    popl %ebx
; X86-AVX-NEXT:    retl
  ; 256-bit value to shift; `align 1` means no alignment is promised.
  %src = load i256, ptr %src.ptr, align 1
  ; Shift distance expressed in 64-bit units, also loaded as a full i256.
  %qwordOff = load i256, ptr %qwordOff.ptr, align 1
  ; Convert qwords to bits: qwordOff * 64.
  %bitOff = shl i256 %qwordOff, 6
  ; The operation under test: sign-propagating right shift by a multiple of 64.
  %res = ashr i256 %src, %bitOff
  store i256 %res, ptr %dst, align 1
  ret void
}

define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; FALLBACK0-LABEL: lshr_64bytes:
; FALLBACK0:       # %bb.0:
; FALLBACK0-NEXT:    pushq %r15
; FALLBACK0-NEXT:    pushq %r14
; FALLBACK0-NEXT:    pushq %r13
; FALLBACK0-NEXT:    pushq %r12
; FALLBACK0-NEXT:    pushq %rbx
; FALLBACK0-NEXT:    movq (%rdi), %rax
; FALLBACK0-NEXT:    movq 8(%rdi), %rcx
; FALLBACK0-NEXT:    movq 16(%rdi), %r8
; FALLBACK0-NEXT:    movq 24(%rdi), %r9
; FALLBACK0-NEXT:    movq 32(%rdi), %r10
; FALLBACK0-NEXT:    movq 40(%rdi), %r11
; FALLBACK0-NEXT:    movq 48(%rdi), %rbx
; FALLBACK0-NEXT:    movq 56(%rdi), %r14
; FALLBACK0-NEXT:    movl (%rsi), %edi
; FALLBACK0-NEXT:    xorps %xmm0, %xmm0
; FALLBACK0-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; FALLBACK0-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; FALLBACK0-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; FALLBACK0-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; FALLBACK0-NEXT:    movq %r14, -{{[0-9]+}}(%rsp)
; FALLBACK0-NEXT:    movq %rbx, -{{[0-9]+}}(%rsp)
; FALLBACK0-NEXT:    movq %r11, -{{[0-9]+}}(%rsp)
; FALLBACK0-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
; FALLBACK0-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
; FALLBACK0-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
; FALLBACK0-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
; FALLBACK0-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
; FALLBACK0-NEXT:    leal (,%rdi,8), %eax
; FALLBACK0-NEXT:    andl $56, %eax
; FALLBACK0-NEXT:    andl $56, %edi
; FALLBACK0-NEXT:    movq -128(%rsp,%rdi), %r10
; FALLBACK0-NEXT:    movq -120(%rsp,%rdi), %r8
; FALLBACK0-NEXT:    movq %r8, %r11
; FALLBACK0-NEXT:    movl %eax, %ecx
; FALLBACK0-NEXT:    shrq %cl, %r11
; FALLBACK0-NEXT:    movl %eax, %esi
; FALLBACK0-NEXT:    notb %sil
; FALLBACK0-NEXT:    movq -112(%rsp,%rdi), %rbx
; FALLBACK0-NEXT:    leaq (%rbx,%rbx), %r9
; FALLBACK0-NEXT:    movl %esi, %ecx
; FALLBACK0-NEXT:    shlq %cl, %r9
; FALLBACK0-NEXT:    orq %r11, %r9
; FALLBACK0-NEXT:    movl %eax, %ecx
; FALLBACK0-NEXT:    shrq %cl, %r10
; FALLBACK0-NEXT:    addq %r8, %r8
; FALLBACK0-NEXT:    movl %esi, %ecx
; FALLBACK0-NEXT:    shlq %cl, %r8
; FALLBACK0-NEXT:    orq %r10, %r8
; FALLBACK0-NEXT:    movq -104(%rsp,%rdi), %r10
; FALLBACK0-NEXT:    movq %r10, %r15
; FALLBACK0-NEXT:    movl %eax, %ecx
; FALLBACK0-NEXT:    shrq %cl, %r15
; FALLBACK0-NEXT:    movq -96(%rsp,%rdi), %r14
; FALLBACK0-NEXT:    leaq (%r14,%r14), %r11
; FALLBACK0-NEXT:    movl %esi, %ecx
; FALLBACK0-NEXT:    shlq %cl, %r11
; FALLBACK0-NEXT:    orq %r15, %r11
; FALLBACK0-NEXT:    movl %eax, %ecx
; FALLBACK0-NEXT:    shrq %cl, %rbx
; FALLBACK0-NEXT:    addq %r10, %r10
; FALLBACK0-NEXT:    movl %esi, %ecx
; FALLBACK0-NEXT:    shlq %cl, %r10
; FALLBACK0-NEXT:    orq %rbx, %r10
; FALLBACK0-NEXT:    movq -88(%rsp,%rdi), %rbx
; FALLBACK0-NEXT:    movq %rbx, %r12
; FALLBACK0-NEXT:    movl %eax, %ecx
; FALLBACK0-NEXT:    shrq %cl, %r12
; FALLBACK0-NEXT:    movq -80(%rsp,%rdi), %r13
; FALLBACK0-NEXT:    leaq (%r13,%r13), %r15
; FALLBACK0-NEXT:    movl %esi, %ecx
; FALLBACK0-NEXT:    shlq %cl, %r15
; FALLBACK0-NEXT:    orq %r12, %r15
; FALLBACK0-NEXT:    movl %eax, %ecx
; FALLBACK0-NEXT:    shrq %cl, %r14
; FALLBACK0-NEXT:    addq %rbx, %rbx
; FALLBACK0-NEXT:    movl %esi, %ecx
; FALLBACK0-NEXT:    shlq %cl, %rbx
; FALLBACK0-NEXT:    orq %r14, %rbx
; FALLBACK0-NEXT:    movl %eax, %ecx
; FALLBACK0-NEXT:    shrq %cl, %r13
; FALLBACK0-NEXT:    movq -72(%rsp,%rdi), %rdi
; FALLBACK0-NEXT:    leaq (%rdi,%rdi), %r14
; FALLBACK0-NEXT:    movl %esi, %ecx
; FALLBACK0-NEXT:    shlq %cl, %r14
; FALLBACK0-NEXT:    orq %r13, %r14
; FALLBACK0-NEXT:    movl %eax, %ecx
; FALLBACK0-NEXT:    shrq %cl, %rdi
; FALLBACK0-NEXT:    movq %rdi, 56(%rdx)
; FALLBACK0-NEXT:    movq %r14, 48(%rdx)
; FALLBACK0-NEXT:    movq %rbx, 32(%rdx)
; FALLBACK0-NEXT:    movq %r15, 40(%rdx)
; FALLBACK0-NEXT:    movq %r10, 16(%rdx)
; FALLBACK0-NEXT:    movq %r11, 24(%rdx)
; FALLBACK0-NEXT:    movq %r8, (%rdx)
; FALLBACK0-NEXT:    movq %r9, 8(%rdx)
; FALLBACK0-NEXT:    popq %rbx
; FALLBACK0-NEXT:    popq %r12
; FALLBACK0-NEXT:    popq %r13
; FALLBACK0-NEXT:    popq %r14
; FALLBACK0-NEXT:    popq %r15
; FALLBACK0-NEXT:    retq
;
; FALLBACK1-LABEL: lshr_64bytes:
; FALLBACK1:       # %bb.0:
; FALLBACK1-NEXT:    pushq %r15
; FALLBACK1-NEXT:    pushq %r14
; FALLBACK1-NEXT:    pushq %rbx
; FALLBACK1-NEXT:    movq (%rdi), %rcx
; FALLBACK1-NEXT:    movq 8(%rdi), %r8
; FALLBACK1-NEXT:    movq 16(%rdi), %r9
; FALLBACK1-NEXT:    movq 24(%rdi), %r10
; FALLBACK1-NEXT:    movq 32(%rdi), %r11
; FALLBACK1-NEXT:    movq 40(%rdi), %rbx
; FALLBACK1-NEXT:    movq 48(%rdi), %r14
; FALLBACK1-NEXT:    movq 56(%rdi), %rdi
; FALLBACK1-NEXT:    movl (%rsi), %eax
; FALLBACK1-NEXT:    xorps %xmm0, %xmm0
; FALLBACK1-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; FALLBACK1-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; FALLBACK1-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; FALLBACK1-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; FALLBACK1-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK1-NEXT:    movq %r14, -{{[0-9]+}}(%rsp)
; FALLBACK1-NEXT:    movq %rbx, -{{[0-9]+}}(%rsp)
; FALLBACK1-NEXT:    movq %r11, -{{[0-9]+}}(%rsp)
; FALLBACK1-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
; FALLBACK1-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
; FALLBACK1-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
; FALLBACK1-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
; FALLBACK1-NEXT:    leal (,%rax,8), %ecx
; FALLBACK1-NEXT:    andl $56, %ecx
; FALLBACK1-NEXT:    andl $56, %eax
; FALLBACK1-NEXT:    movq -112(%rsp,%rax), %rdi
; FALLBACK1-NEXT:    movq -128(%rsp,%rax), %rsi
; FALLBACK1-NEXT:    movq -120(%rsp,%rax), %r9
; FALLBACK1-NEXT:    movq %r9, %r8
; FALLBACK1-NEXT:    shrdq %cl, %rdi, %r8
; FALLBACK1-NEXT:    movq -96(%rsp,%rax), %r10
; FALLBACK1-NEXT:    movq -104(%rsp,%rax), %r11
; FALLBACK1-NEXT:    movq %r11, %rbx
; FALLBACK1-NEXT:    shrdq %cl, %r10, %rbx
; FALLBACK1-NEXT:    shrdq %cl, %r11, %rdi
; FALLBACK1-NEXT:    movq -80(%rsp,%rax), %r11
; FALLBACK1-NEXT:    movq -88(%rsp,%rax), %r14
; FALLBACK1-NEXT:    movq %r14, %r15
; FALLBACK1-NEXT:    shrdq %cl, %r11, %r15
; FALLBACK1-NEXT:    shrdq %cl, %r14, %r10
; FALLBACK1-NEXT:    movq -72(%rsp,%rax), %rax
; FALLBACK1-NEXT:    shrdq %cl, %rax, %r11
; FALLBACK1-NEXT:    shrdq %cl, %r9, %rsi
; FALLBACK1-NEXT:    # kill: def $cl killed $cl killed $ecx
; FALLBACK1-NEXT:    shrq %cl, %rax
; FALLBACK1-NEXT:    movq %r11, 48(%rdx)
; FALLBACK1-NEXT:    movq %rax, 56(%rdx)
; FALLBACK1-NEXT:    movq %r10, 32(%rdx)
; FALLBACK1-NEXT:    movq %r15, 40(%rdx)
; FALLBACK1-NEXT:    movq %rdi, 16(%rdx)
; FALLBACK1-NEXT:    movq %rbx, 24(%rdx)
; FALLBACK1-NEXT:    movq %rsi, (%rdx)
; FALLBACK1-NEXT:    movq %r8, 8(%rdx)
; FALLBACK1-NEXT:    popq %rbx
; FALLBACK1-NEXT:    popq %r14
; FALLBACK1-NEXT:    popq %r15
; FALLBACK1-NEXT:    retq
;
; FALLBACK2-LABEL: lshr_64bytes:
; FALLBACK2:       # %bb.0:
; FALLBACK2-NEXT:    pushq %rbp
; FALLBACK2-NEXT:    pushq %r15
; FALLBACK2-NEXT:    pushq %r14
; FALLBACK2-NEXT:    pushq %r13
; FALLBACK2-NEXT:    pushq %r12
; FALLBACK2-NEXT:    pushq %rbx
; FALLBACK2-NEXT:    pushq %rax
; FALLBACK2-NEXT:    movq (%rdi), %rcx
; FALLBACK2-NEXT:    movq 8(%rdi), %r8
; FALLBACK2-NEXT:    movq 16(%rdi), %r9
; FALLBACK2-NEXT:    movq 24(%rdi), %r10
; FALLBACK2-NEXT:    movq 32(%rdi), %r11
; FALLBACK2-NEXT:    movq 40(%rdi), %rbx
; FALLBACK2-NEXT:    movq 48(%rdi), %r14
; FALLBACK2-NEXT:    movq 56(%rdi), %rdi
; FALLBACK2-NEXT:    movl (%rsi), %eax
; FALLBACK2-NEXT:    xorps %xmm0, %xmm0
; FALLBACK2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; FALLBACK2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; FALLBACK2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; FALLBACK2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; FALLBACK2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK2-NEXT:    movq %r14, -{{[0-9]+}}(%rsp)
; FALLBACK2-NEXT:    movq %rbx, -{{[0-9]+}}(%rsp)
; FALLBACK2-NEXT:    movq %r11, -{{[0-9]+}}(%rsp)
; FALLBACK2-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
; FALLBACK2-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
; FALLBACK2-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
; FALLBACK2-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
; FALLBACK2-NEXT:    leal (,%rax,8), %ecx
; FALLBACK2-NEXT:    andl $56, %ecx
; FALLBACK2-NEXT:    andl $56, %eax
; FALLBACK2-NEXT:    movq -120(%rsp,%rax), %rdi
; FALLBACK2-NEXT:    movq -112(%rsp,%rax), %r9
; FALLBACK2-NEXT:    shrxq %rcx, %rdi, %rbx
; FALLBACK2-NEXT:    shrxq %rcx, -128(%rsp,%rax), %r13
; FALLBACK2-NEXT:    movq -104(%rsp,%rax), %rsi
; FALLBACK2-NEXT:    shrxq %rcx, %rsi, %r8
; FALLBACK2-NEXT:    movq -96(%rsp,%rax), %r10
; FALLBACK2-NEXT:    shrxq %rcx, %r9, %r11
; FALLBACK2-NEXT:    movq -88(%rsp,%rax), %r14
; FALLBACK2-NEXT:    shrxq %rcx, %r14, %r15
; FALLBACK2-NEXT:    shrxq %rcx, %r10, %rbp
; FALLBACK2-NEXT:    movl %ecx, %r12d
; FALLBACK2-NEXT:    notb %r12b
; FALLBACK2-NEXT:    addq %r9, %r9
; FALLBACK2-NEXT:    shlxq %r12, %r9, %r9
; FALLBACK2-NEXT:    orq %rbx, %r9
; FALLBACK2-NEXT:    addq %rdi, %rdi
; FALLBACK2-NEXT:    shlxq %r12, %rdi, %rdi
; FALLBACK2-NEXT:    orq %r13, %rdi
; FALLBACK2-NEXT:    movq -80(%rsp,%rax), %rbx
; FALLBACK2-NEXT:    shrxq %rcx, %rbx, %r13
; FALLBACK2-NEXT:    movq -72(%rsp,%rax), %rax
; FALLBACK2-NEXT:    shrxq %rcx, %rax, %rcx
; FALLBACK2-NEXT:    addq %r10, %r10
; FALLBACK2-NEXT:    shlxq %r12, %r10, %r10
; FALLBACK2-NEXT:    orq %r8, %r10
; FALLBACK2-NEXT:    addq %rsi, %rsi
; FALLBACK2-NEXT:    shlxq %r12, %rsi, %rsi
; FALLBACK2-NEXT:    orq %r11, %rsi
; FALLBACK2-NEXT:    leaq (%rbx,%rbx), %r8
; FALLBACK2-NEXT:    shlxq %r12, %r8, %r8
; FALLBACK2-NEXT:    orq %r15, %r8
; FALLBACK2-NEXT:    addq %r14, %r14
; FALLBACK2-NEXT:    shlxq %r12, %r14, %r11
; FALLBACK2-NEXT:    orq %rbp, %r11
; FALLBACK2-NEXT:    addq %rax, %rax
; FALLBACK2-NEXT:    shlxq %r12, %rax, %rax
; FALLBACK2-NEXT:    orq %r13, %rax
; FALLBACK2-NEXT:    movq %rcx, 56(%rdx)
; FALLBACK2-NEXT:    movq %rax, 48(%rdx)
; FALLBACK2-NEXT:    movq %r11, 32(%rdx)
; FALLBACK2-NEXT:    movq %r8, 40(%rdx)
; FALLBACK2-NEXT:    movq %rsi, 16(%rdx)
; FALLBACK2-NEXT:    movq %r10, 24(%rdx)
; FALLBACK2-NEXT:    movq %rdi, (%rdx)
; FALLBACK2-NEXT:    movq %r9, 8(%rdx)
; FALLBACK2-NEXT:    addq $8, %rsp
; FALLBACK2-NEXT:    popq %rbx
; FALLBACK2-NEXT:    popq %r12
; FALLBACK2-NEXT:    popq %r13
; FALLBACK2-NEXT:    popq %r14
; FALLBACK2-NEXT:    popq %r15
; FALLBACK2-NEXT:    popq %rbp
; FALLBACK2-NEXT:    retq
;
; FALLBACK3-LABEL: lshr_64bytes:
; FALLBACK3:       # %bb.0:
; FALLBACK3-NEXT:    pushq %r15
; FALLBACK3-NEXT:    pushq %r14
; FALLBACK3-NEXT:    pushq %rbx
; FALLBACK3-NEXT:    movq (%rdi), %rcx
; FALLBACK3-NEXT:    movq 8(%rdi), %r8
; FALLBACK3-NEXT:    movq 16(%rdi), %r9
; FALLBACK3-NEXT:    movq 24(%rdi), %r10
; FALLBACK3-NEXT:    movq 32(%rdi), %r11
; FALLBACK3-NEXT:    movq 40(%rdi), %rbx
; FALLBACK3-NEXT:    movq 48(%rdi), %r14
; FALLBACK3-NEXT:    movq 56(%rdi), %rdi
; FALLBACK3-NEXT:    movl (%rsi), %eax
; FALLBACK3-NEXT:    xorps %xmm0, %xmm0
; FALLBACK3-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; FALLBACK3-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; FALLBACK3-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; FALLBACK3-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; FALLBACK3-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK3-NEXT:    movq %r14, -{{[0-9]+}}(%rsp)
; FALLBACK3-NEXT:    movq %rbx, -{{[0-9]+}}(%rsp)
; FALLBACK3-NEXT:    movq %r11, -{{[0-9]+}}(%rsp)
; FALLBACK3-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
; FALLBACK3-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
; FALLBACK3-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
; FALLBACK3-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
; FALLBACK3-NEXT:    leal (,%rax,8), %ecx
; FALLBACK3-NEXT:    andl $56, %ecx
; FALLBACK3-NEXT:    andl $56, %eax
; FALLBACK3-NEXT:    movq -112(%rsp,%rax), %rdi
; FALLBACK3-NEXT:    movq -128(%rsp,%rax), %rsi
; FALLBACK3-NEXT:    movq -120(%rsp,%rax), %r9
; FALLBACK3-NEXT:    movq %r9, %r8
; FALLBACK3-NEXT:    shrdq %cl, %rdi, %r8
; FALLBACK3-NEXT:    movq -96(%rsp,%rax), %r10
; FALLBACK3-NEXT:    movq -104(%rsp,%rax), %r11
; FALLBACK3-NEXT:    movq %r11, %rbx
; FALLBACK3-NEXT:    shrdq %cl, %r10, %rbx
; FALLBACK3-NEXT:    shrdq %cl, %r11, %rdi
; FALLBACK3-NEXT:    movq -80(%rsp,%rax), %r11
; FALLBACK3-NEXT:    movq -88(%rsp,%rax), %r14
; FALLBACK3-NEXT:    movq %r14, %r15
; FALLBACK3-NEXT:    shrdq %cl, %r11, %r15
; FALLBACK3-NEXT:    shrdq %cl, %r14, %r10
; FALLBACK3-NEXT:    movq -72(%rsp,%rax), %rax
; FALLBACK3-NEXT:    shrdq %cl, %rax, %r11
; FALLBACK3-NEXT:    shrxq %rcx, %rax, %rax
; FALLBACK3-NEXT:    # kill: def $cl killed $cl killed $rcx
; FALLBACK3-NEXT:    shrdq %cl, %r9, %rsi
; FALLBACK3-NEXT:    movq %r11, 48(%rdx)
; FALLBACK3-NEXT:    movq %r10, 32(%rdx)
; FALLBACK3-NEXT:    movq %r15, 40(%rdx)
; FALLBACK3-NEXT:    movq %rdi, 16(%rdx)
; FALLBACK3-NEXT:    movq %rbx, 24(%rdx)
; FALLBACK3-NEXT:    movq %rsi, (%rdx)
; FALLBACK3-NEXT:    movq %r8, 8(%rdx)
; FALLBACK3-NEXT:    movq %rax, 56(%rdx)
; FALLBACK3-NEXT:    popq %rbx
; FALLBACK3-NEXT:    popq %r14
; FALLBACK3-NEXT:    popq %r15
; FALLBACK3-NEXT:    retq
;
; FALLBACK4-LABEL: lshr_64bytes:
; FALLBACK4:       # %bb.0:
; FALLBACK4-NEXT:    pushq %rbp
; FALLBACK4-NEXT:    pushq %r15
; FALLBACK4-NEXT:    pushq %r14
; FALLBACK4-NEXT:    pushq %r13
; FALLBACK4-NEXT:    pushq %r12
; FALLBACK4-NEXT:    pushq %rbx
; FALLBACK4-NEXT:    pushq %rax
; FALLBACK4-NEXT:    movups (%rdi), %xmm0
; FALLBACK4-NEXT:    movups 16(%rdi), %xmm1
; FALLBACK4-NEXT:    movups 32(%rdi), %xmm2
; FALLBACK4-NEXT:    movups 48(%rdi), %xmm3
; FALLBACK4-NEXT:    movl (%rsi), %r8d
; FALLBACK4-NEXT:    xorps %xmm4, %xmm4
; FALLBACK4-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
; FALLBACK4-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
; FALLBACK4-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
; FALLBACK4-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
; FALLBACK4-NEXT:    movaps %xmm3, -{{[0-9]+}}(%rsp)
; FALLBACK4-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
; FALLBACK4-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
; FALLBACK4-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; FALLBACK4-NEXT:    leal (,%r8,8), %eax
; FALLBACK4-NEXT:    andl $56, %eax
; FALLBACK4-NEXT:    andl $56, %r8d
; FALLBACK4-NEXT:    movq -128(%rsp,%r8), %r10
; FALLBACK4-NEXT:    movq -120(%rsp,%r8), %r9
; FALLBACK4-NEXT:    movl %eax, %ecx
; FALLBACK4-NEXT:    shrq %cl, %r10
; FALLBACK4-NEXT:    movl %eax, %esi
; FALLBACK4-NEXT:    notb %sil
; FALLBACK4-NEXT:    leaq (%r9,%r9), %rdi
; FALLBACK4-NEXT:    movl %esi, %ecx
; FALLBACK4-NEXT:    shlq %cl, %rdi
; FALLBACK4-NEXT:    orq %r10, %rdi
; FALLBACK4-NEXT:    movq -104(%rsp,%r8), %r10
; FALLBACK4-NEXT:    movq %r10, %rbx
; FALLBACK4-NEXT:    movl %eax, %ecx
; FALLBACK4-NEXT:    shrq %cl, %rbx
; FALLBACK4-NEXT:    movq -96(%rsp,%r8), %r12
; FALLBACK4-NEXT:    leaq (%r12,%r12), %r11
; FALLBACK4-NEXT:    movl %esi, %ecx
; FALLBACK4-NEXT:    shlq %cl, %r11
; FALLBACK4-NEXT:    orq %rbx, %r11
; FALLBACK4-NEXT:    movq -112(%rsp,%r8), %rbx
; FALLBACK4-NEXT:    movq %rbx, %r14
; FALLBACK4-NEXT:    movl %eax, %ecx
; FALLBACK4-NEXT:    shrq %cl, %r14
; FALLBACK4-NEXT:    addq %r10, %r10
; FALLBACK4-NEXT:    movl %esi, %ecx
; FALLBACK4-NEXT:    shlq %cl, %r10
; FALLBACK4-NEXT:    orq %r14, %r10
; FALLBACK4-NEXT:    movq -88(%rsp,%r8), %r14
; FALLBACK4-NEXT:    movq %r14, %r13
; FALLBACK4-NEXT:    movl %eax, %ecx
; FALLBACK4-NEXT:    shrq %cl, %r13
; FALLBACK4-NEXT:    movq -80(%rsp,%r8), %rbp
; FALLBACK4-NEXT:    leaq (%rbp,%rbp), %r15
; FALLBACK4-NEXT:    movl %esi, %ecx
; FALLBACK4-NEXT:    shlq %cl, %r15
; FALLBACK4-NEXT:    orq %r13, %r15
; FALLBACK4-NEXT:    movl %eax, %ecx
; FALLBACK4-NEXT:    shrq %cl, %r12
; FALLBACK4-NEXT:    addq %r14, %r14
; FALLBACK4-NEXT:    movl %esi, %ecx
; FALLBACK4-NEXT:    shlq %cl, %r14
; FALLBACK4-NEXT:    orq %r12, %r14
; FALLBACK4-NEXT:    movl %eax, %ecx
; FALLBACK4-NEXT:    shrq %cl, %rbp
; FALLBACK4-NEXT:    movq -72(%rsp,%r8), %r8
; FALLBACK4-NEXT:    leaq (%r8,%r8), %r12
; FALLBACK4-NEXT:    movl %esi, %ecx
; FALLBACK4-NEXT:    shlq %cl, %r12
; FALLBACK4-NEXT:    orq %rbp, %r12
; FALLBACK4-NEXT:    movl %eax, %ecx
; FALLBACK4-NEXT:    shrq %cl, %r9
; FALLBACK4-NEXT:    addq %rbx, %rbx
; FALLBACK4-NEXT:    movl %esi, %ecx
; FALLBACK4-NEXT:    shlq %cl, %rbx
; FALLBACK4-NEXT:    orq %r9, %rbx
; FALLBACK4-NEXT:    movl %eax, %ecx
; FALLBACK4-NEXT:    shrq %cl, %r8
; FALLBACK4-NEXT:    movq %r8, 56(%rdx)
; FALLBACK4-NEXT:    movq %rbx, 8(%rdx)
; FALLBACK4-NEXT:    movq %r12, 48(%rdx)
; FALLBACK4-NEXT:    movq %r14, 32(%rdx)
; FALLBACK4-NEXT:    movq %r15, 40(%rdx)
; FALLBACK4-NEXT:    movq %r10, 16(%rdx)
; FALLBACK4-NEXT:    movq %r11, 24(%rdx)
; FALLBACK4-NEXT:    movq %rdi, (%rdx)
; FALLBACK4-NEXT:    addq $8, %rsp
; FALLBACK4-NEXT:    popq %rbx
; FALLBACK4-NEXT:    popq %r12
; FALLBACK4-NEXT:    popq %r13
; FALLBACK4-NEXT:    popq %r14
; FALLBACK4-NEXT:    popq %r15
; FALLBACK4-NEXT:    popq %rbp
; FALLBACK4-NEXT:    retq
;
; FALLBACK5-LABEL: lshr_64bytes:
; FALLBACK5:       # %bb.0:
; FALLBACK5-NEXT:    pushq %r15
; FALLBACK5-NEXT:    pushq %r14
; FALLBACK5-NEXT:    pushq %rbx
; FALLBACK5-NEXT:    movups (%rdi), %xmm0
; FALLBACK5-NEXT:    movups 16(%rdi), %xmm1
; FALLBACK5-NEXT:    movups 32(%rdi), %xmm2
; FALLBACK5-NEXT:    movups 48(%rdi), %xmm3
; FALLBACK5-NEXT:    movl (%rsi), %eax
; FALLBACK5-NEXT:    xorps %xmm4, %xmm4
; FALLBACK5-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
; FALLBACK5-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
; FALLBACK5-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
; FALLBACK5-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
; FALLBACK5-NEXT:    movaps %xmm3, -{{[0-9]+}}(%rsp)
; FALLBACK5-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
; FALLBACK5-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
; FALLBACK5-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; FALLBACK5-NEXT:    leal (,%rax,8), %ecx
; FALLBACK5-NEXT:    andl $56, %ecx
; FALLBACK5-NEXT:    andl $56, %eax
; FALLBACK5-NEXT:    movq -96(%rsp,%rax), %rdi
; FALLBACK5-NEXT:    movq -104(%rsp,%rax), %r9
; FALLBACK5-NEXT:    movq %r9, %rsi
; FALLBACK5-NEXT:    shrdq %cl, %rdi, %rsi
; FALLBACK5-NEXT:    movq -112(%rsp,%rax), %r10
; FALLBACK5-NEXT:    movq %r10, %r8
; FALLBACK5-NEXT:    shrdq %cl, %r9, %r8
; FALLBACK5-NEXT:    movq -80(%rsp,%rax), %r9
; FALLBACK5-NEXT:    movq -88(%rsp,%rax), %r11
; FALLBACK5-NEXT:    movq %r11, %rbx
; FALLBACK5-NEXT:    shrdq %cl, %r9, %rbx
; FALLBACK5-NEXT:    shrdq %cl, %r11, %rdi
; FALLBACK5-NEXT:    movq -72(%rsp,%rax), %r11
; FALLBACK5-NEXT:    shrdq %cl, %r11, %r9
; FALLBACK5-NEXT:    movq -128(%rsp,%rax), %r14
; FALLBACK5-NEXT:    movq -120(%rsp,%rax), %rax
; FALLBACK5-NEXT:    movq %rax, %r15
; FALLBACK5-NEXT:    shrdq %cl, %r10, %r15
; FALLBACK5-NEXT:    shrdq %cl, %rax, %r14
; FALLBACK5-NEXT:    # kill: def $cl killed $cl killed $ecx
; FALLBACK5-NEXT:    shrq %cl, %r11
; FALLBACK5-NEXT:    movq %r15, 8(%rdx)
; FALLBACK5-NEXT:    movq %r9, 48(%rdx)
; FALLBACK5-NEXT:    movq %r11, 56(%rdx)
; FALLBACK5-NEXT:    movq %rdi, 32(%rdx)
; FALLBACK5-NEXT:    movq %rbx, 40(%rdx)
; FALLBACK5-NEXT:    movq %r8, 16(%rdx)
; FALLBACK5-NEXT:    movq %rsi, 24(%rdx)
; FALLBACK5-NEXT:    movq %r14, (%rdx)
; FALLBACK5-NEXT:    popq %rbx
; FALLBACK5-NEXT:    popq %r14
; FALLBACK5-NEXT:    popq %r15
; FALLBACK5-NEXT:    retq
;
; FALLBACK6-LABEL: lshr_64bytes:
; FALLBACK6:       # %bb.0:
; FALLBACK6-NEXT:    pushq %rbp
; FALLBACK6-NEXT:    pushq %r15
; FALLBACK6-NEXT:    pushq %r14
; FALLBACK6-NEXT:    pushq %r13
; FALLBACK6-NEXT:    pushq %r12
; FALLBACK6-NEXT:    pushq %rbx
; FALLBACK6-NEXT:    pushq %rax
; FALLBACK6-NEXT:    movups (%rdi), %xmm0
; FALLBACK6-NEXT:    movups 16(%rdi), %xmm1
; FALLBACK6-NEXT:    movups 32(%rdi), %xmm2
; FALLBACK6-NEXT:    movups 48(%rdi), %xmm3
; FALLBACK6-NEXT:    movl (%rsi), %eax
; FALLBACK6-NEXT:    xorps %xmm4, %xmm4
; FALLBACK6-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
; FALLBACK6-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
; FALLBACK6-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
; FALLBACK6-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
; FALLBACK6-NEXT:    movaps %xmm3, -{{[0-9]+}}(%rsp)
; FALLBACK6-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
; FALLBACK6-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
; FALLBACK6-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; FALLBACK6-NEXT:    leal (,%rax,8), %esi
; FALLBACK6-NEXT:    andl $56, %esi
; FALLBACK6-NEXT:    andl $56, %eax
; FALLBACK6-NEXT:    shrxq %rsi, -128(%rsp,%rax), %r11
; FALLBACK6-NEXT:    movq -112(%rsp,%rax), %rcx
; FALLBACK6-NEXT:    movq -104(%rsp,%rax), %rdi
; FALLBACK6-NEXT:    shrxq %rsi, %rdi, %r12
; FALLBACK6-NEXT:    movq -96(%rsp,%rax), %r13
; FALLBACK6-NEXT:    shrxq %rsi, %rcx, %r9
; FALLBACK6-NEXT:    movq -88(%rsp,%rax), %r10
; FALLBACK6-NEXT:    shrxq %rsi, %r10, %r14
; FALLBACK6-NEXT:    shrxq %rsi, %r13, %r15
; FALLBACK6-NEXT:    movl %esi, %ebx
; FALLBACK6-NEXT:    notb %bl
; FALLBACK6-NEXT:    movq -120(%rsp,%rax), %rbp
; FALLBACK6-NEXT:    leaq (%rbp,%rbp), %r8
; FALLBACK6-NEXT:    shlxq %rbx, %r8, %r8
; FALLBACK6-NEXT:    orq %r11, %r8
; FALLBACK6-NEXT:    leaq (%r13,%r13), %r11
; FALLBACK6-NEXT:    shlxq %rbx, %r11, %r11
; FALLBACK6-NEXT:    orq %r12, %r11
; FALLBACK6-NEXT:    movq -80(%rsp,%rax), %r12
; FALLBACK6-NEXT:    shrxq %rsi, %r12, %r13
; FALLBACK6-NEXT:    shrxq %rsi, %rbp, %rbp
; FALLBACK6-NEXT:    movq -72(%rsp,%rax), %rax
; FALLBACK6-NEXT:    shrxq %rsi, %rax, %rsi
; FALLBACK6-NEXT:    addq %rdi, %rdi
; FALLBACK6-NEXT:    shlxq %rbx, %rdi, %rdi
; FALLBACK6-NEXT:    orq %r9, %rdi
; FALLBACK6-NEXT:    leaq (%r12,%r12), %r9
; FALLBACK6-NEXT:    shlxq %rbx, %r9, %r9
; FALLBACK6-NEXT:    orq %r14, %r9
; FALLBACK6-NEXT:    addq %r10, %r10
; FALLBACK6-NEXT:    shlxq %rbx, %r10, %r10
; FALLBACK6-NEXT:    orq %r15, %r10
; FALLBACK6-NEXT:    addq %rax, %rax
; FALLBACK6-NEXT:    shlxq %rbx, %rax, %rax
; FALLBACK6-NEXT:    orq %r13, %rax
; FALLBACK6-NEXT:    addq %rcx, %rcx
; FALLBACK6-NEXT:    shlxq %rbx, %rcx, %rcx
; FALLBACK6-NEXT:    orq %rbp, %rcx
; FALLBACK6-NEXT:    movq %rsi, 56(%rdx)
; FALLBACK6-NEXT:    movq %rcx, 8(%rdx)
; FALLBACK6-NEXT:    movq %rax, 48(%rdx)
; FALLBACK6-NEXT:    movq %r10, 32(%rdx)
; FALLBACK6-NEXT:    movq %r9, 40(%rdx)
; FALLBACK6-NEXT:    movq %rdi, 16(%rdx)
; FALLBACK6-NEXT:    movq %r11, 24(%rdx)
; FALLBACK6-NEXT:    movq %r8, (%rdx)
; FALLBACK6-NEXT:    addq $8, %rsp
; FALLBACK6-NEXT:    popq %rbx
; FALLBACK6-NEXT:    popq %r12
; FALLBACK6-NEXT:    popq %r13
; FALLBACK6-NEXT:    popq %r14
; FALLBACK6-NEXT:    popq %r15
; FALLBACK6-NEXT:    popq %rbp
; FALLBACK6-NEXT:    retq
;
; FALLBACK7-LABEL: lshr_64bytes:
; FALLBACK7:       # %bb.0:
; FALLBACK7-NEXT:    pushq %r15
; FALLBACK7-NEXT:    pushq %r14
; FALLBACK7-NEXT:    pushq %rbx
; FALLBACK7-NEXT:    movups (%rdi), %xmm0
; FALLBACK7-NEXT:    movups 16(%rdi), %xmm1
; FALLBACK7-NEXT:    movups 32(%rdi), %xmm2
; FALLBACK7-NEXT:    movups 48(%rdi), %xmm3
; FALLBACK7-NEXT:    movl (%rsi), %eax
; FALLBACK7-NEXT:    xorps %xmm4, %xmm4
; FALLBACK7-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
; FALLBACK7-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
; FALLBACK7-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
; FALLBACK7-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
; FALLBACK7-NEXT:    movaps %xmm3, -{{[0-9]+}}(%rsp)
; FALLBACK7-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
; FALLBACK7-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
; FALLBACK7-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; FALLBACK7-NEXT:    leal (,%rax,8), %ecx
; FALLBACK7-NEXT:    andl $56, %ecx
; FALLBACK7-NEXT:    andl $56, %eax
; FALLBACK7-NEXT:    movq -96(%rsp,%rax), %rdi
; FALLBACK7-NEXT:    movq -104(%rsp,%rax), %r9
; FALLBACK7-NEXT:    movq %r9, %rsi
; FALLBACK7-NEXT:    shrdq %cl, %rdi, %rsi
; FALLBACK7-NEXT:    movq -112(%rsp,%rax), %r10
; FALLBACK7-NEXT:    movq %r10, %r8
; FALLBACK7-NEXT:    shrdq %cl, %r9, %r8
; FALLBACK7-NEXT:    movq -80(%rsp,%rax), %r9
; FALLBACK7-NEXT:    movq -88(%rsp,%rax), %r11
; FALLBACK7-NEXT:    movq %r11, %rbx
; FALLBACK7-NEXT:    shrdq %cl, %r9, %rbx
; FALLBACK7-NEXT:    shrdq %cl, %r11, %rdi
; FALLBACK7-NEXT:    movq -72(%rsp,%rax), %r11
; FALLBACK7-NEXT:    shrdq %cl, %r11, %r9
; FALLBACK7-NEXT:    movq -128(%rsp,%rax), %r14
; FALLBACK7-NEXT:    movq -120(%rsp,%rax), %rax
; FALLBACK7-NEXT:    movq %rax, %r15
; FALLBACK7-NEXT:    shrdq %cl, %r10, %r15
; FALLBACK7-NEXT:    shrxq %rcx, %r11, %r10
; FALLBACK7-NEXT:    # kill: def $cl killed $cl killed $rcx
; FALLBACK7-NEXT:    shrdq %cl, %rax, %r14
; FALLBACK7-NEXT:    movq %r15, 8(%rdx)
; FALLBACK7-NEXT:    movq %r9, 48(%rdx)
; FALLBACK7-NEXT:    movq %rdi, 32(%rdx)
; FALLBACK7-NEXT:    movq %rbx, 40(%rdx)
; FALLBACK7-NEXT:    movq %r8, 16(%rdx)
; FALLBACK7-NEXT:    movq %rsi, 24(%rdx)
; FALLBACK7-NEXT:    movq %r14, (%rdx)
; FALLBACK7-NEXT:    movq %r10, 56(%rdx)
; FALLBACK7-NEXT:    popq %rbx
; FALLBACK7-NEXT:    popq %r14
; FALLBACK7-NEXT:    popq %r15
; FALLBACK7-NEXT:    retq
;
; FALLBACK8-LABEL: lshr_64bytes:
; FALLBACK8:       # %bb.0:
; FALLBACK8-NEXT:    pushq %rbp
; FALLBACK8-NEXT:    pushq %r15
; FALLBACK8-NEXT:    pushq %r14
; FALLBACK8-NEXT:    pushq %r13
; FALLBACK8-NEXT:    pushq %r12
; FALLBACK8-NEXT:    pushq %rbx
; FALLBACK8-NEXT:    pushq %rax
; FALLBACK8-NEXT:    vmovups (%rdi), %ymm0
; FALLBACK8-NEXT:    vmovups 32(%rdi), %ymm1
; FALLBACK8-NEXT:    movl (%rsi), %r9d
; FALLBACK8-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; FALLBACK8-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
; FALLBACK8-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
; FALLBACK8-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
; FALLBACK8-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
; FALLBACK8-NEXT:    leal (,%r9,8), %eax
; FALLBACK8-NEXT:    andl $56, %eax
; FALLBACK8-NEXT:    andl $56, %r9d
; FALLBACK8-NEXT:    movq -128(%rsp,%r9), %r10
; FALLBACK8-NEXT:    movq -120(%rsp,%r9), %r8
; FALLBACK8-NEXT:    movl %eax, %ecx
; FALLBACK8-NEXT:    shrq %cl, %r10
; FALLBACK8-NEXT:    movl %eax, %esi
; FALLBACK8-NEXT:    notb %sil
; FALLBACK8-NEXT:    leaq (%r8,%r8), %rdi
; FALLBACK8-NEXT:    movl %esi, %ecx
; FALLBACK8-NEXT:    shlq %cl, %rdi
; FALLBACK8-NEXT:    orq %r10, %rdi
; FALLBACK8-NEXT:    movq -104(%rsp,%r9), %r10
; FALLBACK8-NEXT:    movq %r10, %rbx
; FALLBACK8-NEXT:    movl %eax, %ecx
; FALLBACK8-NEXT:    shrq %cl, %rbx
; FALLBACK8-NEXT:    movq -96(%rsp,%r9), %r12
; FALLBACK8-NEXT:    leaq (%r12,%r12), %r11
; FALLBACK8-NEXT:    movl %esi, %ecx
; FALLBACK8-NEXT:    shlq %cl, %r11
; FALLBACK8-NEXT:    orq %rbx, %r11
; FALLBACK8-NEXT:    movq -112(%rsp,%r9), %rbx
; FALLBACK8-NEXT:    movq %rbx, %r14
; FALLBACK8-NEXT:    movl %eax, %ecx
; FALLBACK8-NEXT:    shrq %cl, %r14
; FALLBACK8-NEXT:    addq %r10, %r10
; FALLBACK8-NEXT:    movl %esi, %ecx
; FALLBACK8-NEXT:    shlq %cl, %r10
; FALLBACK8-NEXT:    orq %r14, %r10
; FALLBACK8-NEXT:    movq -88(%rsp,%r9), %r14
; FALLBACK8-NEXT:    movq %r14, %r13
; FALLBACK8-NEXT:    movl %eax, %ecx
; FALLBACK8-NEXT:    shrq %cl, %r13
; FALLBACK8-NEXT:    movq -80(%rsp,%r9), %rbp
; FALLBACK8-NEXT:    leaq (%rbp,%rbp), %r15
; FALLBACK8-NEXT:    movl %esi, %ecx
; FALLBACK8-NEXT:    shlq %cl, %r15
; FALLBACK8-NEXT:    orq %r13, %r15
; FALLBACK8-NEXT:    movl %eax, %ecx
; FALLBACK8-NEXT:    shrq %cl, %r12
; FALLBACK8-NEXT:    addq %r14, %r14
; FALLBACK8-NEXT:    movl %esi, %ecx
; FALLBACK8-NEXT:    shlq %cl, %r14
; FALLBACK8-NEXT:    orq %r12, %r14
; FALLBACK8-NEXT:    movl %eax, %ecx
; FALLBACK8-NEXT:    shrq %cl, %rbp
; FALLBACK8-NEXT:    movq -72(%rsp,%r9), %r9
; FALLBACK8-NEXT:    leaq (%r9,%r9), %r12
; FALLBACK8-NEXT:    movl %esi, %ecx
; FALLBACK8-NEXT:    shlq %cl, %r12
; FALLBACK8-NEXT:    orq %rbp, %r12
; FALLBACK8-NEXT:    movl %eax, %ecx
; FALLBACK8-NEXT:    shrq %cl, %r8
; FALLBACK8-NEXT:    addq %rbx, %rbx
; FALLBACK8-NEXT:    movl %esi, %ecx
; FALLBACK8-NEXT:    shlq %cl, %rbx
; FALLBACK8-NEXT:    orq %r8, %rbx
; FALLBACK8-NEXT:    movl %eax, %ecx
; FALLBACK8-NEXT:    shrq %cl, %r9
; FALLBACK8-NEXT:    movq %r9, 56(%rdx)
; FALLBACK8-NEXT:    movq %rbx, 8(%rdx)
; FALLBACK8-NEXT:    movq %r12, 48(%rdx)
; FALLBACK8-NEXT:    movq %r14, 32(%rdx)
; FALLBACK8-NEXT:    movq %r15, 40(%rdx)
; FALLBACK8-NEXT:    movq %r10, 16(%rdx)
; FALLBACK8-NEXT:    movq %r11, 24(%rdx)
; FALLBACK8-NEXT:    movq %rdi, (%rdx)
; FALLBACK8-NEXT:    addq $8, %rsp
; FALLBACK8-NEXT:    popq %rbx
; FALLBACK8-NEXT:    popq %r12
; FALLBACK8-NEXT:    popq %r13
; FALLBACK8-NEXT:    popq %r14
; FALLBACK8-NEXT:    popq %r15
; FALLBACK8-NEXT:    popq %rbp
; FALLBACK8-NEXT:    vzeroupper
; FALLBACK8-NEXT:    retq
;
; FALLBACK9-LABEL: lshr_64bytes:
; FALLBACK9:       # %bb.0:
; FALLBACK9-NEXT:    pushq %r15
; FALLBACK9-NEXT:    pushq %r14
; FALLBACK9-NEXT:    pushq %rbx
; FALLBACK9-NEXT:    vmovups (%rdi), %ymm0
; FALLBACK9-NEXT:    vmovups 32(%rdi), %ymm1
; FALLBACK9-NEXT:    movl (%rsi), %eax
; FALLBACK9-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; FALLBACK9-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
; FALLBACK9-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
; FALLBACK9-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
; FALLBACK9-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
; FALLBACK9-NEXT:    leal (,%rax,8), %ecx
; FALLBACK9-NEXT:    andl $56, %ecx
; FALLBACK9-NEXT:    andl $56, %eax
; FALLBACK9-NEXT:    movq -96(%rsp,%rax), %rdi
; FALLBACK9-NEXT:    movq -104(%rsp,%rax), %r9
; FALLBACK9-NEXT:    movq %r9, %rsi
; FALLBACK9-NEXT:    shrdq %cl, %rdi, %rsi
; FALLBACK9-NEXT:    movq -112(%rsp,%rax), %r10
; FALLBACK9-NEXT:    movq %r10, %r8
; FALLBACK9-NEXT:    shrdq %cl, %r9, %r8
; FALLBACK9-NEXT:    movq -80(%rsp,%rax), %r9
; FALLBACK9-NEXT:    movq -88(%rsp,%rax), %r11
; FALLBACK9-NEXT:    movq %r11, %rbx
; FALLBACK9-NEXT:    shrdq %cl, %r9, %rbx
; FALLBACK9-NEXT:    shrdq %cl, %r11, %rdi
; FALLBACK9-NEXT:    movq -72(%rsp,%rax), %r11
; FALLBACK9-NEXT:    shrdq %cl, %r11, %r9
; FALLBACK9-NEXT:    movq -128(%rsp,%rax), %r14
; FALLBACK9-NEXT:    movq -120(%rsp,%rax), %rax
; FALLBACK9-NEXT:    movq %rax, %r15
; FALLBACK9-NEXT:    shrdq %cl, %r10, %r15
; FALLBACK9-NEXT:    shrdq %cl, %rax, %r14
; FALLBACK9-NEXT:    # kill: def $cl killed $cl killed $ecx
; FALLBACK9-NEXT:    shrq %cl, %r11
; FALLBACK9-NEXT:    movq %r15, 8(%rdx)
; FALLBACK9-NEXT:    movq %r9, 48(%rdx)
; FALLBACK9-NEXT:    movq %r11, 56(%rdx)
; FALLBACK9-NEXT:    movq %rdi, 32(%rdx)
; FALLBACK9-NEXT:    movq %rbx, 40(%rdx)
; FALLBACK9-NEXT:    movq %r8, 16(%rdx)
; FALLBACK9-NEXT:    movq %rsi, 24(%rdx)
; FALLBACK9-NEXT:    movq %r14, (%rdx)
; FALLBACK9-NEXT:    popq %rbx
; FALLBACK9-NEXT:    popq %r14
; FALLBACK9-NEXT:    popq %r15
; FALLBACK9-NEXT:    vzeroupper
; FALLBACK9-NEXT:    retq
;
; FALLBACK10-LABEL: lshr_64bytes:
; FALLBACK10:       # %bb.0:
; FALLBACK10-NEXT:    pushq %rbp
; FALLBACK10-NEXT:    pushq %r15
; FALLBACK10-NEXT:    pushq %r14
; FALLBACK10-NEXT:    pushq %r13
; FALLBACK10-NEXT:    pushq %r12
; FALLBACK10-NEXT:    pushq %rbx
; FALLBACK10-NEXT:    pushq %rax
; FALLBACK10-NEXT:    vmovups (%rdi), %ymm0
; FALLBACK10-NEXT:    vmovups 32(%rdi), %ymm1
; FALLBACK10-NEXT:    movl (%rsi), %eax
; FALLBACK10-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; FALLBACK10-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
; FALLBACK10-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
; FALLBACK10-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
; FALLBACK10-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
; FALLBACK10-NEXT:    leal (,%rax,8), %esi
; FALLBACK10-NEXT:    andl $56, %esi
; FALLBACK10-NEXT:    andl $56, %eax
; FALLBACK10-NEXT:    shrxq %rsi, -128(%rsp,%rax), %r11
; FALLBACK10-NEXT:    movq -112(%rsp,%rax), %rcx
; FALLBACK10-NEXT:    movq -104(%rsp,%rax), %rdi
; FALLBACK10-NEXT:    shrxq %rsi, %rdi, %r12
; FALLBACK10-NEXT:    movq -96(%rsp,%rax), %r13
; FALLBACK10-NEXT:    shrxq %rsi, %rcx, %r9
; FALLBACK10-NEXT:    movq -88(%rsp,%rax), %r10
; FALLBACK10-NEXT:    shrxq %rsi, %r10, %r14
; FALLBACK10-NEXT:    shrxq %rsi, %r13, %r15
; FALLBACK10-NEXT:    movl %esi, %ebx
; FALLBACK10-NEXT:    notb %bl
; FALLBACK10-NEXT:    movq -120(%rsp,%rax), %rbp
; FALLBACK10-NEXT:    leaq (%rbp,%rbp), %r8
; FALLBACK10-NEXT:    shlxq %rbx, %r8, %r8
; FALLBACK10-NEXT:    orq %r11, %r8
; FALLBACK10-NEXT:    leaq (%r13,%r13), %r11
; FALLBACK10-NEXT:    shlxq %rbx, %r11, %r11
; FALLBACK10-NEXT:    orq %r12, %r11
; FALLBACK10-NEXT:    movq -80(%rsp,%rax), %r12
; FALLBACK10-NEXT:    shrxq %rsi, %r12, %r13
; FALLBACK10-NEXT:    shrxq %rsi, %rbp, %rbp
; FALLBACK10-NEXT:    movq -72(%rsp,%rax), %rax
; FALLBACK10-NEXT:    shrxq %rsi, %rax, %rsi
; FALLBACK10-NEXT:    addq %rdi, %rdi
; FALLBACK10-NEXT:    shlxq %rbx, %rdi, %rdi
; FALLBACK10-NEXT:    orq %r9, %rdi
; FALLBACK10-NEXT:    leaq (%r12,%r12), %r9
; FALLBACK10-NEXT:    shlxq %rbx, %r9, %r9
; FALLBACK10-NEXT:    orq %r14, %r9
; FALLBACK10-NEXT:    addq %r10, %r10
; FALLBACK10-NEXT:    shlxq %rbx, %r10, %r10
; FALLBACK10-NEXT:    orq %r15, %r10
; FALLBACK10-NEXT:    addq %rax, %rax
; FALLBACK10-NEXT:    shlxq %rbx, %rax, %rax
; FALLBACK10-NEXT:    orq %r13, %rax
; FALLBACK10-NEXT:    addq %rcx, %rcx
; FALLBACK10-NEXT:    shlxq %rbx, %rcx, %rcx
; FALLBACK10-NEXT:    orq %rbp, %rcx
; FALLBACK10-NEXT:    movq %rsi, 56(%rdx)
; FALLBACK10-NEXT:    movq %rcx, 8(%rdx)
; FALLBACK10-NEXT:    movq %rax, 48(%rdx)
; FALLBACK10-NEXT:    movq %r10, 32(%rdx)
; FALLBACK10-NEXT:    movq %r9, 40(%rdx)
; FALLBACK10-NEXT:    movq %rdi, 16(%rdx)
; FALLBACK10-NEXT:    movq %r11, 24(%rdx)
; FALLBACK10-NEXT:    movq %r8, (%rdx)
; FALLBACK10-NEXT:    addq $8, %rsp
; FALLBACK10-NEXT:    popq %rbx
; FALLBACK10-NEXT:    popq %r12
; FALLBACK10-NEXT:    popq %r13
; FALLBACK10-NEXT:    popq %r14
; FALLBACK10-NEXT:    popq %r15
; FALLBACK10-NEXT:    popq %rbp
; FALLBACK10-NEXT:    vzeroupper
; FALLBACK10-NEXT:    retq
;
; FALLBACK11-LABEL: lshr_64bytes:
; FALLBACK11:       # %bb.0:
; FALLBACK11-NEXT:    pushq %r15
; FALLBACK11-NEXT:    pushq %r14
; FALLBACK11-NEXT:    pushq %rbx
; FALLBACK11-NEXT:    vmovups (%rdi), %ymm0
; FALLBACK11-NEXT:    vmovups 32(%rdi), %ymm1
; FALLBACK11-NEXT:    movl (%rsi), %eax
; FALLBACK11-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; FALLBACK11-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
; FALLBACK11-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
; FALLBACK11-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
; FALLBACK11-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
; FALLBACK11-NEXT:    leal (,%rax,8), %ecx
; FALLBACK11-NEXT:    andl $56, %ecx
; FALLBACK11-NEXT:    andl $56, %eax
; FALLBACK11-NEXT:    movq -96(%rsp,%rax), %rdi
; FALLBACK11-NEXT:    movq -104(%rsp,%rax), %r9
; FALLBACK11-NEXT:    movq %r9, %rsi
; FALLBACK11-NEXT:    shrdq %cl, %rdi, %rsi
; FALLBACK11-NEXT:    movq -112(%rsp,%rax), %r10
; FALLBACK11-NEXT:    movq %r10, %r8
; FALLBACK11-NEXT:    shrdq %cl, %r9, %r8
; FALLBACK11-NEXT:    movq -80(%rsp,%rax), %r9
; FALLBACK11-NEXT:    movq -88(%rsp,%rax), %r11
; FALLBACK11-NEXT:    movq %r11, %rbx
; FALLBACK11-NEXT:    shrdq %cl, %r9, %rbx
; FALLBACK11-NEXT:    shrdq %cl, %r11, %rdi
; FALLBACK11-NEXT:    movq -72(%rsp,%rax), %r11
; FALLBACK11-NEXT:    shrdq %cl, %r11, %r9
; FALLBACK11-NEXT:    movq -128(%rsp,%rax), %r14
; FALLBACK11-NEXT:    movq -120(%rsp,%rax), %rax
; FALLBACK11-NEXT:    movq %rax, %r15
; FALLBACK11-NEXT:    shrdq %cl, %r10, %r15
; FALLBACK11-NEXT:    shrxq %rcx, %r11, %r10
; FALLBACK11-NEXT:    # kill: def $cl killed $cl killed $rcx
; FALLBACK11-NEXT:    shrdq %cl, %rax, %r14
; FALLBACK11-NEXT:    movq %r15, 8(%rdx)
; FALLBACK11-NEXT:    movq %r9, 48(%rdx)
; FALLBACK11-NEXT:    movq %rdi, 32(%rdx)
; FALLBACK11-NEXT:    movq %rbx, 40(%rdx)
; FALLBACK11-NEXT:    movq %r8, 16(%rdx)
; FALLBACK11-NEXT:    movq %rsi, 24(%rdx)
; FALLBACK11-NEXT:    movq %r14, (%rdx)
; FALLBACK11-NEXT:    movq %r10, 56(%rdx)
; FALLBACK11-NEXT:    popq %rbx
; FALLBACK11-NEXT:    popq %r14
; FALLBACK11-NEXT:    popq %r15
; FALLBACK11-NEXT:    vzeroupper
; FALLBACK11-NEXT:    retq
;
; FALLBACK12-LABEL: lshr_64bytes:
; FALLBACK12:       # %bb.0:
; FALLBACK12-NEXT:    pushq %rbp
; FALLBACK12-NEXT:    pushq %r15
; FALLBACK12-NEXT:    pushq %r14
; FALLBACK12-NEXT:    pushq %r13
; FALLBACK12-NEXT:    pushq %r12
; FALLBACK12-NEXT:    pushq %rbx
; FALLBACK12-NEXT:    pushq %rax
; FALLBACK12-NEXT:    vmovups (%rdi), %zmm0
; FALLBACK12-NEXT:    movl (%rsi), %r9d
; FALLBACK12-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; FALLBACK12-NEXT:    vmovups %zmm1, -{{[0-9]+}}(%rsp)
; FALLBACK12-NEXT:    vmovups %zmm0, -{{[0-9]+}}(%rsp)
; FALLBACK12-NEXT:    leal (,%r9,8), %eax
; FALLBACK12-NEXT:    andl $56, %eax
; FALLBACK12-NEXT:    andl $56, %r9d
; FALLBACK12-NEXT:    movq -128(%rsp,%r9), %r10
; FALLBACK12-NEXT:    movq -120(%rsp,%r9), %r8
; FALLBACK12-NEXT:    movl %eax, %ecx
; FALLBACK12-NEXT:    shrq %cl, %r10
; FALLBACK12-NEXT:    movl %eax, %esi
; FALLBACK12-NEXT:    notb %sil
; FALLBACK12-NEXT:    leaq (%r8,%r8), %rdi
; FALLBACK12-NEXT:    movl %esi, %ecx
; FALLBACK12-NEXT:    shlq %cl, %rdi
; FALLBACK12-NEXT:    orq %r10, %rdi
; FALLBACK12-NEXT:    movq -104(%rsp,%r9), %r10
; FALLBACK12-NEXT:    movq %r10, %rbx
; FALLBACK12-NEXT:    movl %eax, %ecx
; FALLBACK12-NEXT:    shrq %cl, %rbx
; FALLBACK12-NEXT:    movq -96(%rsp,%r9), %r12
; FALLBACK12-NEXT:    leaq (%r12,%r12), %r11
; FALLBACK12-NEXT:    movl %esi, %ecx
; FALLBACK12-NEXT:    shlq %cl, %r11
; FALLBACK12-NEXT:    orq %rbx, %r11
; FALLBACK12-NEXT:    movq -112(%rsp,%r9), %rbx
; FALLBACK12-NEXT:    movq %rbx, %r14
; FALLBACK12-NEXT:    movl %eax, %ecx
; FALLBACK12-NEXT:    shrq %cl, %r14
; FALLBACK12-NEXT:    addq %r10, %r10
; FALLBACK12-NEXT:    movl %esi, %ecx
; FALLBACK12-NEXT:    shlq %cl, %r10
; FALLBACK12-NEXT:    orq %r14, %r10
; FALLBACK12-NEXT:    movq -88(%rsp,%r9), %r14
; FALLBACK12-NEXT:    movq %r14, %r13
; FALLBACK12-NEXT:    movl %eax, %ecx
; FALLBACK12-NEXT:    shrq %cl, %r13
; FALLBACK12-NEXT:    movq -80(%rsp,%r9), %rbp
; FALLBACK12-NEXT:    leaq (%rbp,%rbp), %r15
; FALLBACK12-NEXT:    movl %esi, %ecx
; FALLBACK12-NEXT:    shlq %cl, %r15
; FALLBACK12-NEXT:    orq %r13, %r15
; FALLBACK12-NEXT:    movl %eax, %ecx
; FALLBACK12-NEXT:    shrq %cl, %r12
; FALLBACK12-NEXT:    addq %r14, %r14
; FALLBACK12-NEXT:    movl %esi, %ecx
; FALLBACK12-NEXT:    shlq %cl, %r14
; FALLBACK12-NEXT:    orq %r12, %r14
; FALLBACK12-NEXT:    movl %eax, %ecx
; FALLBACK12-NEXT:    shrq %cl, %rbp
; FALLBACK12-NEXT:    movq -72(%rsp,%r9), %r9
; FALLBACK12-NEXT:    leaq (%r9,%r9), %r12
; FALLBACK12-NEXT:    movl %esi, %ecx
; FALLBACK12-NEXT:    shlq %cl, %r12
; FALLBACK12-NEXT:    orq %rbp, %r12
; FALLBACK12-NEXT:    movl %eax, %ecx
; FALLBACK12-NEXT:    shrq %cl, %r8
; FALLBACK12-NEXT:    addq %rbx, %rbx
; FALLBACK12-NEXT:    movl %esi, %ecx
; FALLBACK12-NEXT:    shlq %cl, %rbx
; FALLBACK12-NEXT:    orq %r8, %rbx
; FALLBACK12-NEXT:    movl %eax, %ecx
; FALLBACK12-NEXT:    shrq %cl, %r9
; FALLBACK12-NEXT:    movq %r9, 56(%rdx)
; FALLBACK12-NEXT:    movq %rbx, 8(%rdx)
; FALLBACK12-NEXT:    movq %r12, 48(%rdx)
; FALLBACK12-NEXT:    movq %r14, 32(%rdx)
; FALLBACK12-NEXT:    movq %r15, 40(%rdx)
; FALLBACK12-NEXT:    movq %r10, 16(%rdx)
; FALLBACK12-NEXT:    movq %r11, 24(%rdx)
; FALLBACK12-NEXT:    movq %rdi, (%rdx)
; FALLBACK12-NEXT:    addq $8, %rsp
; FALLBACK12-NEXT:    popq %rbx
; FALLBACK12-NEXT:    popq %r12
; FALLBACK12-NEXT:    popq %r13
; FALLBACK12-NEXT:    popq %r14
; FALLBACK12-NEXT:    popq %r15
; FALLBACK12-NEXT:    popq %rbp
; FALLBACK12-NEXT:    vzeroupper
; FALLBACK12-NEXT:    retq
;
; FALLBACK13-LABEL: lshr_64bytes:
; FALLBACK13:       # %bb.0:
; FALLBACK13-NEXT:    pushq %r15
; FALLBACK13-NEXT:    pushq %r14
; FALLBACK13-NEXT:    pushq %rbx
; FALLBACK13-NEXT:    vmovups (%rdi), %zmm0
; FALLBACK13-NEXT:    movl (%rsi), %edi
; FALLBACK13-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; FALLBACK13-NEXT:    vmovups %zmm1, -{{[0-9]+}}(%rsp)
; FALLBACK13-NEXT:    vmovups %zmm0, -{{[0-9]+}}(%rsp)
; FALLBACK13-NEXT:    leal (,%rdi,8), %ecx
; FALLBACK13-NEXT:    andl $56, %ecx
; FALLBACK13-NEXT:    andl $56, %edi
; FALLBACK13-NEXT:    movq -96(%rsp,%rdi), %rsi
; FALLBACK13-NEXT:    movq -104(%rsp,%rdi), %r9
; FALLBACK13-NEXT:    movq %r9, %rax
; FALLBACK13-NEXT:    shrdq %cl, %rsi, %rax
; FALLBACK13-NEXT:    movq -112(%rsp,%rdi), %r10
; FALLBACK13-NEXT:    movq %r10, %r8
; FALLBACK13-NEXT:    shrdq %cl, %r9, %r8
; FALLBACK13-NEXT:    movq -80(%rsp,%rdi), %r9
; FALLBACK13-NEXT:    movq -88(%rsp,%rdi), %r11
; FALLBACK13-NEXT:    movq %r11, %rbx
; FALLBACK13-NEXT:    shrdq %cl, %r9, %rbx
; FALLBACK13-NEXT:    shrdq %cl, %r11, %rsi
; FALLBACK13-NEXT:    movq -72(%rsp,%rdi), %r11
; FALLBACK13-NEXT:    shrdq %cl, %r11, %r9
; FALLBACK13-NEXT:    movq -128(%rsp,%rdi), %r14
; FALLBACK13-NEXT:    movq -120(%rsp,%rdi), %rdi
; FALLBACK13-NEXT:    movq %rdi, %r15
; FALLBACK13-NEXT:    shrdq %cl, %r10, %r15
; FALLBACK13-NEXT:    shrdq %cl, %rdi, %r14
; FALLBACK13-NEXT:    # kill: def $cl killed $cl killed $ecx
; FALLBACK13-NEXT:    shrq %cl, %r11
; FALLBACK13-NEXT:    movq %r15, 8(%rdx)
; FALLBACK13-NEXT:    movq %r9, 48(%rdx)
; FALLBACK13-NEXT:    movq %r11, 56(%rdx)
; FALLBACK13-NEXT:    movq %rsi, 32(%rdx)
; FALLBACK13-NEXT:    movq %rbx, 40(%rdx)
; FALLBACK13-NEXT:    movq %r8, 16(%rdx)
; FALLBACK13-NEXT:    movq %rax, 24(%rdx)
; FALLBACK13-NEXT:    movq %r14, (%rdx)
; FALLBACK13-NEXT:    popq %rbx
; FALLBACK13-NEXT:    popq %r14
; FALLBACK13-NEXT:    popq %r15
; FALLBACK13-NEXT:    vzeroupper
; FALLBACK13-NEXT:    retq
;
; FALLBACK14-LABEL: lshr_64bytes:
; FALLBACK14:       # %bb.0:
; FALLBACK14-NEXT:    pushq %rbp
; FALLBACK14-NEXT:    pushq %r15
; FALLBACK14-NEXT:    pushq %r14
; FALLBACK14-NEXT:    pushq %r13
; FALLBACK14-NEXT:    pushq %r12
; FALLBACK14-NEXT:    pushq %rbx
; FALLBACK14-NEXT:    pushq %rax
; FALLBACK14-NEXT:    vmovups (%rdi), %zmm0
; FALLBACK14-NEXT:    movl (%rsi), %esi
; FALLBACK14-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; FALLBACK14-NEXT:    vmovups %zmm1, -{{[0-9]+}}(%rsp)
; FALLBACK14-NEXT:    vmovups %zmm0, -{{[0-9]+}}(%rsp)
; FALLBACK14-NEXT:    leal (,%rsi,8), %ecx
; FALLBACK14-NEXT:    andl $56, %ecx
; FALLBACK14-NEXT:    andl $56, %esi
; FALLBACK14-NEXT:    shrxq %rcx, -128(%rsp,%rsi), %r11
; FALLBACK14-NEXT:    movq -112(%rsp,%rsi), %rax
; FALLBACK14-NEXT:    movq -104(%rsp,%rsi), %rdi
; FALLBACK14-NEXT:    shrxq %rcx, %rdi, %r12
; FALLBACK14-NEXT:    movq -96(%rsp,%rsi), %r13
; FALLBACK14-NEXT:    shrxq %rcx, %rax, %r9
; FALLBACK14-NEXT:    movq -88(%rsp,%rsi), %r10
; FALLBACK14-NEXT:    shrxq %rcx, %r10, %r14
; FALLBACK14-NEXT:    shrxq %rcx, %r13, %r15
; FALLBACK14-NEXT:    movl %ecx, %ebx
; FALLBACK14-NEXT:    notb %bl
; FALLBACK14-NEXT:    movq -120(%rsp,%rsi), %rbp
; FALLBACK14-NEXT:    leaq (%rbp,%rbp), %r8
; FALLBACK14-NEXT:    shlxq %rbx, %r8, %r8
; FALLBACK14-NEXT:    orq %r11, %r8
; FALLBACK14-NEXT:    leaq (%r13,%r13), %r11
; FALLBACK14-NEXT:    shlxq %rbx, %r11, %r11
; FALLBACK14-NEXT:    orq %r12, %r11
; FALLBACK14-NEXT:    movq -80(%rsp,%rsi), %r12
; FALLBACK14-NEXT:    shrxq %rcx, %r12, %r13
; FALLBACK14-NEXT:    shrxq %rcx, %rbp, %rbp
; FALLBACK14-NEXT:    movq -72(%rsp,%rsi), %rsi
; FALLBACK14-NEXT:    shrxq %rcx, %rsi, %rcx
; FALLBACK14-NEXT:    addq %rdi, %rdi
; FALLBACK14-NEXT:    shlxq %rbx, %rdi, %rdi
; FALLBACK14-NEXT:    orq %r9, %rdi
; FALLBACK14-NEXT:    leaq (%r12,%r12), %r9
; FALLBACK14-NEXT:    shlxq %rbx, %r9, %r9
; FALLBACK14-NEXT:    orq %r14, %r9
; FALLBACK14-NEXT:    addq %r10, %r10
; FALLBACK14-NEXT:    shlxq %rbx, %r10, %r10
; FALLBACK14-NEXT:    orq %r15, %r10
; FALLBACK14-NEXT:    addq %rsi, %rsi
; FALLBACK14-NEXT:    shlxq %rbx, %rsi, %rsi
; FALLBACK14-NEXT:    orq %r13, %rsi
; FALLBACK14-NEXT:    addq %rax, %rax
; FALLBACK14-NEXT:    shlxq %rbx, %rax, %rax
; FALLBACK14-NEXT:    orq %rbp, %rax
; FALLBACK14-NEXT:    movq %rcx, 56(%rdx)
; FALLBACK14-NEXT:    movq %rax, 8(%rdx)
; FALLBACK14-NEXT:    movq %rsi, 48(%rdx)
; FALLBACK14-NEXT:    movq %r10, 32(%rdx)
; FALLBACK14-NEXT:    movq %r9, 40(%rdx)
; FALLBACK14-NEXT:    movq %rdi, 16(%rdx)
; FALLBACK14-NEXT:    movq %r11, 24(%rdx)
; FALLBACK14-NEXT:    movq %r8, (%rdx)
; FALLBACK14-NEXT:    addq $8, %rsp
; FALLBACK14-NEXT:    popq %rbx
; FALLBACK14-NEXT:    popq %r12
; FALLBACK14-NEXT:    popq %r13
; FALLBACK14-NEXT:    popq %r14
; FALLBACK14-NEXT:    popq %r15
; FALLBACK14-NEXT:    popq %rbp
; FALLBACK14-NEXT:    vzeroupper
; FALLBACK14-NEXT:    retq
;
; FALLBACK15-LABEL: lshr_64bytes:
; FALLBACK15:       # %bb.0:
; FALLBACK15-NEXT:    pushq %r15
; FALLBACK15-NEXT:    pushq %r14
; FALLBACK15-NEXT:    pushq %rbx
; FALLBACK15-NEXT:    vmovups (%rdi), %zmm0
; FALLBACK15-NEXT:    movl (%rsi), %eax
; FALLBACK15-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; FALLBACK15-NEXT:    vmovups %zmm1, -{{[0-9]+}}(%rsp)
; FALLBACK15-NEXT:    vmovups %zmm0, -{{[0-9]+}}(%rsp)
; FALLBACK15-NEXT:    leal (,%rax,8), %ecx
; FALLBACK15-NEXT:    andl $56, %ecx
; FALLBACK15-NEXT:    andl $56, %eax
; FALLBACK15-NEXT:    movq -96(%rsp,%rax), %rdi
; FALLBACK15-NEXT:    movq -104(%rsp,%rax), %r9
; FALLBACK15-NEXT:    movq %r9, %rsi
; FALLBACK15-NEXT:    shrdq %cl, %rdi, %rsi
; FALLBACK15-NEXT:    movq -112(%rsp,%rax), %r10
; FALLBACK15-NEXT:    movq %r10, %r8
; FALLBACK15-NEXT:    shrdq %cl, %r9, %r8
; FALLBACK15-NEXT:    movq -80(%rsp,%rax), %r9
; FALLBACK15-NEXT:    movq -88(%rsp,%rax), %r11
; FALLBACK15-NEXT:    movq %r11, %rbx
; FALLBACK15-NEXT:    shrdq %cl, %r9, %rbx
; FALLBACK15-NEXT:    shrdq %cl, %r11, %rdi
; FALLBACK15-NEXT:    movq -72(%rsp,%rax), %r11
; FALLBACK15-NEXT:    shrdq %cl, %r11, %r9
; FALLBACK15-NEXT:    movq -128(%rsp,%rax), %r14
; FALLBACK15-NEXT:    movq -120(%rsp,%rax), %rax
; FALLBACK15-NEXT:    movq %rax, %r15
; FALLBACK15-NEXT:    shrdq %cl, %r10, %r15
; FALLBACK15-NEXT:    shrxq %rcx, %r11, %r10
; FALLBACK15-NEXT:    # kill: def $cl killed $cl killed $rcx
; FALLBACK15-NEXT:    shrdq %cl, %rax, %r14
; FALLBACK15-NEXT:    movq %r15, 8(%rdx)
; FALLBACK15-NEXT:    movq %r9, 48(%rdx)
; FALLBACK15-NEXT:    movq %rdi, 32(%rdx)
; FALLBACK15-NEXT:    movq %rbx, 40(%rdx)
; FALLBACK15-NEXT:    movq %r8, 16(%rdx)
; FALLBACK15-NEXT:    movq %rsi, 24(%rdx)
; FALLBACK15-NEXT:    movq %r14, (%rdx)
; FALLBACK15-NEXT:    movq %r10, 56(%rdx)
; FALLBACK15-NEXT:    popq %rbx
; FALLBACK15-NEXT:    popq %r14
; FALLBACK15-NEXT:    popq %r15
; FALLBACK15-NEXT:    vzeroupper
; FALLBACK15-NEXT:    retq
;
; FALLBACK16-LABEL: lshr_64bytes:
; FALLBACK16:       # %bb.0:
; FALLBACK16-NEXT:    pushl %ebp
; FALLBACK16-NEXT:    pushl %ebx
; FALLBACK16-NEXT:    pushl %edi
; FALLBACK16-NEXT:    pushl %esi
; FALLBACK16-NEXT:    subl $204, %esp
; FALLBACK16-NEXT:    movl {{[0-9]+}}(%esp), %eax
; FALLBACK16-NEXT:    movl (%eax), %ecx
; FALLBACK16-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK16-NEXT:    movl 4(%eax), %ecx
; FALLBACK16-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK16-NEXT:    movl 8(%eax), %ecx
; FALLBACK16-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK16-NEXT:    movl 12(%eax), %ecx
; FALLBACK16-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK16-NEXT:    movl 16(%eax), %ecx
; FALLBACK16-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK16-NEXT:    movl 20(%eax), %ecx
; FALLBACK16-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK16-NEXT:    movl 24(%eax), %ecx
; FALLBACK16-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK16-NEXT:    movl 28(%eax), %ecx
; FALLBACK16-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK16-NEXT:    movl 32(%eax), %ecx
; FALLBACK16-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK16-NEXT:    movl 36(%eax), %ecx
; FALLBACK16-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK16-NEXT:    movl 40(%eax), %ebp
; FALLBACK16-NEXT:    movl 44(%eax), %ebx
; FALLBACK16-NEXT:    movl 48(%eax), %edi
; FALLBACK16-NEXT:    movl 52(%eax), %esi
; FALLBACK16-NEXT:    movl 56(%eax), %edx
; FALLBACK16-NEXT:    movl 60(%eax), %ecx
; FALLBACK16-NEXT:    movl {{[0-9]+}}(%esp), %eax
; FALLBACK16-NEXT:    movl (%eax), %eax
; FALLBACK16-NEXT:    xorps %xmm0, %xmm0
; FALLBACK16-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
; FALLBACK16-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
; FALLBACK16-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK16-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK16-NEXT:    movl %esi, {{[0-9]+}}(%esp)
; FALLBACK16-NEXT:    movl %edi, {{[0-9]+}}(%esp)
; FALLBACK16-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
; FALLBACK16-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK16-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK16-NEXT:    movl %eax, %esi
; FALLBACK16-NEXT:    andl $60, %esi
; FALLBACK16-NEXT:    movl 68(%esp,%esi), %edx
; FALLBACK16-NEXT:    shll $3, %eax
; FALLBACK16-NEXT:    andl $24, %eax
; FALLBACK16-NEXT:    movl %edx, %edi
; FALLBACK16-NEXT:    movl %eax, %ecx
; FALLBACK16-NEXT:    shrl %cl, %edi
; FALLBACK16-NEXT:    movl 72(%esp,%esi), %ecx
; FALLBACK16-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK16-NEXT:    leal (%ecx,%ecx), %ebx
; FALLBACK16-NEXT:    movb %al, %ch
; FALLBACK16-NEXT:    notb %ch
; FALLBACK16-NEXT:    movb %ch, %cl
; FALLBACK16-NEXT:    shll %cl, %ebx
; FALLBACK16-NEXT:    orl %edi, %ebx
; FALLBACK16-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK16-NEXT:    movl 64(%esp,%esi), %edi
; FALLBACK16-NEXT:    movb %al, %cl
; FALLBACK16-NEXT:    shrl %cl, %edi
; FALLBACK16-NEXT:    addl %edx, %edx
; FALLBACK16-NEXT:    movb %ch, %cl
; FALLBACK16-NEXT:    shll %cl, %edx
; FALLBACK16-NEXT:    orl %edi, %edx
; FALLBACK16-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK16-NEXT:    movl 76(%esp,%esi), %edx
; FALLBACK16-NEXT:    movl %edx, %ebp
; FALLBACK16-NEXT:    movb %al, %cl
; FALLBACK16-NEXT:    shrl %cl, %ebp
; FALLBACK16-NEXT:    movl 80(%esp,%esi), %edi
; FALLBACK16-NEXT:    leal (%edi,%edi), %ebx
; FALLBACK16-NEXT:    movb %ch, %cl
; FALLBACK16-NEXT:    shll %cl, %ebx
; FALLBACK16-NEXT:    orl %ebp, %ebx
; FALLBACK16-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK16-NEXT:    movb %al, %cl
; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
; FALLBACK16-NEXT:    shrl %cl, %ebx
; FALLBACK16-NEXT:    addl %edx, %edx
; FALLBACK16-NEXT:    movb %ch, %cl
; FALLBACK16-NEXT:    shll %cl, %edx
; FALLBACK16-NEXT:    orl %ebx, %edx
; FALLBACK16-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK16-NEXT:    movl 84(%esp,%esi), %ebx
; FALLBACK16-NEXT:    movl %ebx, %ebp
; FALLBACK16-NEXT:    movl %eax, %edx
; FALLBACK16-NEXT:    movb %dl, %cl
; FALLBACK16-NEXT:    shrl %cl, %ebp
; FALLBACK16-NEXT:    movl 88(%esp,%esi), %eax
; FALLBACK16-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK16-NEXT:    addl %eax, %eax
; FALLBACK16-NEXT:    movb %ch, %cl
; FALLBACK16-NEXT:    shll %cl, %eax
; FALLBACK16-NEXT:    orl %ebp, %eax
; FALLBACK16-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK16-NEXT:    movb %dl, %cl
; FALLBACK16-NEXT:    shrl %cl, %edi
; FALLBACK16-NEXT:    addl %ebx, %ebx
; FALLBACK16-NEXT:    movb %ch, %cl
; FALLBACK16-NEXT:    movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
; FALLBACK16-NEXT:    shll %cl, %ebx
; FALLBACK16-NEXT:    orl %edi, %ebx
; FALLBACK16-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK16-NEXT:    movl 92(%esp,%esi), %ebx
; FALLBACK16-NEXT:    movl %ebx, %ebp
; FALLBACK16-NEXT:    movb %dl, %cl
; FALLBACK16-NEXT:    shrl %cl, %ebp
; FALLBACK16-NEXT:    movl 96(%esp,%esi), %edi
; FALLBACK16-NEXT:    leal (%edi,%edi), %eax
; FALLBACK16-NEXT:    movb %ch, %cl
; FALLBACK16-NEXT:    shll %cl, %eax
; FALLBACK16-NEXT:    orl %ebp, %eax
; FALLBACK16-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK16-NEXT:    movb %dl, %cl
; FALLBACK16-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK16-NEXT:    shrl %cl, %eax
; FALLBACK16-NEXT:    addl %ebx, %ebx
; FALLBACK16-NEXT:    movb %ch, %cl
; FALLBACK16-NEXT:    shll %cl, %ebx
; FALLBACK16-NEXT:    orl %eax, %ebx
; FALLBACK16-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK16-NEXT:    movl 100(%esp,%esi), %ebx
; FALLBACK16-NEXT:    movl %ebx, %ebp
; FALLBACK16-NEXT:    movb %dl, %cl
; FALLBACK16-NEXT:    shrl %cl, %ebp
; FALLBACK16-NEXT:    movl 104(%esp,%esi), %edx
; FALLBACK16-NEXT:    leal (%edx,%edx), %eax
; FALLBACK16-NEXT:    movb %ch, %cl
; FALLBACK16-NEXT:    shll %cl, %eax
; FALLBACK16-NEXT:    orl %ebp, %eax
; FALLBACK16-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK16-NEXT:    movb %al, %cl
; FALLBACK16-NEXT:    shrl %cl, %edi
; FALLBACK16-NEXT:    addl %ebx, %ebx
; FALLBACK16-NEXT:    movb %ch, %cl
; FALLBACK16-NEXT:    shll %cl, %ebx
; FALLBACK16-NEXT:    orl %edi, %ebx
; FALLBACK16-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK16-NEXT:    movl 108(%esp,%esi), %edi
; FALLBACK16-NEXT:    movl %edi, %ebp
; FALLBACK16-NEXT:    movl %eax, %ecx
; FALLBACK16-NEXT:    shrl %cl, %ebp
; FALLBACK16-NEXT:    movl 112(%esp,%esi), %ecx
; FALLBACK16-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK16-NEXT:    leal (%ecx,%ecx), %ebx
; FALLBACK16-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
; FALLBACK16-NEXT:    movb %ch, %cl
; FALLBACK16-NEXT:    shll %cl, %ebx
; FALLBACK16-NEXT:    orl %ebp, %ebx
; FALLBACK16-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK16-NEXT:    movb %al, %cl
; FALLBACK16-NEXT:    shrl %cl, %edx
; FALLBACK16-NEXT:    addl %edi, %edi
; FALLBACK16-NEXT:    movb %ch, %cl
; FALLBACK16-NEXT:    shll %cl, %edi
; FALLBACK16-NEXT:    orl %edx, %edi
; FALLBACK16-NEXT:    movl %esi, %edx
; FALLBACK16-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK16-NEXT:    movl 116(%esp,%esi), %esi
; FALLBACK16-NEXT:    movl %esi, %ebx
; FALLBACK16-NEXT:    movb %al, %cl
; FALLBACK16-NEXT:    shrl %cl, %ebx
; FALLBACK16-NEXT:    movl 120(%esp,%edx), %eax
; FALLBACK16-NEXT:    leal (%eax,%eax), %ebp
; FALLBACK16-NEXT:    movb %ch, %cl
; FALLBACK16-NEXT:    shll %cl, %ebp
; FALLBACK16-NEXT:    orl %ebx, %ebp
; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; FALLBACK16-NEXT:    movb %dl, %cl
; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
; FALLBACK16-NEXT:    shrl %cl, %ebx
; FALLBACK16-NEXT:    addl %esi, %esi
; FALLBACK16-NEXT:    movb %ch, %cl
; FALLBACK16-NEXT:    shll %cl, %esi
; FALLBACK16-NEXT:    orl %ebx, %esi
; FALLBACK16-NEXT:    movb %dl, %cl
; FALLBACK16-NEXT:    shrl %cl, %eax
; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; FALLBACK16-NEXT:    movl 124(%esp,%edx), %ebx
; FALLBACK16-NEXT:    leal (%ebx,%ebx), %edx
; FALLBACK16-NEXT:    movb %ch, %cl
; FALLBACK16-NEXT:    shll %cl, %edx
; FALLBACK16-NEXT:    orl %eax, %edx
; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK16-NEXT:    # kill: def $cl killed $cl killed $ecx
; FALLBACK16-NEXT:    shrl %cl, %ebx
; FALLBACK16-NEXT:    movl {{[0-9]+}}(%esp), %eax
; FALLBACK16-NEXT:    movl %ebx, 60(%eax)
; FALLBACK16-NEXT:    movl %edx, 56(%eax)
; FALLBACK16-NEXT:    movl %esi, 48(%eax)
; FALLBACK16-NEXT:    movl %ebp, 52(%eax)
; FALLBACK16-NEXT:    movl %edi, 40(%eax)
; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK16-NEXT:    movl %ecx, 44(%eax)
; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK16-NEXT:    movl %ecx, 32(%eax)
; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK16-NEXT:    movl %ecx, 36(%eax)
; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK16-NEXT:    movl %ecx, 24(%eax)
; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK16-NEXT:    movl %ecx, 28(%eax)
; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK16-NEXT:    movl %ecx, 16(%eax)
; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK16-NEXT:    movl %ecx, 20(%eax)
; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK16-NEXT:    movl %ecx, 8(%eax)
; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK16-NEXT:    movl %ecx, 12(%eax)
; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK16-NEXT:    movl %ecx, (%eax)
; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK16-NEXT:    movl %ecx, 4(%eax)
; FALLBACK16-NEXT:    addl $204, %esp
; FALLBACK16-NEXT:    popl %esi
; FALLBACK16-NEXT:    popl %edi
; FALLBACK16-NEXT:    popl %ebx
; FALLBACK16-NEXT:    popl %ebp
; FALLBACK16-NEXT:    retl
;
; FALLBACK17-LABEL: lshr_64bytes:
; FALLBACK17:       # %bb.0:
; FALLBACK17-NEXT:    pushl %ebp
; FALLBACK17-NEXT:    pushl %ebx
; FALLBACK17-NEXT:    pushl %edi
; FALLBACK17-NEXT:    pushl %esi
; FALLBACK17-NEXT:    subl $188, %esp
; FALLBACK17-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; FALLBACK17-NEXT:    movl (%ecx), %eax
; FALLBACK17-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK17-NEXT:    movl 4(%ecx), %eax
; FALLBACK17-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK17-NEXT:    movl 8(%ecx), %eax
; FALLBACK17-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK17-NEXT:    movl 12(%ecx), %eax
; FALLBACK17-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK17-NEXT:    movl 16(%ecx), %eax
; FALLBACK17-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK17-NEXT:    movl 20(%ecx), %eax
; FALLBACK17-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK17-NEXT:    movl 24(%ecx), %eax
; FALLBACK17-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK17-NEXT:    movl 28(%ecx), %eax
; FALLBACK17-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK17-NEXT:    movl 32(%ecx), %eax
; FALLBACK17-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK17-NEXT:    movl 36(%ecx), %eax
; FALLBACK17-NEXT:    movl %eax, (%esp) # 4-byte Spill
; FALLBACK17-NEXT:    movl 40(%ecx), %ebp
; FALLBACK17-NEXT:    movl 44(%ecx), %ebx
; FALLBACK17-NEXT:    movl 48(%ecx), %edi
; FALLBACK17-NEXT:    movl 52(%ecx), %esi
; FALLBACK17-NEXT:    movl 56(%ecx), %edx
; FALLBACK17-NEXT:    movl 60(%ecx), %eax
; FALLBACK17-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; FALLBACK17-NEXT:    movl (%ecx), %ecx
; FALLBACK17-NEXT:    xorps %xmm0, %xmm0
; FALLBACK17-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
; FALLBACK17-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
; FALLBACK17-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
; FALLBACK17-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK17-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK17-NEXT:    movl %esi, {{[0-9]+}}(%esp)
; FALLBACK17-NEXT:    movl %edi, {{[0-9]+}}(%esp)
; FALLBACK17-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
; FALLBACK17-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
; FALLBACK17-NEXT:    movl (%esp), %eax # 4-byte Reload
; FALLBACK17-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK17-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK17-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK17-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK17-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK17-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK17-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK17-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK17-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK17-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK17-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK17-NEXT:    movl %ecx, %ebp
; FALLBACK17-NEXT:    andl $60, %ebp
; FALLBACK17-NEXT:    movl 56(%esp,%ebp), %edx
; FALLBACK17-NEXT:    movl 52(%esp,%ebp), %eax
; FALLBACK17-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK17-NEXT:    shll $3, %ecx
; FALLBACK17-NEXT:    andl $24, %ecx
; FALLBACK17-NEXT:    shrdl %cl, %edx, %eax
; FALLBACK17-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK17-NEXT:    movl 64(%esp,%ebp), %edi
; FALLBACK17-NEXT:    movl 60(%esp,%ebp), %eax
; FALLBACK17-NEXT:    movl %eax, %esi
; FALLBACK17-NEXT:    shrdl %cl, %edi, %esi
; FALLBACK17-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK17-NEXT:    shrdl %cl, %eax, %edx
; FALLBACK17-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK17-NEXT:    movl 72(%esp,%ebp), %esi
; FALLBACK17-NEXT:    movl 68(%esp,%ebp), %eax
; FALLBACK17-NEXT:    movl %eax, %edx
; FALLBACK17-NEXT:    shrdl %cl, %esi, %edx
; FALLBACK17-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK17-NEXT:    shrdl %cl, %eax, %edi
; FALLBACK17-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK17-NEXT:    movl 80(%esp,%ebp), %edi
; FALLBACK17-NEXT:    movl 76(%esp,%ebp), %eax
; FALLBACK17-NEXT:    movl %eax, %edx
; FALLBACK17-NEXT:    shrdl %cl, %edi, %edx
; FALLBACK17-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK17-NEXT:    shrdl %cl, %eax, %esi
; FALLBACK17-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK17-NEXT:    movl 88(%esp,%ebp), %esi
; FALLBACK17-NEXT:    movl 84(%esp,%ebp), %eax
; FALLBACK17-NEXT:    movl %eax, %edx
; FALLBACK17-NEXT:    shrdl %cl, %esi, %edx
; FALLBACK17-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK17-NEXT:    movl %esi, %edx
; FALLBACK17-NEXT:    shrdl %cl, %eax, %edi
; FALLBACK17-NEXT:    movl %edi, (%esp) # 4-byte Spill
; FALLBACK17-NEXT:    movl 96(%esp,%ebp), %esi
; FALLBACK17-NEXT:    movl 92(%esp,%ebp), %eax
; FALLBACK17-NEXT:    movl %eax, %edi
; FALLBACK17-NEXT:    shrdl %cl, %esi, %edi
; FALLBACK17-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK17-NEXT:    shrdl %cl, %eax, %edx
; FALLBACK17-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK17-NEXT:    movl 104(%esp,%ebp), %edx
; FALLBACK17-NEXT:    movl 100(%esp,%ebp), %eax
; FALLBACK17-NEXT:    movl %eax, %edi
; FALLBACK17-NEXT:    shrdl %cl, %edx, %edi
; FALLBACK17-NEXT:    shrdl %cl, %eax, %esi
; FALLBACK17-NEXT:    movl 48(%esp,%ebp), %ebx
; FALLBACK17-NEXT:    movl 108(%esp,%ebp), %eax
; FALLBACK17-NEXT:    shrdl %cl, %eax, %edx
; FALLBACK17-NEXT:    movl {{[0-9]+}}(%esp), %ebp
; FALLBACK17-NEXT:    movl %edx, 56(%ebp)
; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; FALLBACK17-NEXT:    shrdl %cl, %edx, %ebx
; FALLBACK17-NEXT:    # kill: def $cl killed $cl killed $ecx
; FALLBACK17-NEXT:    shrl %cl, %eax
; FALLBACK17-NEXT:    movl %eax, 60(%ebp)
; FALLBACK17-NEXT:    movl %esi, 48(%ebp)
; FALLBACK17-NEXT:    movl %edi, 52(%ebp)
; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK17-NEXT:    movl %eax, 40(%ebp)
; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK17-NEXT:    movl %eax, 44(%ebp)
; FALLBACK17-NEXT:    movl (%esp), %eax # 4-byte Reload
; FALLBACK17-NEXT:    movl %eax, 32(%ebp)
; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK17-NEXT:    movl %eax, 36(%ebp)
; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK17-NEXT:    movl %eax, 24(%ebp)
; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK17-NEXT:    movl %eax, 28(%ebp)
; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK17-NEXT:    movl %eax, 16(%ebp)
; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK17-NEXT:    movl %eax, 20(%ebp)
; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK17-NEXT:    movl %eax, 8(%ebp)
; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK17-NEXT:    movl %eax, 12(%ebp)
; FALLBACK17-NEXT:    movl %ebx, (%ebp)
; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK17-NEXT:    movl %eax, 4(%ebp)
; FALLBACK17-NEXT:    addl $188, %esp
; FALLBACK17-NEXT:    popl %esi
; FALLBACK17-NEXT:    popl %edi
; FALLBACK17-NEXT:    popl %ebx
; FALLBACK17-NEXT:    popl %ebp
; FALLBACK17-NEXT:    retl
;
; Expected 32-bit x86 assembly for lshr_64bytes under the FALLBACK18 check prefix.
; Review note - the RUN line for this prefix is outside the visible part of the
; file; the use of shrxl/shlxl and the absence of shrdl suggest BMI2 with slow
; SHLD - TODO confirm against the RUN lines.
; Shape of the checked code - copy 16 source dwords plus 64 zero bytes to the
; stack, index that buffer by the byte offset, and assemble each output dword
; from two adjacent loaded dwords.
; FALLBACK18-LABEL: lshr_64bytes:
; FALLBACK18:       # %bb.0:
; FALLBACK18-NEXT:    pushl %ebp
; FALLBACK18-NEXT:    pushl %ebx
; FALLBACK18-NEXT:    pushl %edi
; FALLBACK18-NEXT:    pushl %esi
; FALLBACK18-NEXT:    subl $204, %esp
; Load the 16 source dwords (offsets 0..60) through a pointer taken from the
; stack; the first ten are spilled, the last six stay in registers.
; FALLBACK18-NEXT:    movl {{[0-9]+}}(%esp), %eax
; FALLBACK18-NEXT:    movl (%eax), %ecx
; FALLBACK18-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK18-NEXT:    movl 4(%eax), %ecx
; FALLBACK18-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK18-NEXT:    movl 8(%eax), %ecx
; FALLBACK18-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK18-NEXT:    movl 12(%eax), %ecx
; FALLBACK18-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK18-NEXT:    movl 16(%eax), %ecx
; FALLBACK18-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK18-NEXT:    movl 20(%eax), %ecx
; FALLBACK18-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK18-NEXT:    movl 24(%eax), %ecx
; FALLBACK18-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK18-NEXT:    movl 28(%eax), %ecx
; FALLBACK18-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK18-NEXT:    movl 32(%eax), %ecx
; FALLBACK18-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK18-NEXT:    movl 36(%eax), %ecx
; FALLBACK18-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK18-NEXT:    movl 40(%eax), %ebp
; FALLBACK18-NEXT:    movl 44(%eax), %ebx
; FALLBACK18-NEXT:    movl 48(%eax), %edi
; FALLBACK18-NEXT:    movl 52(%eax), %esi
; FALLBACK18-NEXT:    movl 56(%eax), %edx
; FALLBACK18-NEXT:    movl 60(%eax), %ecx
; Load the shift-amount dword through a second pointer taken from the stack.
; FALLBACK18-NEXT:    movl {{[0-9]+}}(%esp), %eax
; FALLBACK18-NEXT:    movl (%eax), %eax
; Zero four 16-byte stack slots (64 bytes) and store the 16 source dwords to
; stack slots. Exact slot offsets are wildcarded; presumably the zeros sit
; directly above the value so that indexed loads past its end read zero.
; FALLBACK18-NEXT:    xorps %xmm0, %xmm0
; FALLBACK18-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
; FALLBACK18-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
; FALLBACK18-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
; FALLBACK18-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK18-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK18-NEXT:    movl %esi, {{[0-9]+}}(%esp)
; FALLBACK18-NEXT:    movl %edi, {{[0-9]+}}(%esp)
; FALLBACK18-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
; FALLBACK18-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK18-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK18-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK18-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK18-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK18-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK18-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK18-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK18-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK18-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK18-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK18-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; ecx = amount & 60 (dword-aligned byte offset into the stack buffer);
; edx = (amount * 8) & 24 (bit shift within a dword - 0, 8, 16 or 24).
; FALLBACK18-NEXT:    movl %eax, %ecx
; FALLBACK18-NEXT:    leal (,%eax,8), %edx
; FALLBACK18-NEXT:    andl $24, %edx
; FALLBACK18-NEXT:    andl $60, %ecx
; Each output dword below is (lo >> edx) | ((2 * hi) << ~edx). %ebx holds the
; complemented shift (notb); doubling hi first and shifting by the complement
; avoids a shift by 32 when edx is 0.
; FALLBACK18-NEXT:    movl 68(%esp,%ecx), %esi
; FALLBACK18-NEXT:    movl 72(%esp,%ecx), %eax
; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK18-NEXT:    shrxl %edx, %esi, %edi
; FALLBACK18-NEXT:    movl %edx, %ebx
; FALLBACK18-NEXT:    notb %bl
; FALLBACK18-NEXT:    leal (%eax,%eax), %ebp
; FALLBACK18-NEXT:    shlxl %ebx, %ebp, %eax
; FALLBACK18-NEXT:    orl %edi, %eax
; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK18-NEXT:    shrxl %edx, 64(%esp,%ecx), %edi
; FALLBACK18-NEXT:    addl %esi, %esi
; FALLBACK18-NEXT:    shlxl %ebx, %esi, %eax
; FALLBACK18-NEXT:    orl %edi, %eax
; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK18-NEXT:    movl 80(%esp,%ecx), %esi
; FALLBACK18-NEXT:    leal (%esi,%esi), %edi
; FALLBACK18-NEXT:    shlxl %ebx, %edi, %eax
; FALLBACK18-NEXT:    movl 76(%esp,%ecx), %edi
; FALLBACK18-NEXT:    shrxl %edx, %edi, %ebp
; FALLBACK18-NEXT:    orl %ebp, %eax
; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK18-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; FALLBACK18-NEXT:    addl %edi, %edi
; FALLBACK18-NEXT:    shlxl %ebx, %edi, %edi
; FALLBACK18-NEXT:    orl %eax, %edi
; FALLBACK18-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK18-NEXT:    movl 88(%esp,%ecx), %eax
; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK18-NEXT:    leal (%eax,%eax), %edi
; FALLBACK18-NEXT:    shlxl %ebx, %edi, %eax
; FALLBACK18-NEXT:    movl 84(%esp,%ecx), %edi
; FALLBACK18-NEXT:    shrxl %edx, %edi, %ebp
; FALLBACK18-NEXT:    orl %ebp, %eax
; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK18-NEXT:    shrxl %edx, %esi, %esi
; FALLBACK18-NEXT:    addl %edi, %edi
; FALLBACK18-NEXT:    shlxl %ebx, %edi, %eax
; FALLBACK18-NEXT:    orl %esi, %eax
; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK18-NEXT:    movl 96(%esp,%ecx), %esi
; FALLBACK18-NEXT:    leal (%esi,%esi), %edi
; FALLBACK18-NEXT:    shlxl %ebx, %edi, %eax
; FALLBACK18-NEXT:    movl 92(%esp,%ecx), %edi
; FALLBACK18-NEXT:    shrxl %edx, %edi, %ebp
; FALLBACK18-NEXT:    orl %ebp, %eax
; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK18-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; FALLBACK18-NEXT:    addl %edi, %edi
; FALLBACK18-NEXT:    shlxl %ebx, %edi, %edi
; FALLBACK18-NEXT:    orl %eax, %edi
; FALLBACK18-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK18-NEXT:    movl 104(%esp,%ecx), %eax
; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK18-NEXT:    leal (%eax,%eax), %edi
; FALLBACK18-NEXT:    shlxl %ebx, %edi, %eax
; FALLBACK18-NEXT:    movl 100(%esp,%ecx), %edi
; FALLBACK18-NEXT:    shrxl %edx, %edi, %ebp
; FALLBACK18-NEXT:    orl %ebp, %eax
; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK18-NEXT:    shrxl %edx, %esi, %esi
; FALLBACK18-NEXT:    addl %edi, %edi
; FALLBACK18-NEXT:    shlxl %ebx, %edi, %eax
; FALLBACK18-NEXT:    orl %esi, %eax
; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK18-NEXT:    movl 112(%esp,%ecx), %eax
; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK18-NEXT:    leal (%eax,%eax), %esi
; FALLBACK18-NEXT:    shlxl %ebx, %esi, %eax
; FALLBACK18-NEXT:    movl 108(%esp,%ecx), %esi
; FALLBACK18-NEXT:    movl %ecx, %edi
; FALLBACK18-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK18-NEXT:    shrxl %edx, %esi, %ebp
; FALLBACK18-NEXT:    orl %ebp, %eax
; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK18-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
; FALLBACK18-NEXT:    addl %esi, %esi
; FALLBACK18-NEXT:    shlxl %ebx, %esi, %esi
; FALLBACK18-NEXT:    orl %ecx, %esi
; FALLBACK18-NEXT:    movl 120(%esp,%edi), %ebp
; FALLBACK18-NEXT:    leal (%ebp,%ebp), %ecx
; FALLBACK18-NEXT:    shlxl %ebx, %ecx, %ecx
; FALLBACK18-NEXT:    movl 116(%esp,%edi), %eax
; FALLBACK18-NEXT:    shrxl %edx, %eax, %edi
; FALLBACK18-NEXT:    orl %edi, %ecx
; FALLBACK18-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
; FALLBACK18-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK18-NEXT:    addl %eax, %eax
; FALLBACK18-NEXT:    shlxl %ebx, %eax, %edi
; FALLBACK18-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
; FALLBACK18-NEXT:    shrxl %edx, %ebp, %eax
; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
; FALLBACK18-NEXT:    movl 124(%esp,%ebp), %ebp
; FALLBACK18-NEXT:    shrxl %edx, %ebp, %edx
; FALLBACK18-NEXT:    addl %ebp, %ebp
; FALLBACK18-NEXT:    shlxl %ebx, %ebp, %ebx
; FALLBACK18-NEXT:    orl %eax, %ebx
; Store the 16 result dwords through the destination pointer. The top dword
; (offset 60) is the plain shrxl of the dword loaded at displacement 124.
; FALLBACK18-NEXT:    movl {{[0-9]+}}(%esp), %eax
; FALLBACK18-NEXT:    movl %edx, 60(%eax)
; FALLBACK18-NEXT:    movl %ebx, 56(%eax)
; FALLBACK18-NEXT:    movl %edi, 48(%eax)
; FALLBACK18-NEXT:    movl %ecx, 52(%eax)
; FALLBACK18-NEXT:    movl %esi, 40(%eax)
; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK18-NEXT:    movl %ecx, 44(%eax)
; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK18-NEXT:    movl %ecx, 32(%eax)
; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK18-NEXT:    movl %ecx, 36(%eax)
; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK18-NEXT:    movl %ecx, 24(%eax)
; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK18-NEXT:    movl %ecx, 28(%eax)
; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK18-NEXT:    movl %ecx, 16(%eax)
; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK18-NEXT:    movl %ecx, 20(%eax)
; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK18-NEXT:    movl %ecx, 8(%eax)
; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK18-NEXT:    movl %ecx, 12(%eax)
; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK18-NEXT:    movl %ecx, (%eax)
; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK18-NEXT:    movl %ecx, 4(%eax)
; FALLBACK18-NEXT:    addl $204, %esp
; FALLBACK18-NEXT:    popl %esi
; FALLBACK18-NEXT:    popl %edi
; FALLBACK18-NEXT:    popl %ebx
; FALLBACK18-NEXT:    popl %ebp
; FALLBACK18-NEXT:    retl
;
; Expected 32-bit x86 assembly for lshr_64bytes under the FALLBACK19 check prefix.
; Review note - the RUN line for this prefix is outside the visible part of the
; file; the use of shrdl together with shrxl suggests BMI2 with fast SHLD -
; TODO confirm against the RUN lines.
; Shape of the checked code - copy 16 source dwords plus 64 zero bytes to the
; stack, index that buffer by the byte offset, and funnel-shift adjacent dword
; pairs with shrdl.
; FALLBACK19-LABEL: lshr_64bytes:
; FALLBACK19:       # %bb.0:
; FALLBACK19-NEXT:    pushl %ebp
; FALLBACK19-NEXT:    pushl %ebx
; FALLBACK19-NEXT:    pushl %edi
; FALLBACK19-NEXT:    pushl %esi
; FALLBACK19-NEXT:    subl $188, %esp
; Load the 16 source dwords (offsets 0..60) through a pointer taken from the
; stack; the first ten are spilled, the last six stay in registers.
; FALLBACK19-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; FALLBACK19-NEXT:    movl (%ecx), %eax
; FALLBACK19-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK19-NEXT:    movl 4(%ecx), %eax
; FALLBACK19-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK19-NEXT:    movl 8(%ecx), %eax
; FALLBACK19-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK19-NEXT:    movl 12(%ecx), %eax
; FALLBACK19-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK19-NEXT:    movl 16(%ecx), %eax
; FALLBACK19-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK19-NEXT:    movl 20(%ecx), %eax
; FALLBACK19-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK19-NEXT:    movl 24(%ecx), %eax
; FALLBACK19-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK19-NEXT:    movl 28(%ecx), %eax
; FALLBACK19-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK19-NEXT:    movl 32(%ecx), %eax
; FALLBACK19-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK19-NEXT:    movl 36(%ecx), %eax
; FALLBACK19-NEXT:    movl %eax, (%esp) # 4-byte Spill
; FALLBACK19-NEXT:    movl 40(%ecx), %ebp
; FALLBACK19-NEXT:    movl 44(%ecx), %ebx
; FALLBACK19-NEXT:    movl 48(%ecx), %edi
; FALLBACK19-NEXT:    movl 52(%ecx), %esi
; FALLBACK19-NEXT:    movl 56(%ecx), %edx
; FALLBACK19-NEXT:    movl 60(%ecx), %eax
; Load the shift-amount dword through a second pointer taken from the stack.
; FALLBACK19-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; FALLBACK19-NEXT:    movl (%ecx), %ecx
; Zero four 16-byte stack slots (64 bytes) and store the 16 source dwords to
; stack slots. Exact slot offsets are wildcarded; presumably the zeros sit
; directly above the value so that indexed loads past its end read zero.
; FALLBACK19-NEXT:    xorps %xmm0, %xmm0
; FALLBACK19-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
; FALLBACK19-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
; FALLBACK19-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
; FALLBACK19-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK19-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK19-NEXT:    movl %esi, {{[0-9]+}}(%esp)
; FALLBACK19-NEXT:    movl %edi, {{[0-9]+}}(%esp)
; FALLBACK19-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
; FALLBACK19-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
; FALLBACK19-NEXT:    movl (%esp), %eax # 4-byte Reload
; FALLBACK19-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK19-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK19-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK19-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK19-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK19-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK19-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK19-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK19-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK19-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK19-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; Over the next few instructions - ebp = amount & 60 (dword-aligned byte offset
; into the stack buffer) and ecx = (amount << 3) & 24 (bit shift within a
; dword). Each shrdl then shifts a dword right by %cl, filling the vacated high
; bits from the next-higher dword.
; FALLBACK19-NEXT:    movl %ecx, %ebp
; FALLBACK19-NEXT:    andl $60, %ebp
; FALLBACK19-NEXT:    movl 56(%esp,%ebp), %edx
; FALLBACK19-NEXT:    movl 52(%esp,%ebp), %eax
; FALLBACK19-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK19-NEXT:    shll $3, %ecx
; FALLBACK19-NEXT:    andl $24, %ecx
; FALLBACK19-NEXT:    shrdl %cl, %edx, %eax
; FALLBACK19-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK19-NEXT:    movl 64(%esp,%ebp), %edi
; FALLBACK19-NEXT:    movl 60(%esp,%ebp), %eax
; FALLBACK19-NEXT:    movl %eax, %esi
; FALLBACK19-NEXT:    shrdl %cl, %edi, %esi
; FALLBACK19-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK19-NEXT:    shrdl %cl, %eax, %edx
; FALLBACK19-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK19-NEXT:    movl 72(%esp,%ebp), %esi
; FALLBACK19-NEXT:    movl 68(%esp,%ebp), %eax
; FALLBACK19-NEXT:    movl %eax, %edx
; FALLBACK19-NEXT:    shrdl %cl, %esi, %edx
; FALLBACK19-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK19-NEXT:    shrdl %cl, %eax, %edi
; FALLBACK19-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK19-NEXT:    movl 80(%esp,%ebp), %edi
; FALLBACK19-NEXT:    movl 76(%esp,%ebp), %eax
; FALLBACK19-NEXT:    movl %eax, %edx
; FALLBACK19-NEXT:    shrdl %cl, %edi, %edx
; FALLBACK19-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK19-NEXT:    shrdl %cl, %eax, %esi
; FALLBACK19-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK19-NEXT:    movl 88(%esp,%ebp), %ebx
; FALLBACK19-NEXT:    movl 84(%esp,%ebp), %eax
; FALLBACK19-NEXT:    movl %eax, %edx
; FALLBACK19-NEXT:    shrdl %cl, %ebx, %edx
; FALLBACK19-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK19-NEXT:    shrdl %cl, %eax, %edi
; FALLBACK19-NEXT:    movl %edi, (%esp) # 4-byte Spill
; FALLBACK19-NEXT:    movl 96(%esp,%ebp), %esi
; FALLBACK19-NEXT:    movl 92(%esp,%ebp), %eax
; FALLBACK19-NEXT:    movl %eax, %edx
; FALLBACK19-NEXT:    shrdl %cl, %esi, %edx
; FALLBACK19-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK19-NEXT:    shrdl %cl, %eax, %ebx
; FALLBACK19-NEXT:    movl 104(%esp,%ebp), %eax
; FALLBACK19-NEXT:    movl 100(%esp,%ebp), %edi
; FALLBACK19-NEXT:    movl %edi, %edx
; FALLBACK19-NEXT:    shrdl %cl, %eax, %edx
; FALLBACK19-NEXT:    shrdl %cl, %edi, %esi
; FALLBACK19-NEXT:    movl 48(%esp,%ebp), %edi
; FALLBACK19-NEXT:    movl 108(%esp,%ebp), %ebp
; FALLBACK19-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK19-NEXT:    shrdl %cl, %ebp, %eax
; Store the result dwords through the destination pointer. The top dword
; (offset 60) comes from a plain shrxl of a reloaded spill - presumably the
; dword loaded from displacement 108, since the spill slot is wildcarded.
; FALLBACK19-NEXT:    movl {{[0-9]+}}(%esp), %ebp
; FALLBACK19-NEXT:    movl %eax, 56(%ebp)
; FALLBACK19-NEXT:    movl %esi, 48(%ebp)
; FALLBACK19-NEXT:    movl %edx, 52(%ebp)
; FALLBACK19-NEXT:    movl %ebx, 40(%ebp)
; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK19-NEXT:    movl %eax, 44(%ebp)
; FALLBACK19-NEXT:    movl (%esp), %eax # 4-byte Reload
; FALLBACK19-NEXT:    movl %eax, 32(%ebp)
; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK19-NEXT:    movl %eax, 36(%ebp)
; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK19-NEXT:    movl %eax, 24(%ebp)
; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK19-NEXT:    movl %eax, 28(%ebp)
; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK19-NEXT:    movl %eax, 16(%ebp)
; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK19-NEXT:    movl %eax, 20(%ebp)
; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK19-NEXT:    movl %eax, 8(%ebp)
; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK19-NEXT:    movl %eax, 12(%ebp)
; FALLBACK19-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; FALLBACK19-NEXT:    # kill: def $cl killed $cl killed $ecx
; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; FALLBACK19-NEXT:    shrdl %cl, %edx, %edi
; FALLBACK19-NEXT:    movl %edi, (%ebp)
; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK19-NEXT:    movl %ecx, 4(%ebp)
; FALLBACK19-NEXT:    movl %eax, 60(%ebp)
; FALLBACK19-NEXT:    addl $188, %esp
; FALLBACK19-NEXT:    popl %esi
; FALLBACK19-NEXT:    popl %edi
; FALLBACK19-NEXT:    popl %ebx
; FALLBACK19-NEXT:    popl %ebp
; FALLBACK19-NEXT:    retl
;
; Expected 32-bit x86 assembly for lshr_64bytes under the FALLBACK20 check prefix.
; Review note - the RUN line for this prefix is outside the visible part of the
; file; the plain shrl/shll by %cl (no shrx/shlx, no shrdl) and the 16-byte
; vector copies suggest no BMI2, slow SHLD and a newer SSE level than the
; scalar-copy variants - TODO confirm against the RUN lines.
; Shape of the checked code - copy the 64-byte source plus 64 zero bytes to the
; stack with vector moves, index that buffer by the byte offset, and assemble
; each output dword from two adjacent loaded dwords.
; FALLBACK20-LABEL: lshr_64bytes:
; FALLBACK20:       # %bb.0:
; FALLBACK20-NEXT:    pushl %ebp
; FALLBACK20-NEXT:    pushl %ebx
; FALLBACK20-NEXT:    pushl %edi
; FALLBACK20-NEXT:    pushl %esi
; FALLBACK20-NEXT:    subl $204, %esp
; Load two pointers from the stack, read the 64-byte source as four unaligned
; 16-byte vectors through %ecx, and read the shift-amount dword through %eax.
; FALLBACK20-NEXT:    movl {{[0-9]+}}(%esp), %eax
; FALLBACK20-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; FALLBACK20-NEXT:    movups (%ecx), %xmm0
; FALLBACK20-NEXT:    movups 16(%ecx), %xmm1
; FALLBACK20-NEXT:    movups 32(%ecx), %xmm2
; FALLBACK20-NEXT:    movups 48(%ecx), %xmm3
; FALLBACK20-NEXT:    movl (%eax), %eax
; Zero four 16-byte stack slots, then store the four source vectors to stack
; slots. Exact slot offsets are wildcarded; presumably the zeros sit directly
; above the value so that indexed loads past its end read zero.
; FALLBACK20-NEXT:    xorps %xmm4, %xmm4
; FALLBACK20-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
; FALLBACK20-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
; FALLBACK20-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
; FALLBACK20-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
; FALLBACK20-NEXT:    movaps %xmm3, {{[0-9]+}}(%esp)
; FALLBACK20-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
; FALLBACK20-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
; FALLBACK20-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
; Over the next few instructions - esi = amount & 60 (dword-aligned byte offset
; into the stack buffer) and eax = (amount << 3) & 24 (bit shift within a
; dword).
; FALLBACK20-NEXT:    movl %eax, %esi
; FALLBACK20-NEXT:    andl $60, %esi
; FALLBACK20-NEXT:    movl 68(%esp,%esi), %edx
; FALLBACK20-NEXT:    shll $3, %eax
; FALLBACK20-NEXT:    andl $24, %eax
; Each output dword below is (lo >> shift) | ((2 * hi) << ~shift). %cl is
; reloaded before every shift, alternating between the shift and its
; complement, which is kept in %ch (notb) and spilled/reloaded as one byte.
; FALLBACK20-NEXT:    movl %edx, %edi
; FALLBACK20-NEXT:    movl %eax, %ecx
; FALLBACK20-NEXT:    shrl %cl, %edi
; FALLBACK20-NEXT:    movl 72(%esp,%esi), %ecx
; FALLBACK20-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK20-NEXT:    leal (%ecx,%ecx), %ebx
; FALLBACK20-NEXT:    movb %al, %ch
; FALLBACK20-NEXT:    notb %ch
; FALLBACK20-NEXT:    movb %ch, %cl
; FALLBACK20-NEXT:    shll %cl, %ebx
; FALLBACK20-NEXT:    orl %edi, %ebx
; FALLBACK20-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK20-NEXT:    movl 64(%esp,%esi), %edi
; FALLBACK20-NEXT:    movb %al, %cl
; FALLBACK20-NEXT:    shrl %cl, %edi
; FALLBACK20-NEXT:    addl %edx, %edx
; FALLBACK20-NEXT:    movb %ch, %cl
; FALLBACK20-NEXT:    shll %cl, %edx
; FALLBACK20-NEXT:    orl %edi, %edx
; FALLBACK20-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK20-NEXT:    movl 76(%esp,%esi), %edx
; FALLBACK20-NEXT:    movl %edx, %ebp
; FALLBACK20-NEXT:    movb %al, %cl
; FALLBACK20-NEXT:    shrl %cl, %ebp
; FALLBACK20-NEXT:    movl 80(%esp,%esi), %edi
; FALLBACK20-NEXT:    leal (%edi,%edi), %ebx
; FALLBACK20-NEXT:    movb %ch, %cl
; FALLBACK20-NEXT:    shll %cl, %ebx
; FALLBACK20-NEXT:    orl %ebp, %ebx
; FALLBACK20-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK20-NEXT:    movb %al, %cl
; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
; FALLBACK20-NEXT:    shrl %cl, %ebx
; FALLBACK20-NEXT:    addl %edx, %edx
; FALLBACK20-NEXT:    movb %ch, %cl
; FALLBACK20-NEXT:    shll %cl, %edx
; FALLBACK20-NEXT:    orl %ebx, %edx
; FALLBACK20-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK20-NEXT:    movl 84(%esp,%esi), %ebx
; FALLBACK20-NEXT:    movl %ebx, %ebp
; FALLBACK20-NEXT:    movl %eax, %edx
; FALLBACK20-NEXT:    movb %dl, %cl
; FALLBACK20-NEXT:    shrl %cl, %ebp
; FALLBACK20-NEXT:    movl 88(%esp,%esi), %eax
; FALLBACK20-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK20-NEXT:    addl %eax, %eax
; FALLBACK20-NEXT:    movb %ch, %cl
; FALLBACK20-NEXT:    shll %cl, %eax
; FALLBACK20-NEXT:    orl %ebp, %eax
; FALLBACK20-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK20-NEXT:    movb %dl, %cl
; FALLBACK20-NEXT:    shrl %cl, %edi
; FALLBACK20-NEXT:    addl %ebx, %ebx
; FALLBACK20-NEXT:    movb %ch, %cl
; FALLBACK20-NEXT:    movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
; FALLBACK20-NEXT:    shll %cl, %ebx
; FALLBACK20-NEXT:    orl %edi, %ebx
; FALLBACK20-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK20-NEXT:    movl 92(%esp,%esi), %ebx
; FALLBACK20-NEXT:    movl %ebx, %ebp
; FALLBACK20-NEXT:    movb %dl, %cl
; FALLBACK20-NEXT:    shrl %cl, %ebp
; FALLBACK20-NEXT:    movl 96(%esp,%esi), %edi
; FALLBACK20-NEXT:    leal (%edi,%edi), %eax
; FALLBACK20-NEXT:    movb %ch, %cl
; FALLBACK20-NEXT:    shll %cl, %eax
; FALLBACK20-NEXT:    orl %ebp, %eax
; FALLBACK20-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK20-NEXT:    movb %dl, %cl
; FALLBACK20-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK20-NEXT:    shrl %cl, %eax
; FALLBACK20-NEXT:    addl %ebx, %ebx
; FALLBACK20-NEXT:    movb %ch, %cl
; FALLBACK20-NEXT:    shll %cl, %ebx
; FALLBACK20-NEXT:    orl %eax, %ebx
; FALLBACK20-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK20-NEXT:    movl 100(%esp,%esi), %ebx
; FALLBACK20-NEXT:    movl %ebx, %ebp
; FALLBACK20-NEXT:    movb %dl, %cl
; FALLBACK20-NEXT:    shrl %cl, %ebp
; FALLBACK20-NEXT:    movl 104(%esp,%esi), %edx
; FALLBACK20-NEXT:    leal (%edx,%edx), %eax
; FALLBACK20-NEXT:    movb %ch, %cl
; FALLBACK20-NEXT:    shll %cl, %eax
; FALLBACK20-NEXT:    orl %ebp, %eax
; FALLBACK20-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK20-NEXT:    movb %al, %cl
; FALLBACK20-NEXT:    shrl %cl, %edi
; FALLBACK20-NEXT:    addl %ebx, %ebx
; FALLBACK20-NEXT:    movb %ch, %cl
; FALLBACK20-NEXT:    shll %cl, %ebx
; FALLBACK20-NEXT:    orl %edi, %ebx
; FALLBACK20-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK20-NEXT:    movl 108(%esp,%esi), %edi
; FALLBACK20-NEXT:    movl %edi, %ebp
; FALLBACK20-NEXT:    movl %eax, %ecx
; FALLBACK20-NEXT:    shrl %cl, %ebp
; FALLBACK20-NEXT:    movl 112(%esp,%esi), %ecx
; FALLBACK20-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK20-NEXT:    leal (%ecx,%ecx), %ebx
; FALLBACK20-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
; FALLBACK20-NEXT:    movb %ch, %cl
; FALLBACK20-NEXT:    shll %cl, %ebx
; FALLBACK20-NEXT:    orl %ebp, %ebx
; FALLBACK20-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK20-NEXT:    movb %al, %cl
; FALLBACK20-NEXT:    shrl %cl, %edx
; FALLBACK20-NEXT:    addl %edi, %edi
; FALLBACK20-NEXT:    movb %ch, %cl
; FALLBACK20-NEXT:    shll %cl, %edi
; FALLBACK20-NEXT:    orl %edx, %edi
; FALLBACK20-NEXT:    movl %esi, %edx
; FALLBACK20-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK20-NEXT:    movl 116(%esp,%esi), %esi
; FALLBACK20-NEXT:    movl %esi, %ebx
; FALLBACK20-NEXT:    movb %al, %cl
; FALLBACK20-NEXT:    shrl %cl, %ebx
; FALLBACK20-NEXT:    movl 120(%esp,%edx), %eax
; FALLBACK20-NEXT:    leal (%eax,%eax), %ebp
; FALLBACK20-NEXT:    movb %ch, %cl
; FALLBACK20-NEXT:    shll %cl, %ebp
; FALLBACK20-NEXT:    orl %ebx, %ebp
; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; FALLBACK20-NEXT:    movb %dl, %cl
; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
; FALLBACK20-NEXT:    shrl %cl, %ebx
; FALLBACK20-NEXT:    addl %esi, %esi
; FALLBACK20-NEXT:    movb %ch, %cl
; FALLBACK20-NEXT:    shll %cl, %esi
; FALLBACK20-NEXT:    orl %ebx, %esi
; FALLBACK20-NEXT:    movb %dl, %cl
; FALLBACK20-NEXT:    shrl %cl, %eax
; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; FALLBACK20-NEXT:    movl 124(%esp,%edx), %ebx
; FALLBACK20-NEXT:    leal (%ebx,%ebx), %edx
; FALLBACK20-NEXT:    movb %ch, %cl
; FALLBACK20-NEXT:    shll %cl, %edx
; FALLBACK20-NEXT:    orl %eax, %edx
; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK20-NEXT:    # kill: def $cl killed $cl killed $ecx
; FALLBACK20-NEXT:    shrl %cl, %ebx
; Store the 16 result dwords through the destination pointer. The top dword
; (offset 60) is the plain shrl of the dword loaded at displacement 124.
; FALLBACK20-NEXT:    movl {{[0-9]+}}(%esp), %eax
; FALLBACK20-NEXT:    movl %ebx, 60(%eax)
; FALLBACK20-NEXT:    movl %edx, 56(%eax)
; FALLBACK20-NEXT:    movl %esi, 48(%eax)
; FALLBACK20-NEXT:    movl %ebp, 52(%eax)
; FALLBACK20-NEXT:    movl %edi, 40(%eax)
; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK20-NEXT:    movl %ecx, 44(%eax)
; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK20-NEXT:    movl %ecx, 32(%eax)
; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK20-NEXT:    movl %ecx, 36(%eax)
; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK20-NEXT:    movl %ecx, 24(%eax)
; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK20-NEXT:    movl %ecx, 28(%eax)
; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK20-NEXT:    movl %ecx, 16(%eax)
; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK20-NEXT:    movl %ecx, 20(%eax)
; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK20-NEXT:    movl %ecx, 8(%eax)
; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK20-NEXT:    movl %ecx, 12(%eax)
; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK20-NEXT:    movl %ecx, (%eax)
; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK20-NEXT:    movl %ecx, 4(%eax)
; FALLBACK20-NEXT:    addl $204, %esp
; FALLBACK20-NEXT:    popl %esi
; FALLBACK20-NEXT:    popl %edi
; FALLBACK20-NEXT:    popl %ebx
; FALLBACK20-NEXT:    popl %ebp
; FALLBACK20-NEXT:    retl
;
; FALLBACK21-LABEL: lshr_64bytes:
; FALLBACK21:       # %bb.0:
; FALLBACK21-NEXT:    pushl %ebp
; FALLBACK21-NEXT:    pushl %ebx
; FALLBACK21-NEXT:    pushl %edi
; FALLBACK21-NEXT:    pushl %esi
; FALLBACK21-NEXT:    subl $188, %esp
; FALLBACK21-NEXT:    movl {{[0-9]+}}(%esp), %eax
; FALLBACK21-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; FALLBACK21-NEXT:    movups (%ecx), %xmm0
; FALLBACK21-NEXT:    movups 16(%ecx), %xmm1
; FALLBACK21-NEXT:    movups 32(%ecx), %xmm2
; FALLBACK21-NEXT:    movups 48(%ecx), %xmm3
; FALLBACK21-NEXT:    movl (%eax), %ecx
; FALLBACK21-NEXT:    xorps %xmm4, %xmm4
; FALLBACK21-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
; FALLBACK21-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
; FALLBACK21-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
; FALLBACK21-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
; FALLBACK21-NEXT:    movaps %xmm3, {{[0-9]+}}(%esp)
; FALLBACK21-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
; FALLBACK21-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
; FALLBACK21-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
; FALLBACK21-NEXT:    movl %ecx, %ebp
; FALLBACK21-NEXT:    andl $60, %ebp
; FALLBACK21-NEXT:    movl 56(%esp,%ebp), %edx
; FALLBACK21-NEXT:    movl 52(%esp,%ebp), %eax
; FALLBACK21-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK21-NEXT:    shll $3, %ecx
; FALLBACK21-NEXT:    andl $24, %ecx
; FALLBACK21-NEXT:    shrdl %cl, %edx, %eax
; FALLBACK21-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK21-NEXT:    movl 64(%esp,%ebp), %edi
; FALLBACK21-NEXT:    movl 60(%esp,%ebp), %eax
; FALLBACK21-NEXT:    movl %eax, %esi
; FALLBACK21-NEXT:    shrdl %cl, %edi, %esi
; FALLBACK21-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK21-NEXT:    shrdl %cl, %eax, %edx
; FALLBACK21-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK21-NEXT:    movl 72(%esp,%ebp), %esi
; FALLBACK21-NEXT:    movl 68(%esp,%ebp), %eax
; FALLBACK21-NEXT:    movl %eax, %edx
; FALLBACK21-NEXT:    shrdl %cl, %esi, %edx
; FALLBACK21-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK21-NEXT:    shrdl %cl, %eax, %edi
; FALLBACK21-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK21-NEXT:    movl 80(%esp,%ebp), %edi
; FALLBACK21-NEXT:    movl 76(%esp,%ebp), %eax
; FALLBACK21-NEXT:    movl %eax, %edx
; FALLBACK21-NEXT:    shrdl %cl, %edi, %edx
; FALLBACK21-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK21-NEXT:    shrdl %cl, %eax, %esi
; FALLBACK21-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK21-NEXT:    movl 88(%esp,%ebp), %esi
; FALLBACK21-NEXT:    movl 84(%esp,%ebp), %eax
; FALLBACK21-NEXT:    movl %eax, %edx
; FALLBACK21-NEXT:    shrdl %cl, %esi, %edx
; FALLBACK21-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK21-NEXT:    movl %esi, %edx
; FALLBACK21-NEXT:    shrdl %cl, %eax, %edi
; FALLBACK21-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK21-NEXT:    movl 96(%esp,%ebp), %esi
; FALLBACK21-NEXT:    movl 92(%esp,%ebp), %eax
; FALLBACK21-NEXT:    movl %eax, %edi
; FALLBACK21-NEXT:    shrdl %cl, %esi, %edi
; FALLBACK21-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK21-NEXT:    shrdl %cl, %eax, %edx
; FALLBACK21-NEXT:    movl %edx, (%esp) # 4-byte Spill
; FALLBACK21-NEXT:    movl 104(%esp,%ebp), %edx
; FALLBACK21-NEXT:    movl 100(%esp,%ebp), %eax
; FALLBACK21-NEXT:    movl %eax, %edi
; FALLBACK21-NEXT:    shrdl %cl, %edx, %edi
; FALLBACK21-NEXT:    shrdl %cl, %eax, %esi
; FALLBACK21-NEXT:    movl 48(%esp,%ebp), %ebx
; FALLBACK21-NEXT:    movl 108(%esp,%ebp), %eax
; FALLBACK21-NEXT:    shrdl %cl, %eax, %edx
; FALLBACK21-NEXT:    movl {{[0-9]+}}(%esp), %ebp
; FALLBACK21-NEXT:    movl %edx, 56(%ebp)
; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; FALLBACK21-NEXT:    shrdl %cl, %edx, %ebx
; FALLBACK21-NEXT:    # kill: def $cl killed $cl killed $ecx
; FALLBACK21-NEXT:    shrl %cl, %eax
; FALLBACK21-NEXT:    movl %eax, 60(%ebp)
; FALLBACK21-NEXT:    movl %esi, 48(%ebp)
; FALLBACK21-NEXT:    movl %edi, 52(%ebp)
; FALLBACK21-NEXT:    movl (%esp), %eax # 4-byte Reload
; FALLBACK21-NEXT:    movl %eax, 40(%ebp)
; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK21-NEXT:    movl %eax, 44(%ebp)
; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK21-NEXT:    movl %eax, 32(%ebp)
; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK21-NEXT:    movl %eax, 36(%ebp)
; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK21-NEXT:    movl %eax, 24(%ebp)
; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK21-NEXT:    movl %eax, 28(%ebp)
; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK21-NEXT:    movl %eax, 16(%ebp)
; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK21-NEXT:    movl %eax, 20(%ebp)
; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK21-NEXT:    movl %eax, 8(%ebp)
; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK21-NEXT:    movl %eax, 12(%ebp)
; FALLBACK21-NEXT:    movl %ebx, (%ebp)
; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK21-NEXT:    movl %eax, 4(%ebp)
; FALLBACK21-NEXT:    addl $188, %esp
; FALLBACK21-NEXT:    popl %esi
; FALLBACK21-NEXT:    popl %edi
; FALLBACK21-NEXT:    popl %ebx
; FALLBACK21-NEXT:    popl %ebp
; FALLBACK21-NEXT:    retl
;
; FALLBACK22-LABEL: lshr_64bytes:
; FALLBACK22:       # %bb.0:
; FALLBACK22-NEXT:    pushl %ebp
; FALLBACK22-NEXT:    pushl %ebx
; FALLBACK22-NEXT:    pushl %edi
; FALLBACK22-NEXT:    pushl %esi
; FALLBACK22-NEXT:    subl $204, %esp
; FALLBACK22-NEXT:    movl {{[0-9]+}}(%esp), %eax
; FALLBACK22-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; FALLBACK22-NEXT:    movups (%ecx), %xmm0
; FALLBACK22-NEXT:    movups 16(%ecx), %xmm1
; FALLBACK22-NEXT:    movups 32(%ecx), %xmm2
; FALLBACK22-NEXT:    movups 48(%ecx), %xmm3
; FALLBACK22-NEXT:    movl (%eax), %ecx
; FALLBACK22-NEXT:    xorps %xmm4, %xmm4
; FALLBACK22-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
; FALLBACK22-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
; FALLBACK22-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
; FALLBACK22-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
; FALLBACK22-NEXT:    movaps %xmm3, {{[0-9]+}}(%esp)
; FALLBACK22-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
; FALLBACK22-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
; FALLBACK22-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
; FALLBACK22-NEXT:    leal (,%ecx,8), %edx
; FALLBACK22-NEXT:    andl $24, %edx
; FALLBACK22-NEXT:    andl $60, %ecx
; FALLBACK22-NEXT:    movl 68(%esp,%ecx), %esi
; FALLBACK22-NEXT:    movl 72(%esp,%ecx), %eax
; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK22-NEXT:    shrxl %edx, %esi, %edi
; FALLBACK22-NEXT:    movl %edx, %ebx
; FALLBACK22-NEXT:    notb %bl
; FALLBACK22-NEXT:    leal (%eax,%eax), %ebp
; FALLBACK22-NEXT:    shlxl %ebx, %ebp, %ebp
; FALLBACK22-NEXT:    orl %edi, %ebp
; FALLBACK22-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK22-NEXT:    shrxl %edx, 64(%esp,%ecx), %edi
; FALLBACK22-NEXT:    addl %esi, %esi
; FALLBACK22-NEXT:    shlxl %ebx, %esi, %esi
; FALLBACK22-NEXT:    orl %edi, %esi
; FALLBACK22-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK22-NEXT:    movl 80(%esp,%ecx), %esi
; FALLBACK22-NEXT:    leal (%esi,%esi), %edi
; FALLBACK22-NEXT:    shlxl %ebx, %edi, %eax
; FALLBACK22-NEXT:    movl 76(%esp,%ecx), %edi
; FALLBACK22-NEXT:    shrxl %edx, %edi, %ebp
; FALLBACK22-NEXT:    orl %ebp, %eax
; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK22-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; FALLBACK22-NEXT:    addl %edi, %edi
; FALLBACK22-NEXT:    shlxl %ebx, %edi, %edi
; FALLBACK22-NEXT:    orl %eax, %edi
; FALLBACK22-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK22-NEXT:    movl 88(%esp,%ecx), %eax
; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK22-NEXT:    leal (%eax,%eax), %edi
; FALLBACK22-NEXT:    shlxl %ebx, %edi, %eax
; FALLBACK22-NEXT:    movl 84(%esp,%ecx), %edi
; FALLBACK22-NEXT:    shrxl %edx, %edi, %ebp
; FALLBACK22-NEXT:    orl %ebp, %eax
; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK22-NEXT:    shrxl %edx, %esi, %esi
; FALLBACK22-NEXT:    addl %edi, %edi
; FALLBACK22-NEXT:    shlxl %ebx, %edi, %eax
; FALLBACK22-NEXT:    orl %esi, %eax
; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK22-NEXT:    movl 96(%esp,%ecx), %esi
; FALLBACK22-NEXT:    leal (%esi,%esi), %edi
; FALLBACK22-NEXT:    shlxl %ebx, %edi, %eax
; FALLBACK22-NEXT:    movl 92(%esp,%ecx), %edi
; FALLBACK22-NEXT:    shrxl %edx, %edi, %ebp
; FALLBACK22-NEXT:    orl %ebp, %eax
; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK22-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; FALLBACK22-NEXT:    addl %edi, %edi
; FALLBACK22-NEXT:    shlxl %ebx, %edi, %edi
; FALLBACK22-NEXT:    orl %eax, %edi
; FALLBACK22-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK22-NEXT:    movl 104(%esp,%ecx), %eax
; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK22-NEXT:    leal (%eax,%eax), %edi
; FALLBACK22-NEXT:    shlxl %ebx, %edi, %eax
; FALLBACK22-NEXT:    movl 100(%esp,%ecx), %edi
; FALLBACK22-NEXT:    shrxl %edx, %edi, %ebp
; FALLBACK22-NEXT:    orl %ebp, %eax
; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK22-NEXT:    shrxl %edx, %esi, %esi
; FALLBACK22-NEXT:    addl %edi, %edi
; FALLBACK22-NEXT:    shlxl %ebx, %edi, %eax
; FALLBACK22-NEXT:    orl %esi, %eax
; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK22-NEXT:    movl %ecx, %eax
; FALLBACK22-NEXT:    movl 112(%esp,%ecx), %ecx
; FALLBACK22-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK22-NEXT:    leal (%ecx,%ecx), %esi
; FALLBACK22-NEXT:    shlxl %ebx, %esi, %ecx
; FALLBACK22-NEXT:    movl 108(%esp,%eax), %esi
; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK22-NEXT:    shrxl %edx, %esi, %ebp
; FALLBACK22-NEXT:    orl %ebp, %ecx
; FALLBACK22-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK22-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
; FALLBACK22-NEXT:    addl %esi, %esi
; FALLBACK22-NEXT:    shlxl %ebx, %esi, %esi
; FALLBACK22-NEXT:    orl %ecx, %esi
; FALLBACK22-NEXT:    movl 120(%esp,%eax), %ebp
; FALLBACK22-NEXT:    leal (%ebp,%ebp), %ecx
; FALLBACK22-NEXT:    shlxl %ebx, %ecx, %ecx
; FALLBACK22-NEXT:    movl 116(%esp,%eax), %eax
; FALLBACK22-NEXT:    shrxl %edx, %eax, %edi
; FALLBACK22-NEXT:    orl %edi, %ecx
; FALLBACK22-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
; FALLBACK22-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK22-NEXT:    addl %eax, %eax
; FALLBACK22-NEXT:    shlxl %ebx, %eax, %edi
; FALLBACK22-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
; FALLBACK22-NEXT:    shrxl %edx, %ebp, %eax
; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
; FALLBACK22-NEXT:    movl 124(%esp,%ebp), %ebp
; FALLBACK22-NEXT:    shrxl %edx, %ebp, %edx
; FALLBACK22-NEXT:    addl %ebp, %ebp
; FALLBACK22-NEXT:    shlxl %ebx, %ebp, %ebx
; FALLBACK22-NEXT:    orl %eax, %ebx
; FALLBACK22-NEXT:    movl {{[0-9]+}}(%esp), %eax
; FALLBACK22-NEXT:    movl %edx, 60(%eax)
; FALLBACK22-NEXT:    movl %ebx, 56(%eax)
; FALLBACK22-NEXT:    movl %edi, 48(%eax)
; FALLBACK22-NEXT:    movl %ecx, 52(%eax)
; FALLBACK22-NEXT:    movl %esi, 40(%eax)
; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK22-NEXT:    movl %ecx, 44(%eax)
; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK22-NEXT:    movl %ecx, 32(%eax)
; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK22-NEXT:    movl %ecx, 36(%eax)
; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK22-NEXT:    movl %ecx, 24(%eax)
; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK22-NEXT:    movl %ecx, 28(%eax)
; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK22-NEXT:    movl %ecx, 16(%eax)
; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK22-NEXT:    movl %ecx, 20(%eax)
; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK22-NEXT:    movl %ecx, 8(%eax)
; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK22-NEXT:    movl %ecx, 12(%eax)
; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK22-NEXT:    movl %ecx, (%eax)
; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK22-NEXT:    movl %ecx, 4(%eax)
; FALLBACK22-NEXT:    addl $204, %esp
; FALLBACK22-NEXT:    popl %esi
; FALLBACK22-NEXT:    popl %edi
; FALLBACK22-NEXT:    popl %ebx
; FALLBACK22-NEXT:    popl %ebp
; FALLBACK22-NEXT:    retl
;
; FALLBACK23-LABEL: lshr_64bytes:
; FALLBACK23:       # %bb.0:
; FALLBACK23-NEXT:    pushl %ebp
; FALLBACK23-NEXT:    pushl %ebx
; FALLBACK23-NEXT:    pushl %edi
; FALLBACK23-NEXT:    pushl %esi
; FALLBACK23-NEXT:    subl $188, %esp
; FALLBACK23-NEXT:    movl {{[0-9]+}}(%esp), %eax
; FALLBACK23-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; FALLBACK23-NEXT:    movups (%ecx), %xmm0
; FALLBACK23-NEXT:    movups 16(%ecx), %xmm1
; FALLBACK23-NEXT:    movups 32(%ecx), %xmm2
; FALLBACK23-NEXT:    movups 48(%ecx), %xmm3
; FALLBACK23-NEXT:    movl (%eax), %ecx
; FALLBACK23-NEXT:    xorps %xmm4, %xmm4
; FALLBACK23-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
; FALLBACK23-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
; FALLBACK23-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
; FALLBACK23-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
; FALLBACK23-NEXT:    movaps %xmm3, {{[0-9]+}}(%esp)
; FALLBACK23-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
; FALLBACK23-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
; FALLBACK23-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
; FALLBACK23-NEXT:    movl %ecx, %ebp
; FALLBACK23-NEXT:    andl $60, %ebp
; FALLBACK23-NEXT:    movl 56(%esp,%ebp), %edx
; FALLBACK23-NEXT:    movl 52(%esp,%ebp), %eax
; FALLBACK23-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK23-NEXT:    shll $3, %ecx
; FALLBACK23-NEXT:    andl $24, %ecx
; FALLBACK23-NEXT:    shrdl %cl, %edx, %eax
; FALLBACK23-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK23-NEXT:    movl 64(%esp,%ebp), %edi
; FALLBACK23-NEXT:    movl 60(%esp,%ebp), %eax
; FALLBACK23-NEXT:    movl %eax, %esi
; FALLBACK23-NEXT:    shrdl %cl, %edi, %esi
; FALLBACK23-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK23-NEXT:    shrdl %cl, %eax, %edx
; FALLBACK23-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK23-NEXT:    movl 72(%esp,%ebp), %esi
; FALLBACK23-NEXT:    movl 68(%esp,%ebp), %eax
; FALLBACK23-NEXT:    movl %eax, %edx
; FALLBACK23-NEXT:    shrdl %cl, %esi, %edx
; FALLBACK23-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK23-NEXT:    shrdl %cl, %eax, %edi
; FALLBACK23-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK23-NEXT:    movl 80(%esp,%ebp), %edi
; FALLBACK23-NEXT:    movl 76(%esp,%ebp), %eax
; FALLBACK23-NEXT:    movl %eax, %edx
; FALLBACK23-NEXT:    shrdl %cl, %edi, %edx
; FALLBACK23-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK23-NEXT:    shrdl %cl, %eax, %esi
; FALLBACK23-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK23-NEXT:    movl 88(%esp,%ebp), %ebx
; FALLBACK23-NEXT:    movl 84(%esp,%ebp), %eax
; FALLBACK23-NEXT:    movl %eax, %edx
; FALLBACK23-NEXT:    shrdl %cl, %ebx, %edx
; FALLBACK23-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK23-NEXT:    shrdl %cl, %eax, %edi
; FALLBACK23-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK23-NEXT:    movl 96(%esp,%ebp), %esi
; FALLBACK23-NEXT:    movl 92(%esp,%ebp), %eax
; FALLBACK23-NEXT:    movl %eax, %edx
; FALLBACK23-NEXT:    shrdl %cl, %esi, %edx
; FALLBACK23-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK23-NEXT:    shrdl %cl, %eax, %ebx
; FALLBACK23-NEXT:    movl 104(%esp,%ebp), %eax
; FALLBACK23-NEXT:    movl 100(%esp,%ebp), %edi
; FALLBACK23-NEXT:    movl %edi, %edx
; FALLBACK23-NEXT:    shrdl %cl, %eax, %edx
; FALLBACK23-NEXT:    shrdl %cl, %edi, %esi
; FALLBACK23-NEXT:    movl 48(%esp,%ebp), %edi
; FALLBACK23-NEXT:    movl 108(%esp,%ebp), %ebp
; FALLBACK23-NEXT:    movl %ebp, (%esp) # 4-byte Spill
; FALLBACK23-NEXT:    shrdl %cl, %ebp, %eax
; FALLBACK23-NEXT:    movl {{[0-9]+}}(%esp), %ebp
; FALLBACK23-NEXT:    movl %eax, 56(%ebp)
; FALLBACK23-NEXT:    movl %esi, 48(%ebp)
; FALLBACK23-NEXT:    movl %edx, 52(%ebp)
; FALLBACK23-NEXT:    movl %ebx, 40(%ebp)
; FALLBACK23-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK23-NEXT:    movl %eax, 44(%ebp)
; FALLBACK23-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK23-NEXT:    movl %eax, 32(%ebp)
; FALLBACK23-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK23-NEXT:    movl %eax, 36(%ebp)
; FALLBACK23-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK23-NEXT:    movl %eax, 24(%ebp)
; FALLBACK23-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK23-NEXT:    movl %eax, 28(%ebp)
; FALLBACK23-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK23-NEXT:    movl %eax, 16(%ebp)
; FALLBACK23-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK23-NEXT:    movl %eax, 20(%ebp)
; FALLBACK23-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK23-NEXT:    movl %eax, 8(%ebp)
; FALLBACK23-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK23-NEXT:    movl %eax, 12(%ebp)
; FALLBACK23-NEXT:    shrxl %ecx, (%esp), %eax # 4-byte Folded Reload
; FALLBACK23-NEXT:    # kill: def $cl killed $cl killed $ecx
; FALLBACK23-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; FALLBACK23-NEXT:    shrdl %cl, %edx, %edi
; FALLBACK23-NEXT:    movl %edi, (%ebp)
; FALLBACK23-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK23-NEXT:    movl %ecx, 4(%ebp)
; FALLBACK23-NEXT:    movl %eax, 60(%ebp)
; FALLBACK23-NEXT:    addl $188, %esp
; FALLBACK23-NEXT:    popl %esi
; FALLBACK23-NEXT:    popl %edi
; FALLBACK23-NEXT:    popl %ebx
; FALLBACK23-NEXT:    popl %ebp
; FALLBACK23-NEXT:    retl
;
; FALLBACK24-LABEL: lshr_64bytes:
; FALLBACK24:       # %bb.0:
; FALLBACK24-NEXT:    pushl %ebp
; FALLBACK24-NEXT:    pushl %ebx
; FALLBACK24-NEXT:    pushl %edi
; FALLBACK24-NEXT:    pushl %esi
; FALLBACK24-NEXT:    subl $204, %esp
; FALLBACK24-NEXT:    movl {{[0-9]+}}(%esp), %eax
; FALLBACK24-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; FALLBACK24-NEXT:    vmovups (%ecx), %ymm0
; FALLBACK24-NEXT:    vmovups 32(%ecx), %ymm1
; FALLBACK24-NEXT:    movl (%eax), %ecx
; FALLBACK24-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; FALLBACK24-NEXT:    vmovups %ymm2, {{[0-9]+}}(%esp)
; FALLBACK24-NEXT:    vmovups %ymm2, {{[0-9]+}}(%esp)
; FALLBACK24-NEXT:    vmovups %ymm1, {{[0-9]+}}(%esp)
; FALLBACK24-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
; FALLBACK24-NEXT:    movl %ecx, %esi
; FALLBACK24-NEXT:    andl $60, %esi
; FALLBACK24-NEXT:    movl 68(%esp,%esi), %edx
; FALLBACK24-NEXT:    shll $3, %ecx
; FALLBACK24-NEXT:    andl $24, %ecx
; FALLBACK24-NEXT:    movl %edx, %edi
; FALLBACK24-NEXT:    shrl %cl, %edi
; FALLBACK24-NEXT:    movl 72(%esp,%esi), %eax
; FALLBACK24-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK24-NEXT:    leal (%eax,%eax), %ebx
; FALLBACK24-NEXT:    movl %ecx, %ebp
; FALLBACK24-NEXT:    movb %cl, %ch
; FALLBACK24-NEXT:    notb %ch
; FALLBACK24-NEXT:    movb %ch, %cl
; FALLBACK24-NEXT:    shll %cl, %ebx
; FALLBACK24-NEXT:    orl %edi, %ebx
; FALLBACK24-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK24-NEXT:    movl 64(%esp,%esi), %edi
; FALLBACK24-NEXT:    movl %ebp, %eax
; FALLBACK24-NEXT:    movb %al, %cl
; FALLBACK24-NEXT:    shrl %cl, %edi
; FALLBACK24-NEXT:    addl %edx, %edx
; FALLBACK24-NEXT:    movb %ch, %cl
; FALLBACK24-NEXT:    shll %cl, %edx
; FALLBACK24-NEXT:    orl %edi, %edx
; FALLBACK24-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK24-NEXT:    movl 76(%esp,%esi), %edx
; FALLBACK24-NEXT:    movl %edx, %ebp
; FALLBACK24-NEXT:    movb %al, %cl
; FALLBACK24-NEXT:    shrl %cl, %ebp
; FALLBACK24-NEXT:    movl 80(%esp,%esi), %edi
; FALLBACK24-NEXT:    leal (%edi,%edi), %ebx
; FALLBACK24-NEXT:    movb %ch, %cl
; FALLBACK24-NEXT:    shll %cl, %ebx
; FALLBACK24-NEXT:    orl %ebp, %ebx
; FALLBACK24-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK24-NEXT:    movb %al, %cl
; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
; FALLBACK24-NEXT:    shrl %cl, %ebx
; FALLBACK24-NEXT:    addl %edx, %edx
; FALLBACK24-NEXT:    movb %ch, %cl
; FALLBACK24-NEXT:    shll %cl, %edx
; FALLBACK24-NEXT:    orl %ebx, %edx
; FALLBACK24-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK24-NEXT:    movl 84(%esp,%esi), %ebx
; FALLBACK24-NEXT:    movl %ebx, %ebp
; FALLBACK24-NEXT:    movl %eax, %edx
; FALLBACK24-NEXT:    movb %dl, %cl
; FALLBACK24-NEXT:    shrl %cl, %ebp
; FALLBACK24-NEXT:    movl 88(%esp,%esi), %eax
; FALLBACK24-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK24-NEXT:    addl %eax, %eax
; FALLBACK24-NEXT:    movb %ch, %cl
; FALLBACK24-NEXT:    shll %cl, %eax
; FALLBACK24-NEXT:    orl %ebp, %eax
; FALLBACK24-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK24-NEXT:    movb %dl, %cl
; FALLBACK24-NEXT:    shrl %cl, %edi
; FALLBACK24-NEXT:    addl %ebx, %ebx
; FALLBACK24-NEXT:    movb %ch, %cl
; FALLBACK24-NEXT:    movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
; FALLBACK24-NEXT:    shll %cl, %ebx
; FALLBACK24-NEXT:    orl %edi, %ebx
; FALLBACK24-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK24-NEXT:    movl 92(%esp,%esi), %ebx
; FALLBACK24-NEXT:    movl %ebx, %ebp
; FALLBACK24-NEXT:    movb %dl, %cl
; FALLBACK24-NEXT:    shrl %cl, %ebp
; FALLBACK24-NEXT:    movl 96(%esp,%esi), %edi
; FALLBACK24-NEXT:    leal (%edi,%edi), %eax
; FALLBACK24-NEXT:    movb %ch, %cl
; FALLBACK24-NEXT:    shll %cl, %eax
; FALLBACK24-NEXT:    orl %ebp, %eax
; FALLBACK24-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK24-NEXT:    movb %dl, %cl
; FALLBACK24-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK24-NEXT:    shrl %cl, %eax
; FALLBACK24-NEXT:    addl %ebx, %ebx
; FALLBACK24-NEXT:    movb %ch, %cl
; FALLBACK24-NEXT:    shll %cl, %ebx
; FALLBACK24-NEXT:    orl %eax, %ebx
; FALLBACK24-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK24-NEXT:    movl 100(%esp,%esi), %ebx
; FALLBACK24-NEXT:    movl %ebx, %ebp
; FALLBACK24-NEXT:    movb %dl, %cl
; FALLBACK24-NEXT:    shrl %cl, %ebp
; FALLBACK24-NEXT:    movl 104(%esp,%esi), %edx
; FALLBACK24-NEXT:    leal (%edx,%edx), %eax
; FALLBACK24-NEXT:    movb %ch, %cl
; FALLBACK24-NEXT:    shll %cl, %eax
; FALLBACK24-NEXT:    orl %ebp, %eax
; FALLBACK24-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK24-NEXT:    movb %al, %cl
; FALLBACK24-NEXT:    shrl %cl, %edi
; FALLBACK24-NEXT:    addl %ebx, %ebx
; FALLBACK24-NEXT:    movb %ch, %cl
; FALLBACK24-NEXT:    shll %cl, %ebx
; FALLBACK24-NEXT:    orl %edi, %ebx
; FALLBACK24-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK24-NEXT:    movl 108(%esp,%esi), %edi
; FALLBACK24-NEXT:    movl %edi, %ebp
; FALLBACK24-NEXT:    movl %eax, %ecx
; FALLBACK24-NEXT:    shrl %cl, %ebp
; FALLBACK24-NEXT:    movl 112(%esp,%esi), %ecx
; FALLBACK24-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK24-NEXT:    leal (%ecx,%ecx), %ebx
; FALLBACK24-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
; FALLBACK24-NEXT:    movb %ch, %cl
; FALLBACK24-NEXT:    shll %cl, %ebx
; FALLBACK24-NEXT:    orl %ebp, %ebx
; FALLBACK24-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK24-NEXT:    movb %al, %cl
; FALLBACK24-NEXT:    shrl %cl, %edx
; FALLBACK24-NEXT:    addl %edi, %edi
; FALLBACK24-NEXT:    movb %ch, %cl
; FALLBACK24-NEXT:    shll %cl, %edi
; FALLBACK24-NEXT:    orl %edx, %edi
; FALLBACK24-NEXT:    movl %esi, %edx
; FALLBACK24-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK24-NEXT:    movl 116(%esp,%esi), %esi
; FALLBACK24-NEXT:    movl %esi, %ebx
; FALLBACK24-NEXT:    movb %al, %cl
; FALLBACK24-NEXT:    shrl %cl, %ebx
; FALLBACK24-NEXT:    movl 120(%esp,%edx), %eax
; FALLBACK24-NEXT:    leal (%eax,%eax), %ebp
; FALLBACK24-NEXT:    movb %ch, %cl
; FALLBACK24-NEXT:    shll %cl, %ebp
; FALLBACK24-NEXT:    orl %ebx, %ebp
; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; FALLBACK24-NEXT:    movb %dl, %cl
; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
; FALLBACK24-NEXT:    shrl %cl, %ebx
; FALLBACK24-NEXT:    addl %esi, %esi
; FALLBACK24-NEXT:    movb %ch, %cl
; FALLBACK24-NEXT:    shll %cl, %esi
; FALLBACK24-NEXT:    orl %ebx, %esi
; FALLBACK24-NEXT:    movb %dl, %cl
; FALLBACK24-NEXT:    shrl %cl, %eax
; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; FALLBACK24-NEXT:    movl 124(%esp,%edx), %ebx
; FALLBACK24-NEXT:    leal (%ebx,%ebx), %edx
; FALLBACK24-NEXT:    movb %ch, %cl
; FALLBACK24-NEXT:    shll %cl, %edx
; FALLBACK24-NEXT:    orl %eax, %edx
; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK24-NEXT:    # kill: def $cl killed $cl killed $ecx
; FALLBACK24-NEXT:    shrl %cl, %ebx
; FALLBACK24-NEXT:    movl {{[0-9]+}}(%esp), %eax
; FALLBACK24-NEXT:    movl %ebx, 60(%eax)
; FALLBACK24-NEXT:    movl %edx, 56(%eax)
; FALLBACK24-NEXT:    movl %esi, 48(%eax)
; FALLBACK24-NEXT:    movl %ebp, 52(%eax)
; FALLBACK24-NEXT:    movl %edi, 40(%eax)
; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK24-NEXT:    movl %ecx, 44(%eax)
; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK24-NEXT:    movl %ecx, 32(%eax)
; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK24-NEXT:    movl %ecx, 36(%eax)
; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK24-NEXT:    movl %ecx, 24(%eax)
; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK24-NEXT:    movl %ecx, 28(%eax)
; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK24-NEXT:    movl %ecx, 16(%eax)
; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK24-NEXT:    movl %ecx, 20(%eax)
; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK24-NEXT:    movl %ecx, 8(%eax)
; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK24-NEXT:    movl %ecx, 12(%eax)
; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK24-NEXT:    movl %ecx, (%eax)
; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK24-NEXT:    movl %ecx, 4(%eax)
; FALLBACK24-NEXT:    addl $204, %esp
; FALLBACK24-NEXT:    popl %esi
; FALLBACK24-NEXT:    popl %edi
; FALLBACK24-NEXT:    popl %ebx
; FALLBACK24-NEXT:    popl %ebp
; FALLBACK24-NEXT:    vzeroupper
; FALLBACK24-NEXT:    retl
;
; FALLBACK25-LABEL: lshr_64bytes:
; FALLBACK25:       # %bb.0:
; FALLBACK25-NEXT:    pushl %ebp
; FALLBACK25-NEXT:    pushl %ebx
; FALLBACK25-NEXT:    pushl %edi
; FALLBACK25-NEXT:    pushl %esi
; FALLBACK25-NEXT:    subl $188, %esp
; FALLBACK25-NEXT:    movl {{[0-9]+}}(%esp), %eax
; FALLBACK25-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; FALLBACK25-NEXT:    vmovups (%ecx), %ymm0
; FALLBACK25-NEXT:    vmovups 32(%ecx), %ymm1
; FALLBACK25-NEXT:    movl (%eax), %ecx
; FALLBACK25-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; FALLBACK25-NEXT:    vmovups %ymm2, {{[0-9]+}}(%esp)
; FALLBACK25-NEXT:    vmovups %ymm2, {{[0-9]+}}(%esp)
; FALLBACK25-NEXT:    vmovups %ymm1, {{[0-9]+}}(%esp)
; FALLBACK25-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
; FALLBACK25-NEXT:    movl %ecx, %ebp
; FALLBACK25-NEXT:    andl $60, %ebp
; FALLBACK25-NEXT:    movl 56(%esp,%ebp), %edx
; FALLBACK25-NEXT:    movl 52(%esp,%ebp), %eax
; FALLBACK25-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK25-NEXT:    shll $3, %ecx
; FALLBACK25-NEXT:    andl $24, %ecx
; FALLBACK25-NEXT:    shrdl %cl, %edx, %eax
; FALLBACK25-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK25-NEXT:    movl 64(%esp,%ebp), %edi
; FALLBACK25-NEXT:    movl 60(%esp,%ebp), %eax
; FALLBACK25-NEXT:    movl %eax, %esi
; FALLBACK25-NEXT:    shrdl %cl, %edi, %esi
; FALLBACK25-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK25-NEXT:    shrdl %cl, %eax, %edx
; FALLBACK25-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK25-NEXT:    movl 72(%esp,%ebp), %esi
; FALLBACK25-NEXT:    movl 68(%esp,%ebp), %eax
; FALLBACK25-NEXT:    movl %eax, %edx
; FALLBACK25-NEXT:    shrdl %cl, %esi, %edx
; FALLBACK25-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK25-NEXT:    shrdl %cl, %eax, %edi
; FALLBACK25-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK25-NEXT:    movl 80(%esp,%ebp), %edi
; FALLBACK25-NEXT:    movl 76(%esp,%ebp), %eax
; FALLBACK25-NEXT:    movl %eax, %edx
; FALLBACK25-NEXT:    shrdl %cl, %edi, %edx
; FALLBACK25-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK25-NEXT:    shrdl %cl, %eax, %esi
; FALLBACK25-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK25-NEXT:    movl 88(%esp,%ebp), %esi
; FALLBACK25-NEXT:    movl 84(%esp,%ebp), %eax
; FALLBACK25-NEXT:    movl %eax, %edx
; FALLBACK25-NEXT:    shrdl %cl, %esi, %edx
; FALLBACK25-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK25-NEXT:    movl %esi, %edx
; FALLBACK25-NEXT:    shrdl %cl, %eax, %edi
; FALLBACK25-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK25-NEXT:    movl 96(%esp,%ebp), %esi
; FALLBACK25-NEXT:    movl 92(%esp,%ebp), %eax
; FALLBACK25-NEXT:    movl %eax, %edi
; FALLBACK25-NEXT:    shrdl %cl, %esi, %edi
; FALLBACK25-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK25-NEXT:    shrdl %cl, %eax, %edx
; FALLBACK25-NEXT:    movl %edx, (%esp) # 4-byte Spill
; FALLBACK25-NEXT:    movl 104(%esp,%ebp), %edx
; FALLBACK25-NEXT:    movl 100(%esp,%ebp), %eax
; FALLBACK25-NEXT:    movl %eax, %edi
; FALLBACK25-NEXT:    shrdl %cl, %edx, %edi
; FALLBACK25-NEXT:    shrdl %cl, %eax, %esi
; FALLBACK25-NEXT:    movl 48(%esp,%ebp), %ebx
; FALLBACK25-NEXT:    movl 108(%esp,%ebp), %eax
; FALLBACK25-NEXT:    shrdl %cl, %eax, %edx
; FALLBACK25-NEXT:    movl {{[0-9]+}}(%esp), %ebp
; FALLBACK25-NEXT:    movl %edx, 56(%ebp)
; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; FALLBACK25-NEXT:    shrdl %cl, %edx, %ebx
; FALLBACK25-NEXT:    # kill: def $cl killed $cl killed $ecx
; FALLBACK25-NEXT:    shrl %cl, %eax
; FALLBACK25-NEXT:    movl %eax, 60(%ebp)
; FALLBACK25-NEXT:    movl %esi, 48(%ebp)
; FALLBACK25-NEXT:    movl %edi, 52(%ebp)
; FALLBACK25-NEXT:    movl (%esp), %eax # 4-byte Reload
; FALLBACK25-NEXT:    movl %eax, 40(%ebp)
; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK25-NEXT:    movl %eax, 44(%ebp)
; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK25-NEXT:    movl %eax, 32(%ebp)
; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK25-NEXT:    movl %eax, 36(%ebp)
; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK25-NEXT:    movl %eax, 24(%ebp)
; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK25-NEXT:    movl %eax, 28(%ebp)
; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK25-NEXT:    movl %eax, 16(%ebp)
; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK25-NEXT:    movl %eax, 20(%ebp)
; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK25-NEXT:    movl %eax, 8(%ebp)
; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK25-NEXT:    movl %eax, 12(%ebp)
; FALLBACK25-NEXT:    movl %ebx, (%ebp)
; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK25-NEXT:    movl %eax, 4(%ebp)
; FALLBACK25-NEXT:    addl $188, %esp
; FALLBACK25-NEXT:    popl %esi
; FALLBACK25-NEXT:    popl %edi
; FALLBACK25-NEXT:    popl %ebx
; FALLBACK25-NEXT:    popl %ebp
; FALLBACK25-NEXT:    vzeroupper
; FALLBACK25-NEXT:    retl
;
; FALLBACK26-LABEL: lshr_64bytes:
; FALLBACK26:       # %bb.0:
; FALLBACK26-NEXT:    pushl %ebp
; FALLBACK26-NEXT:    pushl %ebx
; FALLBACK26-NEXT:    pushl %edi
; FALLBACK26-NEXT:    pushl %esi
; FALLBACK26-NEXT:    subl $204, %esp
; FALLBACK26-NEXT:    movl {{[0-9]+}}(%esp), %eax
; FALLBACK26-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; FALLBACK26-NEXT:    vmovups (%ecx), %ymm0
; FALLBACK26-NEXT:    vmovups 32(%ecx), %ymm1
; FALLBACK26-NEXT:    movl (%eax), %ecx
; FALLBACK26-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; FALLBACK26-NEXT:    vmovups %ymm2, {{[0-9]+}}(%esp)
; FALLBACK26-NEXT:    vmovups %ymm2, {{[0-9]+}}(%esp)
; FALLBACK26-NEXT:    vmovups %ymm1, {{[0-9]+}}(%esp)
; FALLBACK26-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
; FALLBACK26-NEXT:    leal (,%ecx,8), %edx
; FALLBACK26-NEXT:    andl $24, %edx
; FALLBACK26-NEXT:    andl $60, %ecx
; FALLBACK26-NEXT:    movl 68(%esp,%ecx), %esi
; FALLBACK26-NEXT:    movl 72(%esp,%ecx), %eax
; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK26-NEXT:    shrxl %edx, %esi, %edi
; FALLBACK26-NEXT:    movl %edx, %ebx
; FALLBACK26-NEXT:    notb %bl
; FALLBACK26-NEXT:    leal (%eax,%eax), %ebp
; FALLBACK26-NEXT:    shlxl %ebx, %ebp, %ebp
; FALLBACK26-NEXT:    orl %edi, %ebp
; FALLBACK26-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK26-NEXT:    shrxl %edx, 64(%esp,%ecx), %edi
; FALLBACK26-NEXT:    addl %esi, %esi
; FALLBACK26-NEXT:    shlxl %ebx, %esi, %esi
; FALLBACK26-NEXT:    orl %edi, %esi
; FALLBACK26-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK26-NEXT:    movl 80(%esp,%ecx), %esi
; FALLBACK26-NEXT:    leal (%esi,%esi), %edi
; FALLBACK26-NEXT:    shlxl %ebx, %edi, %eax
; FALLBACK26-NEXT:    movl 76(%esp,%ecx), %edi
; FALLBACK26-NEXT:    shrxl %edx, %edi, %ebp
; FALLBACK26-NEXT:    orl %ebp, %eax
; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK26-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; FALLBACK26-NEXT:    addl %edi, %edi
; FALLBACK26-NEXT:    shlxl %ebx, %edi, %edi
; FALLBACK26-NEXT:    orl %eax, %edi
; FALLBACK26-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK26-NEXT:    movl 88(%esp,%ecx), %eax
; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK26-NEXT:    leal (%eax,%eax), %edi
; FALLBACK26-NEXT:    shlxl %ebx, %edi, %eax
; FALLBACK26-NEXT:    movl 84(%esp,%ecx), %edi
; FALLBACK26-NEXT:    shrxl %edx, %edi, %ebp
; FALLBACK26-NEXT:    orl %ebp, %eax
; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK26-NEXT:    shrxl %edx, %esi, %esi
; FALLBACK26-NEXT:    addl %edi, %edi
; FALLBACK26-NEXT:    shlxl %ebx, %edi, %eax
; FALLBACK26-NEXT:    orl %esi, %eax
; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK26-NEXT:    movl 96(%esp,%ecx), %esi
; FALLBACK26-NEXT:    leal (%esi,%esi), %edi
; FALLBACK26-NEXT:    shlxl %ebx, %edi, %eax
; FALLBACK26-NEXT:    movl 92(%esp,%ecx), %edi
; FALLBACK26-NEXT:    shrxl %edx, %edi, %ebp
; FALLBACK26-NEXT:    orl %ebp, %eax
; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK26-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; FALLBACK26-NEXT:    addl %edi, %edi
; FALLBACK26-NEXT:    shlxl %ebx, %edi, %edi
; FALLBACK26-NEXT:    orl %eax, %edi
; FALLBACK26-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK26-NEXT:    movl 104(%esp,%ecx), %eax
; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK26-NEXT:    leal (%eax,%eax), %edi
; FALLBACK26-NEXT:    shlxl %ebx, %edi, %eax
; FALLBACK26-NEXT:    movl 100(%esp,%ecx), %edi
; FALLBACK26-NEXT:    shrxl %edx, %edi, %ebp
; FALLBACK26-NEXT:    orl %ebp, %eax
; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK26-NEXT:    shrxl %edx, %esi, %esi
; FALLBACK26-NEXT:    addl %edi, %edi
; FALLBACK26-NEXT:    shlxl %ebx, %edi, %eax
; FALLBACK26-NEXT:    orl %esi, %eax
; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK26-NEXT:    movl 112(%esp,%ecx), %eax
; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK26-NEXT:    leal (%eax,%eax), %esi
; FALLBACK26-NEXT:    shlxl %ebx, %esi, %eax
; FALLBACK26-NEXT:    movl 108(%esp,%ecx), %esi
; FALLBACK26-NEXT:    shrxl %edx, %esi, %ebp
; FALLBACK26-NEXT:    orl %ebp, %eax
; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK26-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; FALLBACK26-NEXT:    addl %esi, %esi
; FALLBACK26-NEXT:    shlxl %ebx, %esi, %esi
; FALLBACK26-NEXT:    orl %eax, %esi
; FALLBACK26-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK26-NEXT:    movl 120(%esp,%ecx), %ebp
; FALLBACK26-NEXT:    leal (%ebp,%ebp), %eax
; FALLBACK26-NEXT:    shlxl %ebx, %eax, %esi
; FALLBACK26-NEXT:    movl 116(%esp,%ecx), %eax
; FALLBACK26-NEXT:    shrxl %edx, %eax, %edi
; FALLBACK26-NEXT:    orl %edi, %esi
; FALLBACK26-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
; FALLBACK26-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK26-NEXT:    addl %eax, %eax
; FALLBACK26-NEXT:    shlxl %ebx, %eax, %edi
; FALLBACK26-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
; FALLBACK26-NEXT:    shrxl %edx, %ebp, %eax
; FALLBACK26-NEXT:    movl 124(%esp,%ecx), %ecx
; FALLBACK26-NEXT:    shrxl %edx, %ecx, %edx
; FALLBACK26-NEXT:    addl %ecx, %ecx
; FALLBACK26-NEXT:    shlxl %ebx, %ecx, %ebx
; FALLBACK26-NEXT:    orl %eax, %ebx
; FALLBACK26-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; FALLBACK26-NEXT:    movl %edx, 60(%ecx)
; FALLBACK26-NEXT:    movl %ebx, 56(%ecx)
; FALLBACK26-NEXT:    movl %edi, 48(%ecx)
; FALLBACK26-NEXT:    movl %esi, 52(%ecx)
; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK26-NEXT:    movl %eax, 40(%ecx)
; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK26-NEXT:    movl %eax, 44(%ecx)
; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK26-NEXT:    movl %eax, 32(%ecx)
; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK26-NEXT:    movl %eax, 36(%ecx)
; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK26-NEXT:    movl %eax, 24(%ecx)
; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK26-NEXT:    movl %eax, 28(%ecx)
; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK26-NEXT:    movl %eax, 16(%ecx)
; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK26-NEXT:    movl %eax, 20(%ecx)
; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK26-NEXT:    movl %eax, 8(%ecx)
; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK26-NEXT:    movl %eax, 12(%ecx)
; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK26-NEXT:    movl %eax, (%ecx)
; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK26-NEXT:    movl %eax, 4(%ecx)
; FALLBACK26-NEXT:    addl $204, %esp
; FALLBACK26-NEXT:    popl %esi
; FALLBACK26-NEXT:    popl %edi
; FALLBACK26-NEXT:    popl %ebx
; FALLBACK26-NEXT:    popl %ebp
; FALLBACK26-NEXT:    vzeroupper
; FALLBACK26-NEXT:    retl
;
; FALLBACK27-LABEL: lshr_64bytes:
; FALLBACK27:       # %bb.0:
; FALLBACK27-NEXT:    pushl %ebp
; FALLBACK27-NEXT:    pushl %ebx
; FALLBACK27-NEXT:    pushl %edi
; FALLBACK27-NEXT:    pushl %esi
; FALLBACK27-NEXT:    subl $188, %esp
; FALLBACK27-NEXT:    movl {{[0-9]+}}(%esp), %eax
; FALLBACK27-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; FALLBACK27-NEXT:    vmovups (%ecx), %ymm0
; FALLBACK27-NEXT:    vmovups 32(%ecx), %ymm1
; FALLBACK27-NEXT:    movl (%eax), %ecx
; FALLBACK27-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; FALLBACK27-NEXT:    vmovups %ymm2, {{[0-9]+}}(%esp)
; FALLBACK27-NEXT:    vmovups %ymm2, {{[0-9]+}}(%esp)
; FALLBACK27-NEXT:    vmovups %ymm1, {{[0-9]+}}(%esp)
; FALLBACK27-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
; FALLBACK27-NEXT:    movl %ecx, %ebp
; FALLBACK27-NEXT:    andl $60, %ebp
; FALLBACK27-NEXT:    movl 56(%esp,%ebp), %edx
; FALLBACK27-NEXT:    movl 52(%esp,%ebp), %eax
; FALLBACK27-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK27-NEXT:    shll $3, %ecx
; FALLBACK27-NEXT:    andl $24, %ecx
; FALLBACK27-NEXT:    shrdl %cl, %edx, %eax
; FALLBACK27-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK27-NEXT:    movl 64(%esp,%ebp), %edi
; FALLBACK27-NEXT:    movl 60(%esp,%ebp), %eax
; FALLBACK27-NEXT:    movl %eax, %esi
; FALLBACK27-NEXT:    shrdl %cl, %edi, %esi
; FALLBACK27-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK27-NEXT:    shrdl %cl, %eax, %edx
; FALLBACK27-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK27-NEXT:    movl 72(%esp,%ebp), %esi
; FALLBACK27-NEXT:    movl 68(%esp,%ebp), %eax
; FALLBACK27-NEXT:    movl %eax, %edx
; FALLBACK27-NEXT:    shrdl %cl, %esi, %edx
; FALLBACK27-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK27-NEXT:    shrdl %cl, %eax, %edi
; FALLBACK27-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK27-NEXT:    movl 80(%esp,%ebp), %edi
; FALLBACK27-NEXT:    movl 76(%esp,%ebp), %eax
; FALLBACK27-NEXT:    movl %eax, %edx
; FALLBACK27-NEXT:    shrdl %cl, %edi, %edx
; FALLBACK27-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK27-NEXT:    shrdl %cl, %eax, %esi
; FALLBACK27-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK27-NEXT:    movl 88(%esp,%ebp), %ebx
; FALLBACK27-NEXT:    movl 84(%esp,%ebp), %eax
; FALLBACK27-NEXT:    movl %eax, %edx
; FALLBACK27-NEXT:    shrdl %cl, %ebx, %edx
; FALLBACK27-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK27-NEXT:    shrdl %cl, %eax, %edi
; FALLBACK27-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK27-NEXT:    movl 96(%esp,%ebp), %esi
; FALLBACK27-NEXT:    movl 92(%esp,%ebp), %eax
; FALLBACK27-NEXT:    movl %eax, %edx
; FALLBACK27-NEXT:    shrdl %cl, %esi, %edx
; FALLBACK27-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK27-NEXT:    shrdl %cl, %eax, %ebx
; FALLBACK27-NEXT:    movl 104(%esp,%ebp), %eax
; FALLBACK27-NEXT:    movl 100(%esp,%ebp), %edi
; FALLBACK27-NEXT:    movl %edi, %edx
; FALLBACK27-NEXT:    shrdl %cl, %eax, %edx
; FALLBACK27-NEXT:    shrdl %cl, %edi, %esi
; FALLBACK27-NEXT:    movl 48(%esp,%ebp), %edi
; FALLBACK27-NEXT:    movl 108(%esp,%ebp), %ebp
; FALLBACK27-NEXT:    movl %ebp, (%esp) # 4-byte Spill
; FALLBACK27-NEXT:    shrdl %cl, %ebp, %eax
; FALLBACK27-NEXT:    movl {{[0-9]+}}(%esp), %ebp
; FALLBACK27-NEXT:    movl %eax, 56(%ebp)
; FALLBACK27-NEXT:    movl %esi, 48(%ebp)
; FALLBACK27-NEXT:    movl %edx, 52(%ebp)
; FALLBACK27-NEXT:    movl %ebx, 40(%ebp)
; FALLBACK27-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK27-NEXT:    movl %eax, 44(%ebp)
; FALLBACK27-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK27-NEXT:    movl %eax, 32(%ebp)
; FALLBACK27-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK27-NEXT:    movl %eax, 36(%ebp)
; FALLBACK27-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK27-NEXT:    movl %eax, 24(%ebp)
; FALLBACK27-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK27-NEXT:    movl %eax, 28(%ebp)
; FALLBACK27-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK27-NEXT:    movl %eax, 16(%ebp)
; FALLBACK27-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK27-NEXT:    movl %eax, 20(%ebp)
; FALLBACK27-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK27-NEXT:    movl %eax, 8(%ebp)
; FALLBACK27-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK27-NEXT:    movl %eax, 12(%ebp)
; FALLBACK27-NEXT:    shrxl %ecx, (%esp), %eax # 4-byte Folded Reload
; FALLBACK27-NEXT:    # kill: def $cl killed $cl killed $ecx
; FALLBACK27-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; FALLBACK27-NEXT:    shrdl %cl, %edx, %edi
; FALLBACK27-NEXT:    movl %edi, (%ebp)
; FALLBACK27-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK27-NEXT:    movl %ecx, 4(%ebp)
; FALLBACK27-NEXT:    movl %eax, 60(%ebp)
; FALLBACK27-NEXT:    addl $188, %esp
; FALLBACK27-NEXT:    popl %esi
; FALLBACK27-NEXT:    popl %edi
; FALLBACK27-NEXT:    popl %ebx
; FALLBACK27-NEXT:    popl %ebp
; FALLBACK27-NEXT:    vzeroupper
; FALLBACK27-NEXT:    retl
;
; FALLBACK28-LABEL: lshr_64bytes:
; FALLBACK28:       # %bb.0:
; FALLBACK28-NEXT:    pushl %ebp
; FALLBACK28-NEXT:    pushl %ebx
; FALLBACK28-NEXT:    pushl %edi
; FALLBACK28-NEXT:    pushl %esi
; FALLBACK28-NEXT:    subl $204, %esp
; FALLBACK28-NEXT:    movl {{[0-9]+}}(%esp), %eax
; FALLBACK28-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; FALLBACK28-NEXT:    vmovups (%ecx), %zmm0
; FALLBACK28-NEXT:    movl (%eax), %ecx
; FALLBACK28-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; FALLBACK28-NEXT:    vmovups %zmm1, {{[0-9]+}}(%esp)
; FALLBACK28-NEXT:    vmovups %zmm0, {{[0-9]+}}(%esp)
; FALLBACK28-NEXT:    movl %ecx, %esi
; FALLBACK28-NEXT:    andl $60, %esi
; FALLBACK28-NEXT:    movl 68(%esp,%esi), %edx
; FALLBACK28-NEXT:    shll $3, %ecx
; FALLBACK28-NEXT:    andl $24, %ecx
; FALLBACK28-NEXT:    movl %edx, %edi
; FALLBACK28-NEXT:    shrl %cl, %edi
; FALLBACK28-NEXT:    movl 72(%esp,%esi), %eax
; FALLBACK28-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK28-NEXT:    leal (%eax,%eax), %ebx
; FALLBACK28-NEXT:    movl %ecx, %ebp
; FALLBACK28-NEXT:    movb %cl, %ch
; FALLBACK28-NEXT:    notb %ch
; FALLBACK28-NEXT:    movb %ch, %cl
; FALLBACK28-NEXT:    shll %cl, %ebx
; FALLBACK28-NEXT:    orl %edi, %ebx
; FALLBACK28-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK28-NEXT:    movl 64(%esp,%esi), %edi
; FALLBACK28-NEXT:    movl %ebp, %eax
; FALLBACK28-NEXT:    movb %al, %cl
; FALLBACK28-NEXT:    shrl %cl, %edi
; FALLBACK28-NEXT:    addl %edx, %edx
; FALLBACK28-NEXT:    movb %ch, %cl
; FALLBACK28-NEXT:    shll %cl, %edx
; FALLBACK28-NEXT:    orl %edi, %edx
; FALLBACK28-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK28-NEXT:    movl 76(%esp,%esi), %edx
; FALLBACK28-NEXT:    movl %edx, %ebp
; FALLBACK28-NEXT:    movb %al, %cl
; FALLBACK28-NEXT:    shrl %cl, %ebp
; FALLBACK28-NEXT:    movl 80(%esp,%esi), %edi
; FALLBACK28-NEXT:    leal (%edi,%edi), %ebx
; FALLBACK28-NEXT:    movb %ch, %cl
; FALLBACK28-NEXT:    shll %cl, %ebx
; FALLBACK28-NEXT:    orl %ebp, %ebx
; FALLBACK28-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK28-NEXT:    movb %al, %cl
; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
; FALLBACK28-NEXT:    shrl %cl, %ebx
; FALLBACK28-NEXT:    addl %edx, %edx
; FALLBACK28-NEXT:    movb %ch, %cl
; FALLBACK28-NEXT:    shll %cl, %edx
; FALLBACK28-NEXT:    orl %ebx, %edx
; FALLBACK28-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK28-NEXT:    movl 84(%esp,%esi), %ebx
; FALLBACK28-NEXT:    movl %ebx, %ebp
; FALLBACK28-NEXT:    movl %eax, %edx
; FALLBACK28-NEXT:    movb %dl, %cl
; FALLBACK28-NEXT:    shrl %cl, %ebp
; FALLBACK28-NEXT:    movl 88(%esp,%esi), %eax
; FALLBACK28-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK28-NEXT:    addl %eax, %eax
; FALLBACK28-NEXT:    movb %ch, %cl
; FALLBACK28-NEXT:    shll %cl, %eax
; FALLBACK28-NEXT:    orl %ebp, %eax
; FALLBACK28-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK28-NEXT:    movb %dl, %cl
; FALLBACK28-NEXT:    shrl %cl, %edi
; FALLBACK28-NEXT:    addl %ebx, %ebx
; FALLBACK28-NEXT:    movb %ch, %cl
; FALLBACK28-NEXT:    movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
; FALLBACK28-NEXT:    shll %cl, %ebx
; FALLBACK28-NEXT:    orl %edi, %ebx
; FALLBACK28-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK28-NEXT:    movl 92(%esp,%esi), %ebx
; FALLBACK28-NEXT:    movl %ebx, %ebp
; FALLBACK28-NEXT:    movb %dl, %cl
; FALLBACK28-NEXT:    shrl %cl, %ebp
; FALLBACK28-NEXT:    movl 96(%esp,%esi), %edi
; FALLBACK28-NEXT:    leal (%edi,%edi), %eax
; FALLBACK28-NEXT:    movb %ch, %cl
; FALLBACK28-NEXT:    shll %cl, %eax
; FALLBACK28-NEXT:    orl %ebp, %eax
; FALLBACK28-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK28-NEXT:    movb %dl, %cl
; FALLBACK28-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK28-NEXT:    shrl %cl, %eax
; FALLBACK28-NEXT:    addl %ebx, %ebx
; FALLBACK28-NEXT:    movb %ch, %cl
; FALLBACK28-NEXT:    shll %cl, %ebx
; FALLBACK28-NEXT:    orl %eax, %ebx
; FALLBACK28-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK28-NEXT:    movl 100(%esp,%esi), %ebx
; FALLBACK28-NEXT:    movl %ebx, %ebp
; FALLBACK28-NEXT:    movb %dl, %cl
; FALLBACK28-NEXT:    shrl %cl, %ebp
; FALLBACK28-NEXT:    movl 104(%esp,%esi), %edx
; FALLBACK28-NEXT:    leal (%edx,%edx), %eax
; FALLBACK28-NEXT:    movb %ch, %cl
; FALLBACK28-NEXT:    shll %cl, %eax
; FALLBACK28-NEXT:    orl %ebp, %eax
; FALLBACK28-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK28-NEXT:    movb %al, %cl
; FALLBACK28-NEXT:    shrl %cl, %edi
; FALLBACK28-NEXT:    addl %ebx, %ebx
; FALLBACK28-NEXT:    movb %ch, %cl
; FALLBACK28-NEXT:    shll %cl, %ebx
; FALLBACK28-NEXT:    orl %edi, %ebx
; FALLBACK28-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK28-NEXT:    movl 108(%esp,%esi), %edi
; FALLBACK28-NEXT:    movl %edi, %ebp
; FALLBACK28-NEXT:    movl %eax, %ecx
; FALLBACK28-NEXT:    shrl %cl, %ebp
; FALLBACK28-NEXT:    movl 112(%esp,%esi), %ecx
; FALLBACK28-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK28-NEXT:    leal (%ecx,%ecx), %ebx
; FALLBACK28-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
; FALLBACK28-NEXT:    movb %ch, %cl
; FALLBACK28-NEXT:    shll %cl, %ebx
; FALLBACK28-NEXT:    orl %ebp, %ebx
; FALLBACK28-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK28-NEXT:    movb %al, %cl
; FALLBACK28-NEXT:    shrl %cl, %edx
; FALLBACK28-NEXT:    addl %edi, %edi
; FALLBACK28-NEXT:    movb %ch, %cl
; FALLBACK28-NEXT:    shll %cl, %edi
; FALLBACK28-NEXT:    orl %edx, %edi
; FALLBACK28-NEXT:    movl %esi, %edx
; FALLBACK28-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK28-NEXT:    movl 116(%esp,%esi), %esi
; FALLBACK28-NEXT:    movl %esi, %ebx
; FALLBACK28-NEXT:    movb %al, %cl
; FALLBACK28-NEXT:    shrl %cl, %ebx
; FALLBACK28-NEXT:    movl 120(%esp,%edx), %eax
; FALLBACK28-NEXT:    leal (%eax,%eax), %ebp
; FALLBACK28-NEXT:    movb %ch, %cl
; FALLBACK28-NEXT:    shll %cl, %ebp
; FALLBACK28-NEXT:    orl %ebx, %ebp
; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; FALLBACK28-NEXT:    movb %dl, %cl
; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
; FALLBACK28-NEXT:    shrl %cl, %ebx
; FALLBACK28-NEXT:    addl %esi, %esi
; FALLBACK28-NEXT:    movb %ch, %cl
; FALLBACK28-NEXT:    shll %cl, %esi
; FALLBACK28-NEXT:    orl %ebx, %esi
; FALLBACK28-NEXT:    movb %dl, %cl
; FALLBACK28-NEXT:    shrl %cl, %eax
; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; FALLBACK28-NEXT:    movl 124(%esp,%edx), %ebx
; FALLBACK28-NEXT:    leal (%ebx,%ebx), %edx
; FALLBACK28-NEXT:    movb %ch, %cl
; FALLBACK28-NEXT:    shll %cl, %edx
; FALLBACK28-NEXT:    orl %eax, %edx
; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK28-NEXT:    # kill: def $cl killed $cl killed $ecx
; FALLBACK28-NEXT:    shrl %cl, %ebx
; FALLBACK28-NEXT:    movl {{[0-9]+}}(%esp), %eax
; FALLBACK28-NEXT:    movl %ebx, 60(%eax)
; FALLBACK28-NEXT:    movl %edx, 56(%eax)
; FALLBACK28-NEXT:    movl %esi, 48(%eax)
; FALLBACK28-NEXT:    movl %ebp, 52(%eax)
; FALLBACK28-NEXT:    movl %edi, 40(%eax)
; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK28-NEXT:    movl %ecx, 44(%eax)
; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK28-NEXT:    movl %ecx, 32(%eax)
; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK28-NEXT:    movl %ecx, 36(%eax)
; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK28-NEXT:    movl %ecx, 24(%eax)
; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK28-NEXT:    movl %ecx, 28(%eax)
; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK28-NEXT:    movl %ecx, 16(%eax)
; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK28-NEXT:    movl %ecx, 20(%eax)
; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK28-NEXT:    movl %ecx, 8(%eax)
; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK28-NEXT:    movl %ecx, 12(%eax)
; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK28-NEXT:    movl %ecx, (%eax)
; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK28-NEXT:    movl %ecx, 4(%eax)
; FALLBACK28-NEXT:    addl $204, %esp
; FALLBACK28-NEXT:    popl %esi
; FALLBACK28-NEXT:    popl %edi
; FALLBACK28-NEXT:    popl %ebx
; FALLBACK28-NEXT:    popl %ebp
; FALLBACK28-NEXT:    vzeroupper
; FALLBACK28-NEXT:    retl
;
; FALLBACK29-LABEL: lshr_64bytes:
; FALLBACK29:       # %bb.0:
; FALLBACK29-NEXT:    pushl %ebp
; FALLBACK29-NEXT:    pushl %ebx
; FALLBACK29-NEXT:    pushl %edi
; FALLBACK29-NEXT:    pushl %esi
; FALLBACK29-NEXT:    subl $188, %esp
; FALLBACK29-NEXT:    movl {{[0-9]+}}(%esp), %eax
; FALLBACK29-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; FALLBACK29-NEXT:    vmovups (%ecx), %zmm0
; FALLBACK29-NEXT:    movl (%eax), %ecx
; FALLBACK29-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; FALLBACK29-NEXT:    vmovups %zmm1, {{[0-9]+}}(%esp)
; FALLBACK29-NEXT:    vmovups %zmm0, {{[0-9]+}}(%esp)
; FALLBACK29-NEXT:    movl %ecx, %ebp
; FALLBACK29-NEXT:    andl $60, %ebp
; FALLBACK29-NEXT:    movl 56(%esp,%ebp), %edx
; FALLBACK29-NEXT:    movl 52(%esp,%ebp), %eax
; FALLBACK29-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK29-NEXT:    shll $3, %ecx
; FALLBACK29-NEXT:    andl $24, %ecx
; FALLBACK29-NEXT:    shrdl %cl, %edx, %eax
; FALLBACK29-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK29-NEXT:    movl 64(%esp,%ebp), %edi
; FALLBACK29-NEXT:    movl 60(%esp,%ebp), %eax
; FALLBACK29-NEXT:    movl %eax, %esi
; FALLBACK29-NEXT:    shrdl %cl, %edi, %esi
; FALLBACK29-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK29-NEXT:    shrdl %cl, %eax, %edx
; FALLBACK29-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK29-NEXT:    movl 72(%esp,%ebp), %esi
; FALLBACK29-NEXT:    movl 68(%esp,%ebp), %eax
; FALLBACK29-NEXT:    movl %eax, %edx
; FALLBACK29-NEXT:    shrdl %cl, %esi, %edx
; FALLBACK29-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK29-NEXT:    shrdl %cl, %eax, %edi
; FALLBACK29-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK29-NEXT:    movl 80(%esp,%ebp), %edi
; FALLBACK29-NEXT:    movl 76(%esp,%ebp), %eax
; FALLBACK29-NEXT:    movl %eax, %edx
; FALLBACK29-NEXT:    shrdl %cl, %edi, %edx
; FALLBACK29-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK29-NEXT:    shrdl %cl, %eax, %esi
; FALLBACK29-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK29-NEXT:    movl 88(%esp,%ebp), %esi
; FALLBACK29-NEXT:    movl 84(%esp,%ebp), %eax
; FALLBACK29-NEXT:    movl %eax, %edx
; FALLBACK29-NEXT:    shrdl %cl, %esi, %edx
; FALLBACK29-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK29-NEXT:    movl %esi, %edx
; FALLBACK29-NEXT:    shrdl %cl, %eax, %edi
; FALLBACK29-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK29-NEXT:    movl 96(%esp,%ebp), %esi
; FALLBACK29-NEXT:    movl 92(%esp,%ebp), %eax
; FALLBACK29-NEXT:    movl %eax, %edi
; FALLBACK29-NEXT:    shrdl %cl, %esi, %edi
; FALLBACK29-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK29-NEXT:    shrdl %cl, %eax, %edx
; FALLBACK29-NEXT:    movl %edx, (%esp) # 4-byte Spill
; FALLBACK29-NEXT:    movl 104(%esp,%ebp), %edx
; FALLBACK29-NEXT:    movl 100(%esp,%ebp), %eax
; FALLBACK29-NEXT:    movl %eax, %edi
; FALLBACK29-NEXT:    shrdl %cl, %edx, %edi
; FALLBACK29-NEXT:    shrdl %cl, %eax, %esi
; FALLBACK29-NEXT:    movl 48(%esp,%ebp), %ebx
; FALLBACK29-NEXT:    movl 108(%esp,%ebp), %eax
; FALLBACK29-NEXT:    shrdl %cl, %eax, %edx
; FALLBACK29-NEXT:    movl {{[0-9]+}}(%esp), %ebp
; FALLBACK29-NEXT:    movl %edx, 56(%ebp)
; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; FALLBACK29-NEXT:    shrdl %cl, %edx, %ebx
; FALLBACK29-NEXT:    # kill: def $cl killed $cl killed $ecx
; FALLBACK29-NEXT:    shrl %cl, %eax
; FALLBACK29-NEXT:    movl %eax, 60(%ebp)
; FALLBACK29-NEXT:    movl %esi, 48(%ebp)
; FALLBACK29-NEXT:    movl %edi, 52(%ebp)
; FALLBACK29-NEXT:    movl (%esp), %eax # 4-byte Reload
; FALLBACK29-NEXT:    movl %eax, 40(%ebp)
; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK29-NEXT:    movl %eax, 44(%ebp)
; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK29-NEXT:    movl %eax, 32(%ebp)
; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK29-NEXT:    movl %eax, 36(%ebp)
; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK29-NEXT:    movl %eax, 24(%ebp)
; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK29-NEXT:    movl %eax, 28(%ebp)
; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK29-NEXT:    movl %eax, 16(%ebp)
; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK29-NEXT:    movl %eax, 20(%ebp)
; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK29-NEXT:    movl %eax, 8(%ebp)
; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK29-NEXT:    movl %eax, 12(%ebp)
; FALLBACK29-NEXT:    movl %ebx, (%ebp)
; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK29-NEXT:    movl %eax, 4(%ebp)
; FALLBACK29-NEXT:    addl $188, %esp
; FALLBACK29-NEXT:    popl %esi
; FALLBACK29-NEXT:    popl %edi
; FALLBACK29-NEXT:    popl %ebx
; FALLBACK29-NEXT:    popl %ebp
; FALLBACK29-NEXT:    vzeroupper
; FALLBACK29-NEXT:    retl
;
; FALLBACK30-LABEL: lshr_64bytes:
; FALLBACK30:       # %bb.0:
; FALLBACK30-NEXT:    pushl %ebp
; FALLBACK30-NEXT:    pushl %ebx
; FALLBACK30-NEXT:    pushl %edi
; FALLBACK30-NEXT:    pushl %esi
; FALLBACK30-NEXT:    subl $204, %esp
; FALLBACK30-NEXT:    movl {{[0-9]+}}(%esp), %eax
; FALLBACK30-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; FALLBACK30-NEXT:    vmovups (%ecx), %zmm0
; FALLBACK30-NEXT:    movl (%eax), %edx
; FALLBACK30-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; FALLBACK30-NEXT:    vmovups %zmm1, {{[0-9]+}}(%esp)
; FALLBACK30-NEXT:    vmovups %zmm0, {{[0-9]+}}(%esp)
; FALLBACK30-NEXT:    leal (,%edx,8), %ecx
; FALLBACK30-NEXT:    andl $24, %ecx
; FALLBACK30-NEXT:    andl $60, %edx
; FALLBACK30-NEXT:    movl 68(%esp,%edx), %esi
; FALLBACK30-NEXT:    movl 72(%esp,%edx), %eax
; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK30-NEXT:    shrxl %ecx, %esi, %edi
; FALLBACK30-NEXT:    movl %ecx, %ebx
; FALLBACK30-NEXT:    notb %bl
; FALLBACK30-NEXT:    leal (%eax,%eax), %ebp
; FALLBACK30-NEXT:    shlxl %ebx, %ebp, %ebp
; FALLBACK30-NEXT:    orl %edi, %ebp
; FALLBACK30-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK30-NEXT:    shrxl %ecx, 64(%esp,%edx), %edi
; FALLBACK30-NEXT:    addl %esi, %esi
; FALLBACK30-NEXT:    shlxl %ebx, %esi, %esi
; FALLBACK30-NEXT:    orl %edi, %esi
; FALLBACK30-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK30-NEXT:    movl 80(%esp,%edx), %esi
; FALLBACK30-NEXT:    leal (%esi,%esi), %edi
; FALLBACK30-NEXT:    shlxl %ebx, %edi, %eax
; FALLBACK30-NEXT:    movl 76(%esp,%edx), %edi
; FALLBACK30-NEXT:    shrxl %ecx, %edi, %ebp
; FALLBACK30-NEXT:    orl %ebp, %eax
; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK30-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; FALLBACK30-NEXT:    addl %edi, %edi
; FALLBACK30-NEXT:    shlxl %ebx, %edi, %edi
; FALLBACK30-NEXT:    orl %eax, %edi
; FALLBACK30-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK30-NEXT:    movl 88(%esp,%edx), %eax
; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK30-NEXT:    leal (%eax,%eax), %edi
; FALLBACK30-NEXT:    shlxl %ebx, %edi, %eax
; FALLBACK30-NEXT:    movl 84(%esp,%edx), %edi
; FALLBACK30-NEXT:    shrxl %ecx, %edi, %ebp
; FALLBACK30-NEXT:    orl %ebp, %eax
; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK30-NEXT:    shrxl %ecx, %esi, %esi
; FALLBACK30-NEXT:    addl %edi, %edi
; FALLBACK30-NEXT:    shlxl %ebx, %edi, %eax
; FALLBACK30-NEXT:    orl %esi, %eax
; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK30-NEXT:    movl 96(%esp,%edx), %esi
; FALLBACK30-NEXT:    leal (%esi,%esi), %edi
; FALLBACK30-NEXT:    shlxl %ebx, %edi, %eax
; FALLBACK30-NEXT:    movl 92(%esp,%edx), %edi
; FALLBACK30-NEXT:    shrxl %ecx, %edi, %ebp
; FALLBACK30-NEXT:    orl %ebp, %eax
; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK30-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; FALLBACK30-NEXT:    addl %edi, %edi
; FALLBACK30-NEXT:    shlxl %ebx, %edi, %edi
; FALLBACK30-NEXT:    orl %eax, %edi
; FALLBACK30-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK30-NEXT:    movl 104(%esp,%edx), %eax
; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK30-NEXT:    leal (%eax,%eax), %edi
; FALLBACK30-NEXT:    shlxl %ebx, %edi, %eax
; FALLBACK30-NEXT:    movl 100(%esp,%edx), %edi
; FALLBACK30-NEXT:    shrxl %ecx, %edi, %ebp
; FALLBACK30-NEXT:    orl %ebp, %eax
; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK30-NEXT:    shrxl %ecx, %esi, %esi
; FALLBACK30-NEXT:    addl %edi, %edi
; FALLBACK30-NEXT:    shlxl %ebx, %edi, %eax
; FALLBACK30-NEXT:    orl %esi, %eax
; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK30-NEXT:    movl 112(%esp,%edx), %eax
; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK30-NEXT:    leal (%eax,%eax), %esi
; FALLBACK30-NEXT:    shlxl %ebx, %esi, %eax
; FALLBACK30-NEXT:    movl 108(%esp,%edx), %esi
; FALLBACK30-NEXT:    shrxl %ecx, %esi, %ebp
; FALLBACK30-NEXT:    orl %ebp, %eax
; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK30-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; FALLBACK30-NEXT:    addl %esi, %esi
; FALLBACK30-NEXT:    shlxl %ebx, %esi, %esi
; FALLBACK30-NEXT:    orl %eax, %esi
; FALLBACK30-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK30-NEXT:    movl 120(%esp,%edx), %ebp
; FALLBACK30-NEXT:    leal (%ebp,%ebp), %eax
; FALLBACK30-NEXT:    shlxl %ebx, %eax, %esi
; FALLBACK30-NEXT:    movl 116(%esp,%edx), %eax
; FALLBACK30-NEXT:    shrxl %ecx, %eax, %edi
; FALLBACK30-NEXT:    orl %edi, %esi
; FALLBACK30-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
; FALLBACK30-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK30-NEXT:    addl %eax, %eax
; FALLBACK30-NEXT:    shlxl %ebx, %eax, %edi
; FALLBACK30-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
; FALLBACK30-NEXT:    shrxl %ecx, %ebp, %eax
; FALLBACK30-NEXT:    movl 124(%esp,%edx), %edx
; FALLBACK30-NEXT:    shrxl %ecx, %edx, %ebp
; FALLBACK30-NEXT:    leal (%edx,%edx), %ecx
; FALLBACK30-NEXT:    shlxl %ebx, %ecx, %edx
; FALLBACK30-NEXT:    orl %eax, %edx
; FALLBACK30-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; FALLBACK30-NEXT:    movl %ebp, 60(%ecx)
; FALLBACK30-NEXT:    movl %edx, 56(%ecx)
; FALLBACK30-NEXT:    movl %edi, 48(%ecx)
; FALLBACK30-NEXT:    movl %esi, 52(%ecx)
; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK30-NEXT:    movl %eax, 40(%ecx)
; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK30-NEXT:    movl %eax, 44(%ecx)
; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK30-NEXT:    movl %eax, 32(%ecx)
; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK30-NEXT:    movl %eax, 36(%ecx)
; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK30-NEXT:    movl %eax, 24(%ecx)
; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK30-NEXT:    movl %eax, 28(%ecx)
; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK30-NEXT:    movl %eax, 16(%ecx)
; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK30-NEXT:    movl %eax, 20(%ecx)
; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK30-NEXT:    movl %eax, 8(%ecx)
; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK30-NEXT:    movl %eax, 12(%ecx)
; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK30-NEXT:    movl %eax, (%ecx)
; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK30-NEXT:    movl %eax, 4(%ecx)
; FALLBACK30-NEXT:    addl $204, %esp
; FALLBACK30-NEXT:    popl %esi
; FALLBACK30-NEXT:    popl %edi
; FALLBACK30-NEXT:    popl %ebx
; FALLBACK30-NEXT:    popl %ebp
; FALLBACK30-NEXT:    vzeroupper
; FALLBACK30-NEXT:    retl
;
; FALLBACK31-LABEL: lshr_64bytes:
; FALLBACK31:       # %bb.0:
; FALLBACK31-NEXT:    pushl %ebp
; FALLBACK31-NEXT:    pushl %ebx
; FALLBACK31-NEXT:    pushl %edi
; FALLBACK31-NEXT:    pushl %esi
; FALLBACK31-NEXT:    subl $188, %esp
; FALLBACK31-NEXT:    movl {{[0-9]+}}(%esp), %eax
; FALLBACK31-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; FALLBACK31-NEXT:    vmovups (%ecx), %zmm0
; FALLBACK31-NEXT:    movl (%eax), %ecx
; FALLBACK31-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; FALLBACK31-NEXT:    vmovups %zmm1, {{[0-9]+}}(%esp)
; FALLBACK31-NEXT:    vmovups %zmm0, {{[0-9]+}}(%esp)
; FALLBACK31-NEXT:    movl %ecx, %ebp
; FALLBACK31-NEXT:    andl $60, %ebp
; FALLBACK31-NEXT:    movl 56(%esp,%ebp), %edx
; FALLBACK31-NEXT:    movl 52(%esp,%ebp), %eax
; FALLBACK31-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK31-NEXT:    shll $3, %ecx
; FALLBACK31-NEXT:    andl $24, %ecx
; FALLBACK31-NEXT:    shrdl %cl, %edx, %eax
; FALLBACK31-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK31-NEXT:    movl 64(%esp,%ebp), %edi
; FALLBACK31-NEXT:    movl 60(%esp,%ebp), %eax
; FALLBACK31-NEXT:    movl %eax, %esi
; FALLBACK31-NEXT:    shrdl %cl, %edi, %esi
; FALLBACK31-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK31-NEXT:    shrdl %cl, %eax, %edx
; FALLBACK31-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK31-NEXT:    movl 72(%esp,%ebp), %esi
; FALLBACK31-NEXT:    movl 68(%esp,%ebp), %eax
; FALLBACK31-NEXT:    movl %eax, %edx
; FALLBACK31-NEXT:    shrdl %cl, %esi, %edx
; FALLBACK31-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK31-NEXT:    shrdl %cl, %eax, %edi
; FALLBACK31-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK31-NEXT:    movl 80(%esp,%ebp), %edi
; FALLBACK31-NEXT:    movl 76(%esp,%ebp), %eax
; FALLBACK31-NEXT:    movl %eax, %edx
; FALLBACK31-NEXT:    shrdl %cl, %edi, %edx
; FALLBACK31-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK31-NEXT:    shrdl %cl, %eax, %esi
; FALLBACK31-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK31-NEXT:    movl 88(%esp,%ebp), %ebx
; FALLBACK31-NEXT:    movl 84(%esp,%ebp), %eax
; FALLBACK31-NEXT:    movl %eax, %edx
; FALLBACK31-NEXT:    shrdl %cl, %ebx, %edx
; FALLBACK31-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK31-NEXT:    shrdl %cl, %eax, %edi
; FALLBACK31-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK31-NEXT:    movl 96(%esp,%ebp), %esi
; FALLBACK31-NEXT:    movl 92(%esp,%ebp), %eax
; FALLBACK31-NEXT:    movl %eax, %edx
; FALLBACK31-NEXT:    shrdl %cl, %esi, %edx
; FALLBACK31-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK31-NEXT:    shrdl %cl, %eax, %ebx
; FALLBACK31-NEXT:    movl 104(%esp,%ebp), %eax
; FALLBACK31-NEXT:    movl 100(%esp,%ebp), %edi
; FALLBACK31-NEXT:    movl %edi, %edx
; FALLBACK31-NEXT:    shrdl %cl, %eax, %edx
; FALLBACK31-NEXT:    shrdl %cl, %edi, %esi
; FALLBACK31-NEXT:    movl 48(%esp,%ebp), %edi
; FALLBACK31-NEXT:    movl 108(%esp,%ebp), %ebp
; FALLBACK31-NEXT:    movl %ebp, (%esp) # 4-byte Spill
; FALLBACK31-NEXT:    shrdl %cl, %ebp, %eax
; FALLBACK31-NEXT:    movl {{[0-9]+}}(%esp), %ebp
; FALLBACK31-NEXT:    movl %eax, 56(%ebp)
; FALLBACK31-NEXT:    movl %esi, 48(%ebp)
; FALLBACK31-NEXT:    movl %edx, 52(%ebp)
; FALLBACK31-NEXT:    movl %ebx, 40(%ebp)
; FALLBACK31-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK31-NEXT:    movl %eax, 44(%ebp)
; FALLBACK31-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK31-NEXT:    movl %eax, 32(%ebp)
; FALLBACK31-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK31-NEXT:    movl %eax, 36(%ebp)
; FALLBACK31-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK31-NEXT:    movl %eax, 24(%ebp)
; FALLBACK31-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK31-NEXT:    movl %eax, 28(%ebp)
; FALLBACK31-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK31-NEXT:    movl %eax, 16(%ebp)
; FALLBACK31-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK31-NEXT:    movl %eax, 20(%ebp)
; FALLBACK31-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK31-NEXT:    movl %eax, 8(%ebp)
; FALLBACK31-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK31-NEXT:    movl %eax, 12(%ebp)
; FALLBACK31-NEXT:    shrxl %ecx, (%esp), %eax # 4-byte Folded Reload
; FALLBACK31-NEXT:    # kill: def $cl killed $cl killed $ecx
; FALLBACK31-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; FALLBACK31-NEXT:    shrdl %cl, %edx, %edi
; FALLBACK31-NEXT:    movl %edi, (%ebp)
; FALLBACK31-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK31-NEXT:    movl %ecx, 4(%ebp)
; FALLBACK31-NEXT:    movl %eax, 60(%ebp)
; FALLBACK31-NEXT:    addl $188, %esp
; FALLBACK31-NEXT:    popl %esi
; FALLBACK31-NEXT:    popl %edi
; FALLBACK31-NEXT:    popl %ebx
; FALLBACK31-NEXT:    popl %ebp
; FALLBACK31-NEXT:    vzeroupper
; FALLBACK31-NEXT:    retl
  %src = load i512, ptr %src.ptr, align 1
  %byteOff = load i512, ptr %byteOff.ptr, align 1
  %bitOff = shl i512 %byteOff, 3
  %res = lshr i512 %src, %bitOff
  store i512 %res, ptr %dst, align 1
  ret void
}

; lshr_64bytes_qwordOff - logical right shift of a 64-byte (i512) value where
; the shift amount is supplied as a count of 8-byte qwords. The IR body loads
; the value and the offset (both i512, align 1), converts the offset to a bit
; count with a left shift by 6 (i.e. offset * 64), performs the lshr and stores
; the i512 result to %dst.
;
; The assertions below are autogenerated (see the NOTE at the top of this
; file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
;
; In every configuration checked below, the shift is lowered without any
; bit-shift instructions. The source is written to a stack buffer together
; with 64 bytes of zeros, the offset is masked with `and $7`, and the result
; is read back from that buffer using the masked offset as an index scaled
; by 8.
define void @lshr_64bytes_qwordOff(ptr %src.ptr, ptr %qwordOff.ptr, ptr %dst) nounwind {
; X64-SSE2-LABEL: lshr_64bytes_qwordOff:
; X64-SSE2:       # %bb.0:
; X64-SSE2-NEXT:    pushq %rbx
; X64-SSE2-NEXT:    movq (%rdi), %rax
; X64-SSE2-NEXT:    movq 8(%rdi), %rcx
; X64-SSE2-NEXT:    movq 16(%rdi), %r8
; X64-SSE2-NEXT:    movq 24(%rdi), %r9
; X64-SSE2-NEXT:    movq 32(%rdi), %r10
; X64-SSE2-NEXT:    movq 40(%rdi), %r11
; X64-SSE2-NEXT:    movq 48(%rdi), %rbx
; X64-SSE2-NEXT:    movq 56(%rdi), %rdi
; X64-SSE2-NEXT:    movl (%rsi), %esi
; X64-SSE2-NEXT:    xorps %xmm0, %xmm0
; X64-SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; X64-SSE2-NEXT:    movq %rbx, -{{[0-9]+}}(%rsp)
; X64-SSE2-NEXT:    movq %r11, -{{[0-9]+}}(%rsp)
; X64-SSE2-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
; X64-SSE2-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
; X64-SSE2-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
; X64-SSE2-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
; X64-SSE2-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
; X64-SSE2-NEXT:    andl $7, %esi
; X64-SSE2-NEXT:    movq -128(%rsp,%rsi,8), %rax
; X64-SSE2-NEXT:    movq -120(%rsp,%rsi,8), %rcx
; X64-SSE2-NEXT:    movq -104(%rsp,%rsi,8), %rdi
; X64-SSE2-NEXT:    movq -112(%rsp,%rsi,8), %r8
; X64-SSE2-NEXT:    movq -88(%rsp,%rsi,8), %r9
; X64-SSE2-NEXT:    movq -96(%rsp,%rsi,8), %r10
; X64-SSE2-NEXT:    movq -72(%rsp,%rsi,8), %r11
; X64-SSE2-NEXT:    movq -80(%rsp,%rsi,8), %rsi
; X64-SSE2-NEXT:    movq %rsi, 48(%rdx)
; X64-SSE2-NEXT:    movq %r11, 56(%rdx)
; X64-SSE2-NEXT:    movq %r10, 32(%rdx)
; X64-SSE2-NEXT:    movq %r9, 40(%rdx)
; X64-SSE2-NEXT:    movq %r8, 16(%rdx)
; X64-SSE2-NEXT:    movq %rdi, 24(%rdx)
; X64-SSE2-NEXT:    movq %rax, (%rdx)
; X64-SSE2-NEXT:    movq %rcx, 8(%rdx)
; X64-SSE2-NEXT:    popq %rbx
; X64-SSE2-NEXT:    retq
;
; X64-SSE42-LABEL: lshr_64bytes_qwordOff:
; X64-SSE42:       # %bb.0:
; X64-SSE42-NEXT:    pushq %rax
; X64-SSE42-NEXT:    movups (%rdi), %xmm0
; X64-SSE42-NEXT:    movups 16(%rdi), %xmm1
; X64-SSE42-NEXT:    movups 32(%rdi), %xmm2
; X64-SSE42-NEXT:    movups 48(%rdi), %xmm3
; X64-SSE42-NEXT:    movl (%rsi), %eax
; X64-SSE42-NEXT:    xorps %xmm4, %xmm4
; X64-SSE42-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
; X64-SSE42-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
; X64-SSE42-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
; X64-SSE42-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
; X64-SSE42-NEXT:    movaps %xmm3, -{{[0-9]+}}(%rsp)
; X64-SSE42-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
; X64-SSE42-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
; X64-SSE42-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-SSE42-NEXT:    andl $7, %eax
; X64-SSE42-NEXT:    movups -128(%rsp,%rax,8), %xmm0
; X64-SSE42-NEXT:    movups -112(%rsp,%rax,8), %xmm1
; X64-SSE42-NEXT:    movups -96(%rsp,%rax,8), %xmm2
; X64-SSE42-NEXT:    movups -80(%rsp,%rax,8), %xmm3
; X64-SSE42-NEXT:    movups %xmm3, 48(%rdx)
; X64-SSE42-NEXT:    movups %xmm1, 16(%rdx)
; X64-SSE42-NEXT:    movups %xmm2, 32(%rdx)
; X64-SSE42-NEXT:    movups %xmm0, (%rdx)
; X64-SSE42-NEXT:    popq %rax
; X64-SSE42-NEXT:    retq
;
; X64-AVX1-LABEL: lshr_64bytes_qwordOff:
; X64-AVX1:       # %bb.0:
; X64-AVX1-NEXT:    pushq %rax
; X64-AVX1-NEXT:    vmovups (%rdi), %ymm0
; X64-AVX1-NEXT:    vmovups 32(%rdi), %ymm1
; X64-AVX1-NEXT:    movl (%rsi), %eax
; X64-AVX1-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; X64-AVX1-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
; X64-AVX1-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
; X64-AVX1-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
; X64-AVX1-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
; X64-AVX1-NEXT:    andl $7, %eax
; X64-AVX1-NEXT:    vmovups -128(%rsp,%rax,8), %xmm0
; X64-AVX1-NEXT:    vmovups -112(%rsp,%rax,8), %xmm1
; X64-AVX1-NEXT:    vmovups -96(%rsp,%rax,8), %xmm2
; X64-AVX1-NEXT:    vmovups -80(%rsp,%rax,8), %xmm3
; X64-AVX1-NEXT:    vmovups %xmm3, 48(%rdx)
; X64-AVX1-NEXT:    vmovups %xmm1, 16(%rdx)
; X64-AVX1-NEXT:    vmovups %xmm2, 32(%rdx)
; X64-AVX1-NEXT:    vmovups %xmm0, (%rdx)
; X64-AVX1-NEXT:    popq %rax
; X64-AVX1-NEXT:    vzeroupper
; X64-AVX1-NEXT:    retq
;
; X64-AVX512-LABEL: lshr_64bytes_qwordOff:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    pushq %rax
; X64-AVX512-NEXT:    vmovups (%rdi), %zmm0
; X64-AVX512-NEXT:    movl (%rsi), %eax
; X64-AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X64-AVX512-NEXT:    vmovups %zmm1, -{{[0-9]+}}(%rsp)
; X64-AVX512-NEXT:    vmovups %zmm0, -{{[0-9]+}}(%rsp)
; X64-AVX512-NEXT:    andl $7, %eax
; X64-AVX512-NEXT:    vmovups -128(%rsp,%rax,8), %xmm0
; X64-AVX512-NEXT:    vmovups -112(%rsp,%rax,8), %xmm1
; X64-AVX512-NEXT:    vmovups -96(%rsp,%rax,8), %xmm2
; X64-AVX512-NEXT:    vmovups -80(%rsp,%rax,8), %xmm3
; X64-AVX512-NEXT:    vmovups %xmm3, 48(%rdx)
; X64-AVX512-NEXT:    vmovups %xmm1, 16(%rdx)
; X64-AVX512-NEXT:    vmovups %xmm2, 32(%rdx)
; X64-AVX512-NEXT:    vmovups %xmm0, (%rdx)
; X64-AVX512-NEXT:    popq %rax
; X64-AVX512-NEXT:    vzeroupper
; X64-AVX512-NEXT:    retq
;
; X86-SSE2-LABEL: lshr_64bytes_qwordOff:
; X86-SSE2:       # %bb.0:
; X86-SSE2-NEXT:    pushl %ebp
; X86-SSE2-NEXT:    pushl %ebx
; X86-SSE2-NEXT:    pushl %edi
; X86-SSE2-NEXT:    pushl %esi
; X86-SSE2-NEXT:    subl $188, %esp
; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE2-NEXT:    movl (%eax), %ecx
; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-SSE2-NEXT:    movl 4(%eax), %ecx
; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-SSE2-NEXT:    movl 8(%eax), %ecx
; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-SSE2-NEXT:    movl 12(%eax), %ecx
; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-SSE2-NEXT:    movl 16(%eax), %ecx
; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-SSE2-NEXT:    movl 20(%eax), %ecx
; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-SSE2-NEXT:    movl 24(%eax), %ecx
; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-SSE2-NEXT:    movl 28(%eax), %ecx
; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-SSE2-NEXT:    movl 32(%eax), %ecx
; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-SSE2-NEXT:    movl 36(%eax), %ecx
; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-SSE2-NEXT:    movl 40(%eax), %ebp
; X86-SSE2-NEXT:    movl 44(%eax), %ebx
; X86-SSE2-NEXT:    movl 48(%eax), %edi
; X86-SSE2-NEXT:    movl 52(%eax), %esi
; X86-SSE2-NEXT:    movl 56(%eax), %edx
; X86-SSE2-NEXT:    movl 60(%eax), %ecx
; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE2-NEXT:    movl (%eax), %eax
; X86-SSE2-NEXT:    xorps %xmm0, %xmm0
; X86-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT:    movl %esi, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT:    movl %edi, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT:    andl $7, %eax
; X86-SSE2-NEXT:    movl 48(%esp,%eax,8), %ecx
; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-SSE2-NEXT:    movl 52(%esp,%eax,8), %ecx
; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-SSE2-NEXT:    movl 60(%esp,%eax,8), %ecx
; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-SSE2-NEXT:    movl 56(%esp,%eax,8), %ecx
; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-SSE2-NEXT:    movl 68(%esp,%eax,8), %ecx
; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-SSE2-NEXT:    movl 64(%esp,%eax,8), %ecx
; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-SSE2-NEXT:    movl 76(%esp,%eax,8), %ecx
; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-SSE2-NEXT:    movl 72(%esp,%eax,8), %ecx
; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-SSE2-NEXT:    movl 84(%esp,%eax,8), %ecx
; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-SSE2-NEXT:    movl 80(%esp,%eax,8), %ecx
; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-SSE2-NEXT:    movl 92(%esp,%eax,8), %ebp
; X86-SSE2-NEXT:    movl 88(%esp,%eax,8), %ebx
; X86-SSE2-NEXT:    movl 100(%esp,%eax,8), %edi
; X86-SSE2-NEXT:    movl 96(%esp,%eax,8), %esi
; X86-SSE2-NEXT:    movl 108(%esp,%eax,8), %edx
; X86-SSE2-NEXT:    movl 104(%esp,%eax,8), %ecx
; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE2-NEXT:    movl %ecx, 56(%eax)
; X86-SSE2-NEXT:    movl %edx, 60(%eax)
; X86-SSE2-NEXT:    movl %esi, 48(%eax)
; X86-SSE2-NEXT:    movl %edi, 52(%eax)
; X86-SSE2-NEXT:    movl %ebx, 40(%eax)
; X86-SSE2-NEXT:    movl %ebp, 44(%eax)
; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-SSE2-NEXT:    movl %ecx, 32(%eax)
; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-SSE2-NEXT:    movl %ecx, 36(%eax)
; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-SSE2-NEXT:    movl %ecx, 24(%eax)
; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-SSE2-NEXT:    movl %ecx, 28(%eax)
; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-SSE2-NEXT:    movl %ecx, 16(%eax)
; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-SSE2-NEXT:    movl %ecx, 20(%eax)
; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-SSE2-NEXT:    movl %ecx, 8(%eax)
; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-SSE2-NEXT:    movl %ecx, 12(%eax)
; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-SSE2-NEXT:    movl %ecx, (%eax)
; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-SSE2-NEXT:    movl %ecx, 4(%eax)
; X86-SSE2-NEXT:    addl $188, %esp
; X86-SSE2-NEXT:    popl %esi
; X86-SSE2-NEXT:    popl %edi
; X86-SSE2-NEXT:    popl %ebx
; X86-SSE2-NEXT:    popl %ebp
; X86-SSE2-NEXT:    retl
;
; X86-SSE42-LABEL: lshr_64bytes_qwordOff:
; X86-SSE42:       # %bb.0:
; X86-SSE42-NEXT:    subl $140, %esp
; X86-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-SSE42-NEXT:    movups (%edx), %xmm0
; X86-SSE42-NEXT:    movups 16(%edx), %xmm1
; X86-SSE42-NEXT:    movups 32(%edx), %xmm2
; X86-SSE42-NEXT:    movups 48(%edx), %xmm3
; X86-SSE42-NEXT:    movl (%ecx), %ecx
; X86-SSE42-NEXT:    xorps %xmm4, %xmm4
; X86-SSE42-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
; X86-SSE42-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
; X86-SSE42-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
; X86-SSE42-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
; X86-SSE42-NEXT:    movaps %xmm3, {{[0-9]+}}(%esp)
; X86-SSE42-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
; X86-SSE42-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
; X86-SSE42-NEXT:    movaps %xmm0, (%esp)
; X86-SSE42-NEXT:    andl $7, %ecx
; X86-SSE42-NEXT:    movups (%esp,%ecx,8), %xmm0
; X86-SSE42-NEXT:    movups 16(%esp,%ecx,8), %xmm1
; X86-SSE42-NEXT:    movups 32(%esp,%ecx,8), %xmm2
; X86-SSE42-NEXT:    movups 48(%esp,%ecx,8), %xmm3
; X86-SSE42-NEXT:    movups %xmm3, 48(%eax)
; X86-SSE42-NEXT:    movups %xmm2, 32(%eax)
; X86-SSE42-NEXT:    movups %xmm1, 16(%eax)
; X86-SSE42-NEXT:    movups %xmm0, (%eax)
; X86-SSE42-NEXT:    addl $140, %esp
; X86-SSE42-NEXT:    retl
;
; X86-AVX1-LABEL: lshr_64bytes_qwordOff:
; X86-AVX1:       # %bb.0:
; X86-AVX1-NEXT:    subl $140, %esp
; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-AVX1-NEXT:    vmovups (%edx), %ymm0
; X86-AVX1-NEXT:    vmovups 32(%edx), %ymm1
; X86-AVX1-NEXT:    movl (%ecx), %ecx
; X86-AVX1-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; X86-AVX1-NEXT:    vmovups %ymm2, {{[0-9]+}}(%esp)
; X86-AVX1-NEXT:    vmovups %ymm2, {{[0-9]+}}(%esp)
; X86-AVX1-NEXT:    vmovups %ymm1, {{[0-9]+}}(%esp)
; X86-AVX1-NEXT:    vmovups %ymm0, (%esp)
; X86-AVX1-NEXT:    andl $7, %ecx
; X86-AVX1-NEXT:    vmovups (%esp,%ecx,8), %xmm0
; X86-AVX1-NEXT:    vmovups 16(%esp,%ecx,8), %xmm1
; X86-AVX1-NEXT:    vmovups 32(%esp,%ecx,8), %xmm2
; X86-AVX1-NEXT:    vmovups 48(%esp,%ecx,8), %xmm3
; X86-AVX1-NEXT:    vmovups %xmm3, 48(%eax)
; X86-AVX1-NEXT:    vmovups %xmm2, 32(%eax)
; X86-AVX1-NEXT:    vmovups %xmm1, 16(%eax)
; X86-AVX1-NEXT:    vmovups %xmm0, (%eax)
; X86-AVX1-NEXT:    addl $140, %esp
; X86-AVX1-NEXT:    vzeroupper
; X86-AVX1-NEXT:    retl
;
; X86-AVX512-LABEL: lshr_64bytes_qwordOff:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    subl $140, %esp
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-AVX512-NEXT:    vmovups (%edx), %zmm0
; X86-AVX512-NEXT:    movl (%ecx), %ecx
; X86-AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X86-AVX512-NEXT:    vmovups %zmm1, {{[0-9]+}}(%esp)
; X86-AVX512-NEXT:    vmovups %zmm0, (%esp)
; X86-AVX512-NEXT:    andl $7, %ecx
; X86-AVX512-NEXT:    vmovups (%esp,%ecx,8), %xmm0
; X86-AVX512-NEXT:    vmovups 16(%esp,%ecx,8), %xmm1
; X86-AVX512-NEXT:    vmovups 32(%esp,%ecx,8), %xmm2
; X86-AVX512-NEXT:    vmovups 48(%esp,%ecx,8), %xmm3
; X86-AVX512-NEXT:    vmovups %xmm3, 48(%eax)
; X86-AVX512-NEXT:    vmovups %xmm2, 32(%eax)
; X86-AVX512-NEXT:    vmovups %xmm1, 16(%eax)
; X86-AVX512-NEXT:    vmovups %xmm0, (%eax)
; X86-AVX512-NEXT:    addl $140, %esp
; X86-AVX512-NEXT:    vzeroupper
; X86-AVX512-NEXT:    retl
  ; Both operands are loaded as full 512-bit integers with byte alignment.
  %src = load i512, ptr %src.ptr, align 1
  %qwordOff = load i512, ptr %qwordOff.ptr, align 1
  ; Scale the qword count to a bit count (<< 6 is * 64 bits per qword).
  %bitOff = shl i512 %qwordOff, 6
  ; The shift amount is therefore always a multiple of 64 bits.
  %res = lshr i512 %src, %bitOff
  store i512 %res, ptr %dst, align 1
  ret void
}

define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; FALLBACK0-LABEL: shl_64bytes:
; FALLBACK0:       # %bb.0:
; FALLBACK0-NEXT:    pushq %r15
; FALLBACK0-NEXT:    pushq %r14
; FALLBACK0-NEXT:    pushq %r13
; FALLBACK0-NEXT:    pushq %r12
; FALLBACK0-NEXT:    pushq %rbx
; FALLBACK0-NEXT:    movq (%rdi), %rax
; FALLBACK0-NEXT:    movq 8(%rdi), %rcx
; FALLBACK0-NEXT:    movq 16(%rdi), %r8
; FALLBACK0-NEXT:    movq 24(%rdi), %r9
; FALLBACK0-NEXT:    movq 32(%rdi), %r10
; FALLBACK0-NEXT:    movq 40(%rdi), %r11
; FALLBACK0-NEXT:    movq 48(%rdi), %rbx
; FALLBACK0-NEXT:    movq 56(%rdi), %rdi
; FALLBACK0-NEXT:    movl (%rsi), %esi
; FALLBACK0-NEXT:    xorps %xmm0, %xmm0
; FALLBACK0-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; FALLBACK0-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; FALLBACK0-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; FALLBACK0-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; FALLBACK0-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK0-NEXT:    movq %rbx, -{{[0-9]+}}(%rsp)
; FALLBACK0-NEXT:    movq %r11, -{{[0-9]+}}(%rsp)
; FALLBACK0-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
; FALLBACK0-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
; FALLBACK0-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
; FALLBACK0-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
; FALLBACK0-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
; FALLBACK0-NEXT:    leal (,%rsi,8), %eax
; FALLBACK0-NEXT:    andl $56, %eax
; FALLBACK0-NEXT:    andl $56, %esi
; FALLBACK0-NEXT:    negl %esi
; FALLBACK0-NEXT:    movslq %esi, %rbx
; FALLBACK0-NEXT:    movq -64(%rsp,%rbx), %r8
; FALLBACK0-NEXT:    movq -56(%rsp,%rbx), %rdi
; FALLBACK0-NEXT:    movq %rdi, %r10
; FALLBACK0-NEXT:    movl %eax, %ecx
; FALLBACK0-NEXT:    shlq %cl, %r10
; FALLBACK0-NEXT:    movl %eax, %esi
; FALLBACK0-NEXT:    notb %sil
; FALLBACK0-NEXT:    movq %r8, %r9
; FALLBACK0-NEXT:    shrq %r9
; FALLBACK0-NEXT:    movl %esi, %ecx
; FALLBACK0-NEXT:    shrq %cl, %r9
; FALLBACK0-NEXT:    orq %r10, %r9
; FALLBACK0-NEXT:    movq -40(%rsp,%rbx), %r10
; FALLBACK0-NEXT:    movq %r10, %r14
; FALLBACK0-NEXT:    movl %eax, %ecx
; FALLBACK0-NEXT:    shlq %cl, %r14
; FALLBACK0-NEXT:    movq -48(%rsp,%rbx), %r15
; FALLBACK0-NEXT:    movq %r15, %r11
; FALLBACK0-NEXT:    shrq %r11
; FALLBACK0-NEXT:    movl %esi, %ecx
; FALLBACK0-NEXT:    shrq %cl, %r11
; FALLBACK0-NEXT:    orq %r14, %r11
; FALLBACK0-NEXT:    movl %eax, %ecx
; FALLBACK0-NEXT:    shlq %cl, %r15
; FALLBACK0-NEXT:    shrq %rdi
; FALLBACK0-NEXT:    movl %esi, %ecx
; FALLBACK0-NEXT:    shrq %cl, %rdi
; FALLBACK0-NEXT:    orq %r15, %rdi
; FALLBACK0-NEXT:    movq -24(%rsp,%rbx), %r14
; FALLBACK0-NEXT:    movq %r14, %r12
; FALLBACK0-NEXT:    movl %eax, %ecx
; FALLBACK0-NEXT:    shlq %cl, %r12
; FALLBACK0-NEXT:    movq -32(%rsp,%rbx), %r13
; FALLBACK0-NEXT:    movq %r13, %r15
; FALLBACK0-NEXT:    shrq %r15
; FALLBACK0-NEXT:    movl %esi, %ecx
; FALLBACK0-NEXT:    shrq %cl, %r15
; FALLBACK0-NEXT:    orq %r12, %r15
; FALLBACK0-NEXT:    movl %eax, %ecx
; FALLBACK0-NEXT:    shlq %cl, %r13
; FALLBACK0-NEXT:    shrq %r10
; FALLBACK0-NEXT:    movl %esi, %ecx
; FALLBACK0-NEXT:    shrq %cl, %r10
; FALLBACK0-NEXT:    orq %r13, %r10
; FALLBACK0-NEXT:    movq -8(%rsp,%rbx), %r12
; FALLBACK0-NEXT:    movl %eax, %ecx
; FALLBACK0-NEXT:    shlq %cl, %r12
; FALLBACK0-NEXT:    movq -16(%rsp,%rbx), %rbx
; FALLBACK0-NEXT:    movq %rbx, %r13
; FALLBACK0-NEXT:    shrq %r13
; FALLBACK0-NEXT:    movl %esi, %ecx
; FALLBACK0-NEXT:    shrq %cl, %r13
; FALLBACK0-NEXT:    orq %r12, %r13
; FALLBACK0-NEXT:    movl %eax, %ecx
; FALLBACK0-NEXT:    shlq %cl, %rbx
; FALLBACK0-NEXT:    shrq %r14
; FALLBACK0-NEXT:    movl %esi, %ecx
; FALLBACK0-NEXT:    shrq %cl, %r14
; FALLBACK0-NEXT:    orq %rbx, %r14
; FALLBACK0-NEXT:    movl %eax, %ecx
; FALLBACK0-NEXT:    shlq %cl, %r8
; FALLBACK0-NEXT:    movq %r8, (%rdx)
; FALLBACK0-NEXT:    movq %r14, 48(%rdx)
; FALLBACK0-NEXT:    movq %r13, 56(%rdx)
; FALLBACK0-NEXT:    movq %r10, 32(%rdx)
; FALLBACK0-NEXT:    movq %r15, 40(%rdx)
; FALLBACK0-NEXT:    movq %rdi, 16(%rdx)
; FALLBACK0-NEXT:    movq %r11, 24(%rdx)
; FALLBACK0-NEXT:    movq %r9, 8(%rdx)
; FALLBACK0-NEXT:    popq %rbx
; FALLBACK0-NEXT:    popq %r12
; FALLBACK0-NEXT:    popq %r13
; FALLBACK0-NEXT:    popq %r14
; FALLBACK0-NEXT:    popq %r15
; FALLBACK0-NEXT:    retq
;
; FALLBACK1-LABEL: shl_64bytes:
; FALLBACK1:       # %bb.0:
; FALLBACK1-NEXT:    pushq %r14
; FALLBACK1-NEXT:    pushq %rbx
; FALLBACK1-NEXT:    pushq %rax
; FALLBACK1-NEXT:    movq (%rdi), %rax
; FALLBACK1-NEXT:    movq 8(%rdi), %rcx
; FALLBACK1-NEXT:    movq 16(%rdi), %r8
; FALLBACK1-NEXT:    movq 24(%rdi), %r9
; FALLBACK1-NEXT:    movq 32(%rdi), %r10
; FALLBACK1-NEXT:    movq 40(%rdi), %r11
; FALLBACK1-NEXT:    movq 48(%rdi), %rbx
; FALLBACK1-NEXT:    movq 56(%rdi), %rdi
; FALLBACK1-NEXT:    movl (%rsi), %esi
; FALLBACK1-NEXT:    xorps %xmm0, %xmm0
; FALLBACK1-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; FALLBACK1-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; FALLBACK1-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; FALLBACK1-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; FALLBACK1-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK1-NEXT:    movq %rbx, -{{[0-9]+}}(%rsp)
; FALLBACK1-NEXT:    movq %r11, -{{[0-9]+}}(%rsp)
; FALLBACK1-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
; FALLBACK1-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
; FALLBACK1-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
; FALLBACK1-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
; FALLBACK1-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
; FALLBACK1-NEXT:    leal (,%rsi,8), %ecx
; FALLBACK1-NEXT:    andl $56, %ecx
; FALLBACK1-NEXT:    andl $56, %esi
; FALLBACK1-NEXT:    negl %esi
; FALLBACK1-NEXT:    movslq %esi, %r9
; FALLBACK1-NEXT:    movq -48(%rsp,%r9), %rax
; FALLBACK1-NEXT:    movq -40(%rsp,%r9), %r10
; FALLBACK1-NEXT:    movq %r10, %rsi
; FALLBACK1-NEXT:    shldq %cl, %rax, %rsi
; FALLBACK1-NEXT:    movq -64(%rsp,%r9), %r8
; FALLBACK1-NEXT:    movq -56(%rsp,%r9), %rdi
; FALLBACK1-NEXT:    shldq %cl, %rdi, %rax
; FALLBACK1-NEXT:    movq -32(%rsp,%r9), %r11
; FALLBACK1-NEXT:    movq -24(%rsp,%r9), %rbx
; FALLBACK1-NEXT:    movq %rbx, %r14
; FALLBACK1-NEXT:    shldq %cl, %r11, %r14
; FALLBACK1-NEXT:    shldq %cl, %r10, %r11
; FALLBACK1-NEXT:    movq -16(%rsp,%r9), %r10
; FALLBACK1-NEXT:    movq -8(%rsp,%r9), %r9
; FALLBACK1-NEXT:    shldq %cl, %r10, %r9
; FALLBACK1-NEXT:    shldq %cl, %rbx, %r10
; FALLBACK1-NEXT:    shldq %cl, %r8, %rdi
; FALLBACK1-NEXT:    # kill: def $cl killed $cl killed $ecx
; FALLBACK1-NEXT:    shlq %cl, %r8
; FALLBACK1-NEXT:    movq %r10, 48(%rdx)
; FALLBACK1-NEXT:    movq %r9, 56(%rdx)
; FALLBACK1-NEXT:    movq %r11, 32(%rdx)
; FALLBACK1-NEXT:    movq %r14, 40(%rdx)
; FALLBACK1-NEXT:    movq %rax, 16(%rdx)
; FALLBACK1-NEXT:    movq %rsi, 24(%rdx)
; FALLBACK1-NEXT:    movq %r8, (%rdx)
; FALLBACK1-NEXT:    movq %rdi, 8(%rdx)
; FALLBACK1-NEXT:    addq $8, %rsp
; FALLBACK1-NEXT:    popq %rbx
; FALLBACK1-NEXT:    popq %r14
; FALLBACK1-NEXT:    retq
;
; FALLBACK2-LABEL: shl_64bytes:
; FALLBACK2:       # %bb.0:
; FALLBACK2-NEXT:    pushq %rbp
; FALLBACK2-NEXT:    pushq %r15
; FALLBACK2-NEXT:    pushq %r14
; FALLBACK2-NEXT:    pushq %r13
; FALLBACK2-NEXT:    pushq %r12
; FALLBACK2-NEXT:    pushq %rbx
; FALLBACK2-NEXT:    pushq %rax
; FALLBACK2-NEXT:    movq (%rdi), %rax
; FALLBACK2-NEXT:    movq 8(%rdi), %rcx
; FALLBACK2-NEXT:    movq 16(%rdi), %r8
; FALLBACK2-NEXT:    movq 24(%rdi), %r9
; FALLBACK2-NEXT:    movq 32(%rdi), %r10
; FALLBACK2-NEXT:    movq 40(%rdi), %r11
; FALLBACK2-NEXT:    movq 48(%rdi), %rbx
; FALLBACK2-NEXT:    movq 56(%rdi), %rdi
; FALLBACK2-NEXT:    movl (%rsi), %esi
; FALLBACK2-NEXT:    xorps %xmm0, %xmm0
; FALLBACK2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; FALLBACK2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; FALLBACK2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; FALLBACK2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; FALLBACK2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK2-NEXT:    movq %rbx, -{{[0-9]+}}(%rsp)
; FALLBACK2-NEXT:    movq %r11, -{{[0-9]+}}(%rsp)
; FALLBACK2-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
; FALLBACK2-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
; FALLBACK2-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
; FALLBACK2-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
; FALLBACK2-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
; FALLBACK2-NEXT:    leal (,%rsi,8), %eax
; FALLBACK2-NEXT:    andl $56, %eax
; FALLBACK2-NEXT:    andl $56, %esi
; FALLBACK2-NEXT:    negl %esi
; FALLBACK2-NEXT:    movslq %esi, %rsi
; FALLBACK2-NEXT:    movq -64(%rsp,%rsi), %r10
; FALLBACK2-NEXT:    movq -56(%rsp,%rsi), %rcx
; FALLBACK2-NEXT:    shlxq %rax, %rcx, %r9
; FALLBACK2-NEXT:    movq -40(%rsp,%rsi), %rdi
; FALLBACK2-NEXT:    shlxq %rax, %rdi, %r11
; FALLBACK2-NEXT:    movq -48(%rsp,%rsi), %r14
; FALLBACK2-NEXT:    shlxq %rax, %r14, %rbx
; FALLBACK2-NEXT:    movq -24(%rsp,%rsi), %r8
; FALLBACK2-NEXT:    shlxq %rax, %r8, %r15
; FALLBACK2-NEXT:    shlxq %rax, %r10, %r12
; FALLBACK2-NEXT:    movl %eax, %r13d
; FALLBACK2-NEXT:    notb %r13b
; FALLBACK2-NEXT:    shrq %r10
; FALLBACK2-NEXT:    shrxq %r13, %r10, %r10
; FALLBACK2-NEXT:    orq %r9, %r10
; FALLBACK2-NEXT:    movq -32(%rsp,%rsi), %r9
; FALLBACK2-NEXT:    shlxq %rax, %r9, %rbp
; FALLBACK2-NEXT:    shrq %r14
; FALLBACK2-NEXT:    shrxq %r13, %r14, %r14
; FALLBACK2-NEXT:    orq %r11, %r14
; FALLBACK2-NEXT:    shlxq %rax, -8(%rsp,%rsi), %r11
; FALLBACK2-NEXT:    movq -16(%rsp,%rsi), %rsi
; FALLBACK2-NEXT:    shlxq %rax, %rsi, %rax
; FALLBACK2-NEXT:    shrq %rcx
; FALLBACK2-NEXT:    shrxq %r13, %rcx, %rcx
; FALLBACK2-NEXT:    orq %rbx, %rcx
; FALLBACK2-NEXT:    shrq %r9
; FALLBACK2-NEXT:    shrxq %r13, %r9, %r9
; FALLBACK2-NEXT:    orq %r15, %r9
; FALLBACK2-NEXT:    shrq %rdi
; FALLBACK2-NEXT:    shrxq %r13, %rdi, %rdi
; FALLBACK2-NEXT:    orq %rbp, %rdi
; FALLBACK2-NEXT:    shrq %rsi
; FALLBACK2-NEXT:    shrxq %r13, %rsi, %rsi
; FALLBACK2-NEXT:    orq %r11, %rsi
; FALLBACK2-NEXT:    shrq %r8
; FALLBACK2-NEXT:    shrxq %r13, %r8, %r8
; FALLBACK2-NEXT:    orq %rax, %r8
; FALLBACK2-NEXT:    movq %r12, (%rdx)
; FALLBACK2-NEXT:    movq %r8, 48(%rdx)
; FALLBACK2-NEXT:    movq %rsi, 56(%rdx)
; FALLBACK2-NEXT:    movq %rdi, 32(%rdx)
; FALLBACK2-NEXT:    movq %r9, 40(%rdx)
; FALLBACK2-NEXT:    movq %rcx, 16(%rdx)
; FALLBACK2-NEXT:    movq %r14, 24(%rdx)
; FALLBACK2-NEXT:    movq %r10, 8(%rdx)
; FALLBACK2-NEXT:    addq $8, %rsp
; FALLBACK2-NEXT:    popq %rbx
; FALLBACK2-NEXT:    popq %r12
; FALLBACK2-NEXT:    popq %r13
; FALLBACK2-NEXT:    popq %r14
; FALLBACK2-NEXT:    popq %r15
; FALLBACK2-NEXT:    popq %rbp
; FALLBACK2-NEXT:    retq
;
; FALLBACK3-LABEL: shl_64bytes:
; FALLBACK3:       # %bb.0:
; FALLBACK3-NEXT:    pushq %r14
; FALLBACK3-NEXT:    pushq %rbx
; FALLBACK3-NEXT:    pushq %rax
; FALLBACK3-NEXT:    movq (%rdi), %rax
; FALLBACK3-NEXT:    movq 8(%rdi), %rcx
; FALLBACK3-NEXT:    movq 16(%rdi), %r8
; FALLBACK3-NEXT:    movq 24(%rdi), %r9
; FALLBACK3-NEXT:    movq 32(%rdi), %r10
; FALLBACK3-NEXT:    movq 40(%rdi), %r11
; FALLBACK3-NEXT:    movq 48(%rdi), %rbx
; FALLBACK3-NEXT:    movq 56(%rdi), %rdi
; FALLBACK3-NEXT:    movl (%rsi), %esi
; FALLBACK3-NEXT:    xorps %xmm0, %xmm0
; FALLBACK3-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; FALLBACK3-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; FALLBACK3-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; FALLBACK3-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; FALLBACK3-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK3-NEXT:    movq %rbx, -{{[0-9]+}}(%rsp)
; FALLBACK3-NEXT:    movq %r11, -{{[0-9]+}}(%rsp)
; FALLBACK3-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
; FALLBACK3-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
; FALLBACK3-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
; FALLBACK3-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
; FALLBACK3-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
; FALLBACK3-NEXT:    leal (,%rsi,8), %ecx
; FALLBACK3-NEXT:    andl $56, %ecx
; FALLBACK3-NEXT:    andl $56, %esi
; FALLBACK3-NEXT:    negl %esi
; FALLBACK3-NEXT:    movslq %esi, %r8
; FALLBACK3-NEXT:    movq -48(%rsp,%r8), %rax
; FALLBACK3-NEXT:    movq -40(%rsp,%r8), %r9
; FALLBACK3-NEXT:    movq %r9, %rsi
; FALLBACK3-NEXT:    shldq %cl, %rax, %rsi
; FALLBACK3-NEXT:    movq -64(%rsp,%r8), %r10
; FALLBACK3-NEXT:    movq -56(%rsp,%r8), %rdi
; FALLBACK3-NEXT:    shldq %cl, %rdi, %rax
; FALLBACK3-NEXT:    movq -32(%rsp,%r8), %r11
; FALLBACK3-NEXT:    movq -24(%rsp,%r8), %rbx
; FALLBACK3-NEXT:    movq %rbx, %r14
; FALLBACK3-NEXT:    shldq %cl, %r11, %r14
; FALLBACK3-NEXT:    shldq %cl, %r9, %r11
; FALLBACK3-NEXT:    movq -16(%rsp,%r8), %r9
; FALLBACK3-NEXT:    movq -8(%rsp,%r8), %r8
; FALLBACK3-NEXT:    shldq %cl, %r9, %r8
; FALLBACK3-NEXT:    shldq %cl, %rbx, %r9
; FALLBACK3-NEXT:    shldq %cl, %r10, %rdi
; FALLBACK3-NEXT:    shlxq %rcx, %r10, %rcx
; FALLBACK3-NEXT:    movq %r9, 48(%rdx)
; FALLBACK3-NEXT:    movq %r8, 56(%rdx)
; FALLBACK3-NEXT:    movq %r11, 32(%rdx)
; FALLBACK3-NEXT:    movq %r14, 40(%rdx)
; FALLBACK3-NEXT:    movq %rax, 16(%rdx)
; FALLBACK3-NEXT:    movq %rsi, 24(%rdx)
; FALLBACK3-NEXT:    movq %rcx, (%rdx)
; FALLBACK3-NEXT:    movq %rdi, 8(%rdx)
; FALLBACK3-NEXT:    addq $8, %rsp
; FALLBACK3-NEXT:    popq %rbx
; FALLBACK3-NEXT:    popq %r14
; FALLBACK3-NEXT:    retq
;
; FALLBACK4-LABEL: shl_64bytes:
; FALLBACK4:       # %bb.0:
; FALLBACK4-NEXT:    pushq %r15
; FALLBACK4-NEXT:    pushq %r14
; FALLBACK4-NEXT:    pushq %r13
; FALLBACK4-NEXT:    pushq %r12
; FALLBACK4-NEXT:    pushq %rbx
; FALLBACK4-NEXT:    movups (%rdi), %xmm0
; FALLBACK4-NEXT:    movups 16(%rdi), %xmm1
; FALLBACK4-NEXT:    movups 32(%rdi), %xmm2
; FALLBACK4-NEXT:    movups 48(%rdi), %xmm3
; FALLBACK4-NEXT:    movl (%rsi), %ecx
; FALLBACK4-NEXT:    xorps %xmm4, %xmm4
; FALLBACK4-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
; FALLBACK4-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
; FALLBACK4-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
; FALLBACK4-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
; FALLBACK4-NEXT:    movaps %xmm3, -{{[0-9]+}}(%rsp)
; FALLBACK4-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
; FALLBACK4-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
; FALLBACK4-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; FALLBACK4-NEXT:    leal (,%rcx,8), %eax
; FALLBACK4-NEXT:    andl $56, %eax
; FALLBACK4-NEXT:    andl $56, %ecx
; FALLBACK4-NEXT:    negl %ecx
; FALLBACK4-NEXT:    movslq %ecx, %r9
; FALLBACK4-NEXT:    movq -24(%rsp,%r9), %rdi
; FALLBACK4-NEXT:    movq %rdi, %r10
; FALLBACK4-NEXT:    movl %eax, %ecx
; FALLBACK4-NEXT:    shlq %cl, %r10
; FALLBACK4-NEXT:    movl %eax, %esi
; FALLBACK4-NEXT:    notb %sil
; FALLBACK4-NEXT:    movq -32(%rsp,%r9), %r11
; FALLBACK4-NEXT:    movq %r11, %r8
; FALLBACK4-NEXT:    shrq %r8
; FALLBACK4-NEXT:    movl %esi, %ecx
; FALLBACK4-NEXT:    shrq %cl, %r8
; FALLBACK4-NEXT:    orq %r10, %r8
; FALLBACK4-NEXT:    movl %eax, %ecx
; FALLBACK4-NEXT:    shlq %cl, %r11
; FALLBACK4-NEXT:    movq -40(%rsp,%r9), %rbx
; FALLBACK4-NEXT:    movq %rbx, %r10
; FALLBACK4-NEXT:    shrq %r10
; FALLBACK4-NEXT:    movl %esi, %ecx
; FALLBACK4-NEXT:    shrq %cl, %r10
; FALLBACK4-NEXT:    orq %r11, %r10
; FALLBACK4-NEXT:    movl %eax, %ecx
; FALLBACK4-NEXT:    shlq %cl, %rbx
; FALLBACK4-NEXT:    movq -48(%rsp,%r9), %r15
; FALLBACK4-NEXT:    movq %r15, %r11
; FALLBACK4-NEXT:    shrq %r11
; FALLBACK4-NEXT:    movl %esi, %ecx
; FALLBACK4-NEXT:    shrq %cl, %r11
; FALLBACK4-NEXT:    orq %rbx, %r11
; FALLBACK4-NEXT:    movl %eax, %ecx
; FALLBACK4-NEXT:    shlq %cl, %r15
; FALLBACK4-NEXT:    movq -64(%rsp,%r9), %r14
; FALLBACK4-NEXT:    movq -56(%rsp,%r9), %r12
; FALLBACK4-NEXT:    movq %r12, %rbx
; FALLBACK4-NEXT:    shrq %rbx
; FALLBACK4-NEXT:    movl %esi, %ecx
; FALLBACK4-NEXT:    shrq %cl, %rbx
; FALLBACK4-NEXT:    orq %r15, %rbx
; FALLBACK4-NEXT:    movl %eax, %ecx
; FALLBACK4-NEXT:    shlq %cl, %r12
; FALLBACK4-NEXT:    movq %r14, %r15
; FALLBACK4-NEXT:    shrq %r15
; FALLBACK4-NEXT:    movl %esi, %ecx
; FALLBACK4-NEXT:    shrq %cl, %r15
; FALLBACK4-NEXT:    orq %r12, %r15
; FALLBACK4-NEXT:    movq -16(%rsp,%r9), %r12
; FALLBACK4-NEXT:    movq %r12, %r13
; FALLBACK4-NEXT:    movl %eax, %ecx
; FALLBACK4-NEXT:    shlq %cl, %r13
; FALLBACK4-NEXT:    shrq %rdi
; FALLBACK4-NEXT:    movl %esi, %ecx
; FALLBACK4-NEXT:    shrq %cl, %rdi
; FALLBACK4-NEXT:    orq %r13, %rdi
; FALLBACK4-NEXT:    movq -8(%rsp,%r9), %r9
; FALLBACK4-NEXT:    movl %eax, %ecx
; FALLBACK4-NEXT:    shlq %cl, %r9
; FALLBACK4-NEXT:    shrq %r12
; FALLBACK4-NEXT:    movl %esi, %ecx
; FALLBACK4-NEXT:    shrq %cl, %r12
; FALLBACK4-NEXT:    orq %r9, %r12
; FALLBACK4-NEXT:    movl %eax, %ecx
; FALLBACK4-NEXT:    shlq %cl, %r14
; FALLBACK4-NEXT:    movq %r14, (%rdx)
; FALLBACK4-NEXT:    movq %r12, 56(%rdx)
; FALLBACK4-NEXT:    movq %rdi, 48(%rdx)
; FALLBACK4-NEXT:    movq %r15, 8(%rdx)
; FALLBACK4-NEXT:    movq %rbx, 16(%rdx)
; FALLBACK4-NEXT:    movq %r11, 24(%rdx)
; FALLBACK4-NEXT:    movq %r10, 32(%rdx)
; FALLBACK4-NEXT:    movq %r8, 40(%rdx)
; FALLBACK4-NEXT:    popq %rbx
; FALLBACK4-NEXT:    popq %r12
; FALLBACK4-NEXT:    popq %r13
; FALLBACK4-NEXT:    popq %r14
; FALLBACK4-NEXT:    popq %r15
; FALLBACK4-NEXT:    retq
;
; FALLBACK5-LABEL: shl_64bytes:
; FALLBACK5:       # %bb.0:
; FALLBACK5-NEXT:    pushq %r15
; FALLBACK5-NEXT:    pushq %r14
; FALLBACK5-NEXT:    pushq %rbx
; FALLBACK5-NEXT:    movups (%rdi), %xmm0
; FALLBACK5-NEXT:    movups 16(%rdi), %xmm1
; FALLBACK5-NEXT:    movups 32(%rdi), %xmm2
; FALLBACK5-NEXT:    movups 48(%rdi), %xmm3
; FALLBACK5-NEXT:    movl (%rsi), %eax
; FALLBACK5-NEXT:    xorps %xmm4, %xmm4
; FALLBACK5-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
; FALLBACK5-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
; FALLBACK5-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
; FALLBACK5-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
; FALLBACK5-NEXT:    movaps %xmm3, -{{[0-9]+}}(%rsp)
; FALLBACK5-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
; FALLBACK5-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
; FALLBACK5-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; FALLBACK5-NEXT:    leal (,%rax,8), %ecx
; FALLBACK5-NEXT:    andl $56, %ecx
; FALLBACK5-NEXT:    andl $56, %eax
; FALLBACK5-NEXT:    negl %eax
; FALLBACK5-NEXT:    movslq %eax, %r8
; FALLBACK5-NEXT:    movq -32(%rsp,%r8), %rax
; FALLBACK5-NEXT:    movq -24(%rsp,%r8), %r9
; FALLBACK5-NEXT:    movq %r9, %rsi
; FALLBACK5-NEXT:    shldq %cl, %rax, %rsi
; FALLBACK5-NEXT:    movq -40(%rsp,%r8), %rdi
; FALLBACK5-NEXT:    shldq %cl, %rdi, %rax
; FALLBACK5-NEXT:    movq -48(%rsp,%r8), %r10
; FALLBACK5-NEXT:    shldq %cl, %r10, %rdi
; FALLBACK5-NEXT:    movq -64(%rsp,%r8), %r11
; FALLBACK5-NEXT:    movq -56(%rsp,%r8), %rbx
; FALLBACK5-NEXT:    shldq %cl, %rbx, %r10
; FALLBACK5-NEXT:    movq -16(%rsp,%r8), %r14
; FALLBACK5-NEXT:    movq %r14, %r15
; FALLBACK5-NEXT:    shldq %cl, %r9, %r15
; FALLBACK5-NEXT:    movq -8(%rsp,%r8), %r8
; FALLBACK5-NEXT:    shldq %cl, %r14, %r8
; FALLBACK5-NEXT:    movq %r11, %r9
; FALLBACK5-NEXT:    shlq %cl, %r9
; FALLBACK5-NEXT:    # kill: def $cl killed $cl killed $ecx
; FALLBACK5-NEXT:    shldq %cl, %r11, %rbx
; FALLBACK5-NEXT:    movq %r8, 56(%rdx)
; FALLBACK5-NEXT:    movq %r15, 48(%rdx)
; FALLBACK5-NEXT:    movq %rbx, 8(%rdx)
; FALLBACK5-NEXT:    movq %r10, 16(%rdx)
; FALLBACK5-NEXT:    movq %rdi, 24(%rdx)
; FALLBACK5-NEXT:    movq %rax, 32(%rdx)
; FALLBACK5-NEXT:    movq %rsi, 40(%rdx)
; FALLBACK5-NEXT:    movq %r9, (%rdx)
; FALLBACK5-NEXT:    popq %rbx
; FALLBACK5-NEXT:    popq %r14
; FALLBACK5-NEXT:    popq %r15
; FALLBACK5-NEXT:    retq
;
; FALLBACK6-LABEL: shl_64bytes:
; FALLBACK6:       # %bb.0:
; FALLBACK6-NEXT:    pushq %rbp
; FALLBACK6-NEXT:    pushq %r15
; FALLBACK6-NEXT:    pushq %r14
; FALLBACK6-NEXT:    pushq %r13
; FALLBACK6-NEXT:    pushq %r12
; FALLBACK6-NEXT:    pushq %rbx
; FALLBACK6-NEXT:    subq $24, %rsp
; FALLBACK6-NEXT:    movups (%rdi), %xmm0
; FALLBACK6-NEXT:    movups 16(%rdi), %xmm1
; FALLBACK6-NEXT:    movups 32(%rdi), %xmm2
; FALLBACK6-NEXT:    movups 48(%rdi), %xmm3
; FALLBACK6-NEXT:    movl (%rsi), %eax
; FALLBACK6-NEXT:    xorps %xmm4, %xmm4
; FALLBACK6-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
; FALLBACK6-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
; FALLBACK6-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
; FALLBACK6-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
; FALLBACK6-NEXT:    movaps %xmm3, (%rsp)
; FALLBACK6-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
; FALLBACK6-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
; FALLBACK6-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; FALLBACK6-NEXT:    leal (,%rax,8), %ecx
; FALLBACK6-NEXT:    andl $56, %ecx
; FALLBACK6-NEXT:    andl $56, %eax
; FALLBACK6-NEXT:    negl %eax
; FALLBACK6-NEXT:    movslq %eax, %rsi
; FALLBACK6-NEXT:    movq -8(%rsp,%rsi), %rax
; FALLBACK6-NEXT:    shlxq %rcx, %rax, %r12
; FALLBACK6-NEXT:    movq -16(%rsp,%rsi), %rdi
; FALLBACK6-NEXT:    shlxq %rcx, %rdi, %r15
; FALLBACK6-NEXT:    movq -24(%rsp,%rsi), %r13
; FALLBACK6-NEXT:    shlxq %rcx, %r13, %r8
; FALLBACK6-NEXT:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; FALLBACK6-NEXT:    movq -32(%rsp,%rsi), %r11
; FALLBACK6-NEXT:    shlxq %rcx, %r11, %r10
; FALLBACK6-NEXT:    movq -40(%rsp,%rsi), %r14
; FALLBACK6-NEXT:    shlxq %rcx, %r14, %rbx
; FALLBACK6-NEXT:    movl %ecx, %r9d
; FALLBACK6-NEXT:    notb %r9b
; FALLBACK6-NEXT:    shrq %rdi
; FALLBACK6-NEXT:    shrxq %r9, %rdi, %rdi
; FALLBACK6-NEXT:    orq %r12, %rdi
; FALLBACK6-NEXT:    movq (%rsp,%rsi), %rbp
; FALLBACK6-NEXT:    shlxq %rcx, %rbp, %r8
; FALLBACK6-NEXT:    shrq %r13
; FALLBACK6-NEXT:    shrxq %r9, %r13, %r12
; FALLBACK6-NEXT:    orq %r15, %r12
; FALLBACK6-NEXT:    shlxq %rcx, 8(%rsp,%rsi), %r15
; FALLBACK6-NEXT:    movq -48(%rsp,%rsi), %rsi
; FALLBACK6-NEXT:    shlxq %rcx, %rsi, %rcx
; FALLBACK6-NEXT:    shrq %r11
; FALLBACK6-NEXT:    shrxq %r9, %r11, %r11
; FALLBACK6-NEXT:    orq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload
; FALLBACK6-NEXT:    shrq %r14
; FALLBACK6-NEXT:    shrxq %r9, %r14, %r14
; FALLBACK6-NEXT:    orq %r10, %r14
; FALLBACK6-NEXT:    shrq %rsi
; FALLBACK6-NEXT:    shrxq %r9, %rsi, %rsi
; FALLBACK6-NEXT:    orq %rbx, %rsi
; FALLBACK6-NEXT:    shrq %rax
; FALLBACK6-NEXT:    shrxq %r9, %rax, %rax
; FALLBACK6-NEXT:    orq %r8, %rax
; FALLBACK6-NEXT:    shrq %rbp
; FALLBACK6-NEXT:    shrxq %r9, %rbp, %r8
; FALLBACK6-NEXT:    orq %r15, %r8
; FALLBACK6-NEXT:    movq %rcx, (%rdx)
; FALLBACK6-NEXT:    movq %r8, 56(%rdx)
; FALLBACK6-NEXT:    movq %rax, 48(%rdx)
; FALLBACK6-NEXT:    movq %rsi, 8(%rdx)
; FALLBACK6-NEXT:    movq %r14, 16(%rdx)
; FALLBACK6-NEXT:    movq %r11, 24(%rdx)
; FALLBACK6-NEXT:    movq %r12, 32(%rdx)
; FALLBACK6-NEXT:    movq %rdi, 40(%rdx)
; FALLBACK6-NEXT:    addq $24, %rsp
; FALLBACK6-NEXT:    popq %rbx
; FALLBACK6-NEXT:    popq %r12
; FALLBACK6-NEXT:    popq %r13
; FALLBACK6-NEXT:    popq %r14
; FALLBACK6-NEXT:    popq %r15
; FALLBACK6-NEXT:    popq %rbp
; FALLBACK6-NEXT:    retq
;
; FALLBACK7-LABEL: shl_64bytes:
; FALLBACK7:       # %bb.0:
; FALLBACK7-NEXT:    pushq %r15
; FALLBACK7-NEXT:    pushq %r14
; FALLBACK7-NEXT:    pushq %rbx
; FALLBACK7-NEXT:    movups (%rdi), %xmm0
; FALLBACK7-NEXT:    movups 16(%rdi), %xmm1
; FALLBACK7-NEXT:    movups 32(%rdi), %xmm2
; FALLBACK7-NEXT:    movups 48(%rdi), %xmm3
; FALLBACK7-NEXT:    movl (%rsi), %eax
; FALLBACK7-NEXT:    xorps %xmm4, %xmm4
; FALLBACK7-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
; FALLBACK7-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
; FALLBACK7-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
; FALLBACK7-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
; FALLBACK7-NEXT:    movaps %xmm3, -{{[0-9]+}}(%rsp)
; FALLBACK7-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
; FALLBACK7-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
; FALLBACK7-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; FALLBACK7-NEXT:    leal (,%rax,8), %ecx
; FALLBACK7-NEXT:    andl $56, %ecx
; FALLBACK7-NEXT:    andl $56, %eax
; FALLBACK7-NEXT:    negl %eax
; FALLBACK7-NEXT:    movslq %eax, %r8
; FALLBACK7-NEXT:    movq -32(%rsp,%r8), %rax
; FALLBACK7-NEXT:    movq -24(%rsp,%r8), %r9
; FALLBACK7-NEXT:    movq %r9, %rsi
; FALLBACK7-NEXT:    shldq %cl, %rax, %rsi
; FALLBACK7-NEXT:    movq -40(%rsp,%r8), %rdi
; FALLBACK7-NEXT:    shldq %cl, %rdi, %rax
; FALLBACK7-NEXT:    movq -48(%rsp,%r8), %r10
; FALLBACK7-NEXT:    shldq %cl, %r10, %rdi
; FALLBACK7-NEXT:    movq -64(%rsp,%r8), %r11
; FALLBACK7-NEXT:    movq -56(%rsp,%r8), %rbx
; FALLBACK7-NEXT:    shldq %cl, %rbx, %r10
; FALLBACK7-NEXT:    movq -16(%rsp,%r8), %r14
; FALLBACK7-NEXT:    movq %r14, %r15
; FALLBACK7-NEXT:    shldq %cl, %r9, %r15
; FALLBACK7-NEXT:    movq -8(%rsp,%r8), %r8
; FALLBACK7-NEXT:    shldq %cl, %r14, %r8
; FALLBACK7-NEXT:    shlxq %rcx, %r11, %r9
; FALLBACK7-NEXT:    # kill: def $cl killed $cl killed $rcx
; FALLBACK7-NEXT:    shldq %cl, %r11, %rbx
; FALLBACK7-NEXT:    movq %r8, 56(%rdx)
; FALLBACK7-NEXT:    movq %r15, 48(%rdx)
; FALLBACK7-NEXT:    movq %rbx, 8(%rdx)
; FALLBACK7-NEXT:    movq %r10, 16(%rdx)
; FALLBACK7-NEXT:    movq %rdi, 24(%rdx)
; FALLBACK7-NEXT:    movq %rax, 32(%rdx)
; FALLBACK7-NEXT:    movq %rsi, 40(%rdx)
; FALLBACK7-NEXT:    movq %r9, (%rdx)
; FALLBACK7-NEXT:    popq %rbx
; FALLBACK7-NEXT:    popq %r14
; FALLBACK7-NEXT:    popq %r15
; FALLBACK7-NEXT:    retq
;
; FALLBACK8-LABEL: shl_64bytes:
; FALLBACK8:       # %bb.0:
; FALLBACK8-NEXT:    pushq %r15
; FALLBACK8-NEXT:    pushq %r14
; FALLBACK8-NEXT:    pushq %r13
; FALLBACK8-NEXT:    pushq %r12
; FALLBACK8-NEXT:    pushq %rbx
; FALLBACK8-NEXT:    vmovups (%rdi), %ymm0
; FALLBACK8-NEXT:    vmovups 32(%rdi), %ymm1
; FALLBACK8-NEXT:    movl (%rsi), %ecx
; FALLBACK8-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; FALLBACK8-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
; FALLBACK8-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
; FALLBACK8-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
; FALLBACK8-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
; FALLBACK8-NEXT:    leal (,%rcx,8), %eax
; FALLBACK8-NEXT:    andl $56, %eax
; FALLBACK8-NEXT:    andl $56, %ecx
; FALLBACK8-NEXT:    negl %ecx
; FALLBACK8-NEXT:    movslq %ecx, %r9
; FALLBACK8-NEXT:    movq -24(%rsp,%r9), %rdi
; FALLBACK8-NEXT:    movq %rdi, %r10
; FALLBACK8-NEXT:    movl %eax, %ecx
; FALLBACK8-NEXT:    shlq %cl, %r10
; FALLBACK8-NEXT:    movl %eax, %esi
; FALLBACK8-NEXT:    notb %sil
; FALLBACK8-NEXT:    movq -32(%rsp,%r9), %r11
; FALLBACK8-NEXT:    movq %r11, %r8
; FALLBACK8-NEXT:    shrq %r8
; FALLBACK8-NEXT:    movl %esi, %ecx
; FALLBACK8-NEXT:    shrq %cl, %r8
; FALLBACK8-NEXT:    orq %r10, %r8
; FALLBACK8-NEXT:    movl %eax, %ecx
; FALLBACK8-NEXT:    shlq %cl, %r11
; FALLBACK8-NEXT:    movq -40(%rsp,%r9), %rbx
; FALLBACK8-NEXT:    movq %rbx, %r10
; FALLBACK8-NEXT:    shrq %r10
; FALLBACK8-NEXT:    movl %esi, %ecx
; FALLBACK8-NEXT:    shrq %cl, %r10
; FALLBACK8-NEXT:    orq %r11, %r10
; FALLBACK8-NEXT:    movl %eax, %ecx
; FALLBACK8-NEXT:    shlq %cl, %rbx
; FALLBACK8-NEXT:    movq -48(%rsp,%r9), %r15
; FALLBACK8-NEXT:    movq %r15, %r11
; FALLBACK8-NEXT:    shrq %r11
; FALLBACK8-NEXT:    movl %esi, %ecx
; FALLBACK8-NEXT:    shrq %cl, %r11
; FALLBACK8-NEXT:    orq %rbx, %r11
; FALLBACK8-NEXT:    movl %eax, %ecx
; FALLBACK8-NEXT:    shlq %cl, %r15
; FALLBACK8-NEXT:    movq -64(%rsp,%r9), %r14
; FALLBACK8-NEXT:    movq -56(%rsp,%r9), %r12
; FALLBACK8-NEXT:    movq %r12, %rbx
; FALLBACK8-NEXT:    shrq %rbx
; FALLBACK8-NEXT:    movl %esi, %ecx
; FALLBACK8-NEXT:    shrq %cl, %rbx
; FALLBACK8-NEXT:    orq %r15, %rbx
; FALLBACK8-NEXT:    movl %eax, %ecx
; FALLBACK8-NEXT:    shlq %cl, %r12
; FALLBACK8-NEXT:    movq %r14, %r15
; FALLBACK8-NEXT:    shrq %r15
; FALLBACK8-NEXT:    movl %esi, %ecx
; FALLBACK8-NEXT:    shrq %cl, %r15
; FALLBACK8-NEXT:    orq %r12, %r15
; FALLBACK8-NEXT:    movq -16(%rsp,%r9), %r12
; FALLBACK8-NEXT:    movq %r12, %r13
; FALLBACK8-NEXT:    movl %eax, %ecx
; FALLBACK8-NEXT:    shlq %cl, %r13
; FALLBACK8-NEXT:    shrq %rdi
; FALLBACK8-NEXT:    movl %esi, %ecx
; FALLBACK8-NEXT:    shrq %cl, %rdi
; FALLBACK8-NEXT:    orq %r13, %rdi
; FALLBACK8-NEXT:    movq -8(%rsp,%r9), %r9
; FALLBACK8-NEXT:    movl %eax, %ecx
; FALLBACK8-NEXT:    shlq %cl, %r9
; FALLBACK8-NEXT:    shrq %r12
; FALLBACK8-NEXT:    movl %esi, %ecx
; FALLBACK8-NEXT:    shrq %cl, %r12
; FALLBACK8-NEXT:    orq %r9, %r12
; FALLBACK8-NEXT:    movl %eax, %ecx
; FALLBACK8-NEXT:    shlq %cl, %r14
; FALLBACK8-NEXT:    movq %r14, (%rdx)
; FALLBACK8-NEXT:    movq %r12, 56(%rdx)
; FALLBACK8-NEXT:    movq %rdi, 48(%rdx)
; FALLBACK8-NEXT:    movq %r15, 8(%rdx)
; FALLBACK8-NEXT:    movq %rbx, 16(%rdx)
; FALLBACK8-NEXT:    movq %r11, 24(%rdx)
; FALLBACK8-NEXT:    movq %r10, 32(%rdx)
; FALLBACK8-NEXT:    movq %r8, 40(%rdx)
; FALLBACK8-NEXT:    popq %rbx
; FALLBACK8-NEXT:    popq %r12
; FALLBACK8-NEXT:    popq %r13
; FALLBACK8-NEXT:    popq %r14
; FALLBACK8-NEXT:    popq %r15
; FALLBACK8-NEXT:    vzeroupper
; FALLBACK8-NEXT:    retq
;
; FALLBACK9-LABEL: shl_64bytes:
; FALLBACK9:       # %bb.0:
; FALLBACK9-NEXT:    pushq %r15
; FALLBACK9-NEXT:    pushq %r14
; FALLBACK9-NEXT:    pushq %rbx
; FALLBACK9-NEXT:    vmovups (%rdi), %ymm0
; FALLBACK9-NEXT:    vmovups 32(%rdi), %ymm1
; FALLBACK9-NEXT:    movl (%rsi), %eax
; FALLBACK9-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; FALLBACK9-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
; FALLBACK9-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
; FALLBACK9-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
; FALLBACK9-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
; FALLBACK9-NEXT:    leal (,%rax,8), %ecx
; FALLBACK9-NEXT:    andl $56, %ecx
; FALLBACK9-NEXT:    andl $56, %eax
; FALLBACK9-NEXT:    negl %eax
; FALLBACK9-NEXT:    movslq %eax, %r8
; FALLBACK9-NEXT:    movq -32(%rsp,%r8), %rax
; FALLBACK9-NEXT:    movq -24(%rsp,%r8), %r9
; FALLBACK9-NEXT:    movq %r9, %rsi
; FALLBACK9-NEXT:    shldq %cl, %rax, %rsi
; FALLBACK9-NEXT:    movq -40(%rsp,%r8), %rdi
; FALLBACK9-NEXT:    shldq %cl, %rdi, %rax
; FALLBACK9-NEXT:    movq -48(%rsp,%r8), %r10
; FALLBACK9-NEXT:    shldq %cl, %r10, %rdi
; FALLBACK9-NEXT:    movq -64(%rsp,%r8), %r11
; FALLBACK9-NEXT:    movq -56(%rsp,%r8), %rbx
; FALLBACK9-NEXT:    shldq %cl, %rbx, %r10
; FALLBACK9-NEXT:    movq -16(%rsp,%r8), %r14
; FALLBACK9-NEXT:    movq %r14, %r15
; FALLBACK9-NEXT:    shldq %cl, %r9, %r15
; FALLBACK9-NEXT:    movq -8(%rsp,%r8), %r8
; FALLBACK9-NEXT:    shldq %cl, %r14, %r8
; FALLBACK9-NEXT:    movq %r11, %r9
; FALLBACK9-NEXT:    shlq %cl, %r9
; FALLBACK9-NEXT:    # kill: def $cl killed $cl killed $ecx
; FALLBACK9-NEXT:    shldq %cl, %r11, %rbx
; FALLBACK9-NEXT:    movq %r8, 56(%rdx)
; FALLBACK9-NEXT:    movq %r15, 48(%rdx)
; FALLBACK9-NEXT:    movq %rbx, 8(%rdx)
; FALLBACK9-NEXT:    movq %r10, 16(%rdx)
; FALLBACK9-NEXT:    movq %rdi, 24(%rdx)
; FALLBACK9-NEXT:    movq %rax, 32(%rdx)
; FALLBACK9-NEXT:    movq %rsi, 40(%rdx)
; FALLBACK9-NEXT:    movq %r9, (%rdx)
; FALLBACK9-NEXT:    popq %rbx
; FALLBACK9-NEXT:    popq %r14
; FALLBACK9-NEXT:    popq %r15
; FALLBACK9-NEXT:    vzeroupper
; FALLBACK9-NEXT:    retq
;
; FALLBACK10-LABEL: shl_64bytes:
; FALLBACK10:       # %bb.0:
; FALLBACK10-NEXT:    pushq %rbp
; FALLBACK10-NEXT:    pushq %r15
; FALLBACK10-NEXT:    pushq %r14
; FALLBACK10-NEXT:    pushq %r13
; FALLBACK10-NEXT:    pushq %r12
; FALLBACK10-NEXT:    pushq %rbx
; FALLBACK10-NEXT:    subq $24, %rsp
; FALLBACK10-NEXT:    vmovups (%rdi), %ymm0
; FALLBACK10-NEXT:    vmovups 32(%rdi), %ymm1
; FALLBACK10-NEXT:    movl (%rsi), %eax
; FALLBACK10-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; FALLBACK10-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
; FALLBACK10-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
; FALLBACK10-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
; FALLBACK10-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
; FALLBACK10-NEXT:    leal (,%rax,8), %ecx
; FALLBACK10-NEXT:    andl $56, %ecx
; FALLBACK10-NEXT:    andl $56, %eax
; FALLBACK10-NEXT:    negl %eax
; FALLBACK10-NEXT:    movslq %eax, %rsi
; FALLBACK10-NEXT:    movq -8(%rsp,%rsi), %rax
; FALLBACK10-NEXT:    shlxq %rcx, %rax, %r12
; FALLBACK10-NEXT:    movq -16(%rsp,%rsi), %rdi
; FALLBACK10-NEXT:    shlxq %rcx, %rdi, %r15
; FALLBACK10-NEXT:    movq -24(%rsp,%rsi), %r13
; FALLBACK10-NEXT:    shlxq %rcx, %r13, %r8
; FALLBACK10-NEXT:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; FALLBACK10-NEXT:    movq -32(%rsp,%rsi), %r11
; FALLBACK10-NEXT:    shlxq %rcx, %r11, %r10
; FALLBACK10-NEXT:    movq -40(%rsp,%rsi), %r14
; FALLBACK10-NEXT:    shlxq %rcx, %r14, %rbx
; FALLBACK10-NEXT:    movl %ecx, %r9d
; FALLBACK10-NEXT:    notb %r9b
; FALLBACK10-NEXT:    shrq %rdi
; FALLBACK10-NEXT:    shrxq %r9, %rdi, %rdi
; FALLBACK10-NEXT:    orq %r12, %rdi
; FALLBACK10-NEXT:    movq (%rsp,%rsi), %rbp
; FALLBACK10-NEXT:    shlxq %rcx, %rbp, %r8
; FALLBACK10-NEXT:    shrq %r13
; FALLBACK10-NEXT:    shrxq %r9, %r13, %r12
; FALLBACK10-NEXT:    orq %r15, %r12
; FALLBACK10-NEXT:    shlxq %rcx, 8(%rsp,%rsi), %r15
; FALLBACK10-NEXT:    movq -48(%rsp,%rsi), %rsi
; FALLBACK10-NEXT:    shlxq %rcx, %rsi, %rcx
; FALLBACK10-NEXT:    shrq %r11
; FALLBACK10-NEXT:    shrxq %r9, %r11, %r11
; FALLBACK10-NEXT:    orq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload
; FALLBACK10-NEXT:    shrq %r14
; FALLBACK10-NEXT:    shrxq %r9, %r14, %r14
; FALLBACK10-NEXT:    orq %r10, %r14
; FALLBACK10-NEXT:    shrq %rsi
; FALLBACK10-NEXT:    shrxq %r9, %rsi, %rsi
; FALLBACK10-NEXT:    orq %rbx, %rsi
; FALLBACK10-NEXT:    shrq %rax
; FALLBACK10-NEXT:    shrxq %r9, %rax, %rax
; FALLBACK10-NEXT:    orq %r8, %rax
; FALLBACK10-NEXT:    shrq %rbp
; FALLBACK10-NEXT:    shrxq %r9, %rbp, %r8
; FALLBACK10-NEXT:    orq %r15, %r8
; FALLBACK10-NEXT:    movq %rcx, (%rdx)
; FALLBACK10-NEXT:    movq %r8, 56(%rdx)
; FALLBACK10-NEXT:    movq %rax, 48(%rdx)
; FALLBACK10-NEXT:    movq %rsi, 8(%rdx)
; FALLBACK10-NEXT:    movq %r14, 16(%rdx)
; FALLBACK10-NEXT:    movq %r11, 24(%rdx)
; FALLBACK10-NEXT:    movq %r12, 32(%rdx)
; FALLBACK10-NEXT:    movq %rdi, 40(%rdx)
; FALLBACK10-NEXT:    addq $24, %rsp
; FALLBACK10-NEXT:    popq %rbx
; FALLBACK10-NEXT:    popq %r12
; FALLBACK10-NEXT:    popq %r13
; FALLBACK10-NEXT:    popq %r14
; FALLBACK10-NEXT:    popq %r15
; FALLBACK10-NEXT:    popq %rbp
; FALLBACK10-NEXT:    vzeroupper
; FALLBACK10-NEXT:    retq
;
; FALLBACK11-LABEL: shl_64bytes:
; FALLBACK11:       # %bb.0:
; FALLBACK11-NEXT:    pushq %r15
; FALLBACK11-NEXT:    pushq %r14
; FALLBACK11-NEXT:    pushq %rbx
; FALLBACK11-NEXT:    vmovups (%rdi), %ymm0
; FALLBACK11-NEXT:    vmovups 32(%rdi), %ymm1
; FALLBACK11-NEXT:    movl (%rsi), %eax
; FALLBACK11-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; FALLBACK11-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
; FALLBACK11-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
; FALLBACK11-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
; FALLBACK11-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
; FALLBACK11-NEXT:    leal (,%rax,8), %ecx
; FALLBACK11-NEXT:    andl $56, %ecx
; FALLBACK11-NEXT:    andl $56, %eax
; FALLBACK11-NEXT:    negl %eax
; FALLBACK11-NEXT:    movslq %eax, %r8
; FALLBACK11-NEXT:    movq -32(%rsp,%r8), %rax
; FALLBACK11-NEXT:    movq -24(%rsp,%r8), %r9
; FALLBACK11-NEXT:    movq %r9, %rsi
; FALLBACK11-NEXT:    shldq %cl, %rax, %rsi
; FALLBACK11-NEXT:    movq -40(%rsp,%r8), %rdi
; FALLBACK11-NEXT:    shldq %cl, %rdi, %rax
; FALLBACK11-NEXT:    movq -48(%rsp,%r8), %r10
; FALLBACK11-NEXT:    shldq %cl, %r10, %rdi
; FALLBACK11-NEXT:    movq -64(%rsp,%r8), %r11
; FALLBACK11-NEXT:    movq -56(%rsp,%r8), %rbx
; FALLBACK11-NEXT:    shldq %cl, %rbx, %r10
; FALLBACK11-NEXT:    movq -16(%rsp,%r8), %r14
; FALLBACK11-NEXT:    movq %r14, %r15
; FALLBACK11-NEXT:    shldq %cl, %r9, %r15
; FALLBACK11-NEXT:    movq -8(%rsp,%r8), %r8
; FALLBACK11-NEXT:    shldq %cl, %r14, %r8
; FALLBACK11-NEXT:    shlxq %rcx, %r11, %r9
; FALLBACK11-NEXT:    # kill: def $cl killed $cl killed $rcx
; FALLBACK11-NEXT:    shldq %cl, %r11, %rbx
; FALLBACK11-NEXT:    movq %r8, 56(%rdx)
; FALLBACK11-NEXT:    movq %r15, 48(%rdx)
; FALLBACK11-NEXT:    movq %rbx, 8(%rdx)
; FALLBACK11-NEXT:    movq %r10, 16(%rdx)
; FALLBACK11-NEXT:    movq %rdi, 24(%rdx)
; FALLBACK11-NEXT:    movq %rax, 32(%rdx)
; FALLBACK11-NEXT:    movq %rsi, 40(%rdx)
; FALLBACK11-NEXT:    movq %r9, (%rdx)
; FALLBACK11-NEXT:    popq %rbx
; FALLBACK11-NEXT:    popq %r14
; FALLBACK11-NEXT:    popq %r15
; FALLBACK11-NEXT:    vzeroupper
; FALLBACK11-NEXT:    retq
;
; FALLBACK12-LABEL: shl_64bytes:
; FALLBACK12:       # %bb.0:
; FALLBACK12-NEXT:    pushq %r15
; FALLBACK12-NEXT:    pushq %r14
; FALLBACK12-NEXT:    pushq %r13
; FALLBACK12-NEXT:    pushq %r12
; FALLBACK12-NEXT:    pushq %rbx
; FALLBACK12-NEXT:    vmovups (%rdi), %zmm0
; FALLBACK12-NEXT:    movl (%rsi), %ecx
; FALLBACK12-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; FALLBACK12-NEXT:    vmovups %zmm1, -{{[0-9]+}}(%rsp)
; FALLBACK12-NEXT:    vmovups %zmm0, -{{[0-9]+}}(%rsp)
; FALLBACK12-NEXT:    leal (,%rcx,8), %eax
; FALLBACK12-NEXT:    andl $56, %eax
; FALLBACK12-NEXT:    andl $56, %ecx
; FALLBACK12-NEXT:    negl %ecx
; FALLBACK12-NEXT:    movslq %ecx, %r9
; FALLBACK12-NEXT:    movq -24(%rsp,%r9), %rdi
; FALLBACK12-NEXT:    movq %rdi, %r10
; FALLBACK12-NEXT:    movl %eax, %ecx
; FALLBACK12-NEXT:    shlq %cl, %r10
; FALLBACK12-NEXT:    movl %eax, %esi
; FALLBACK12-NEXT:    notb %sil
; FALLBACK12-NEXT:    movq -32(%rsp,%r9), %r11
; FALLBACK12-NEXT:    movq %r11, %r8
; FALLBACK12-NEXT:    shrq %r8
; FALLBACK12-NEXT:    movl %esi, %ecx
; FALLBACK12-NEXT:    shrq %cl, %r8
; FALLBACK12-NEXT:    orq %r10, %r8
; FALLBACK12-NEXT:    movl %eax, %ecx
; FALLBACK12-NEXT:    shlq %cl, %r11
; FALLBACK12-NEXT:    movq -40(%rsp,%r9), %rbx
; FALLBACK12-NEXT:    movq %rbx, %r10
; FALLBACK12-NEXT:    shrq %r10
; FALLBACK12-NEXT:    movl %esi, %ecx
; FALLBACK12-NEXT:    shrq %cl, %r10
; FALLBACK12-NEXT:    orq %r11, %r10
; FALLBACK12-NEXT:    movl %eax, %ecx
; FALLBACK12-NEXT:    shlq %cl, %rbx
; FALLBACK12-NEXT:    movq -48(%rsp,%r9), %r15
; FALLBACK12-NEXT:    movq %r15, %r11
; FALLBACK12-NEXT:    shrq %r11
; FALLBACK12-NEXT:    movl %esi, %ecx
; FALLBACK12-NEXT:    shrq %cl, %r11
; FALLBACK12-NEXT:    orq %rbx, %r11
; FALLBACK12-NEXT:    movl %eax, %ecx
; FALLBACK12-NEXT:    shlq %cl, %r15
; FALLBACK12-NEXT:    movq -64(%rsp,%r9), %r14
; FALLBACK12-NEXT:    movq -56(%rsp,%r9), %r12
; FALLBACK12-NEXT:    movq %r12, %rbx
; FALLBACK12-NEXT:    shrq %rbx
; FALLBACK12-NEXT:    movl %esi, %ecx
; FALLBACK12-NEXT:    shrq %cl, %rbx
; FALLBACK12-NEXT:    orq %r15, %rbx
; FALLBACK12-NEXT:    movl %eax, %ecx
; FALLBACK12-NEXT:    shlq %cl, %r12
; FALLBACK12-NEXT:    movq %r14, %r15
; FALLBACK12-NEXT:    shrq %r15
; FALLBACK12-NEXT:    movl %esi, %ecx
; FALLBACK12-NEXT:    shrq %cl, %r15
; FALLBACK12-NEXT:    orq %r12, %r15
; FALLBACK12-NEXT:    movq -16(%rsp,%r9), %r12
; FALLBACK12-NEXT:    movq %r12, %r13
; FALLBACK12-NEXT:    movl %eax, %ecx
; FALLBACK12-NEXT:    shlq %cl, %r13
; FALLBACK12-NEXT:    shrq %rdi
; FALLBACK12-NEXT:    movl %esi, %ecx
; FALLBACK12-NEXT:    shrq %cl, %rdi
; FALLBACK12-NEXT:    orq %r13, %rdi
; FALLBACK12-NEXT:    movq -8(%rsp,%r9), %r9
; FALLBACK12-NEXT:    movl %eax, %ecx
; FALLBACK12-NEXT:    shlq %cl, %r9
; FALLBACK12-NEXT:    shrq %r12
; FALLBACK12-NEXT:    movl %esi, %ecx
; FALLBACK12-NEXT:    shrq %cl, %r12
; FALLBACK12-NEXT:    orq %r9, %r12
; FALLBACK12-NEXT:    movl %eax, %ecx
; FALLBACK12-NEXT:    shlq %cl, %r14
; FALLBACK12-NEXT:    movq %r14, (%rdx)
; FALLBACK12-NEXT:    movq %r12, 56(%rdx)
; FALLBACK12-NEXT:    movq %rdi, 48(%rdx)
; FALLBACK12-NEXT:    movq %r15, 8(%rdx)
; FALLBACK12-NEXT:    movq %rbx, 16(%rdx)
; FALLBACK12-NEXT:    movq %r11, 24(%rdx)
; FALLBACK12-NEXT:    movq %r10, 32(%rdx)
; FALLBACK12-NEXT:    movq %r8, 40(%rdx)
; FALLBACK12-NEXT:    popq %rbx
; FALLBACK12-NEXT:    popq %r12
; FALLBACK12-NEXT:    popq %r13
; FALLBACK12-NEXT:    popq %r14
; FALLBACK12-NEXT:    popq %r15
; FALLBACK12-NEXT:    vzeroupper
; FALLBACK12-NEXT:    retq
;
; FALLBACK13-LABEL: shl_64bytes:
; FALLBACK13:       # %bb.0:
; FALLBACK13-NEXT:    pushq %r15
; FALLBACK13-NEXT:    pushq %r14
; FALLBACK13-NEXT:    pushq %rbx
; FALLBACK13-NEXT:    vmovups (%rdi), %zmm0
; FALLBACK13-NEXT:    movl (%rsi), %eax
; FALLBACK13-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; FALLBACK13-NEXT:    vmovups %zmm1, -{{[0-9]+}}(%rsp)
; FALLBACK13-NEXT:    vmovups %zmm0, -{{[0-9]+}}(%rsp)
; FALLBACK13-NEXT:    leal (,%rax,8), %ecx
; FALLBACK13-NEXT:    andl $56, %ecx
; FALLBACK13-NEXT:    andl $56, %eax
; FALLBACK13-NEXT:    negl %eax
; FALLBACK13-NEXT:    movslq %eax, %r8
; FALLBACK13-NEXT:    movq -32(%rsp,%r8), %rax
; FALLBACK13-NEXT:    movq -24(%rsp,%r8), %r9
; FALLBACK13-NEXT:    movq %r9, %rsi
; FALLBACK13-NEXT:    shldq %cl, %rax, %rsi
; FALLBACK13-NEXT:    movq -40(%rsp,%r8), %rdi
; FALLBACK13-NEXT:    shldq %cl, %rdi, %rax
; FALLBACK13-NEXT:    movq -48(%rsp,%r8), %r10
; FALLBACK13-NEXT:    shldq %cl, %r10, %rdi
; FALLBACK13-NEXT:    movq -64(%rsp,%r8), %r11
; FALLBACK13-NEXT:    movq -56(%rsp,%r8), %rbx
; FALLBACK13-NEXT:    shldq %cl, %rbx, %r10
; FALLBACK13-NEXT:    movq -16(%rsp,%r8), %r14
; FALLBACK13-NEXT:    movq %r14, %r15
; FALLBACK13-NEXT:    shldq %cl, %r9, %r15
; FALLBACK13-NEXT:    movq -8(%rsp,%r8), %r8
; FALLBACK13-NEXT:    shldq %cl, %r14, %r8
; FALLBACK13-NEXT:    movq %r11, %r9
; FALLBACK13-NEXT:    shlq %cl, %r9
; FALLBACK13-NEXT:    # kill: def $cl killed $cl killed $ecx
; FALLBACK13-NEXT:    shldq %cl, %r11, %rbx
; FALLBACK13-NEXT:    movq %r8, 56(%rdx)
; FALLBACK13-NEXT:    movq %r15, 48(%rdx)
; FALLBACK13-NEXT:    movq %rbx, 8(%rdx)
; FALLBACK13-NEXT:    movq %r10, 16(%rdx)
; FALLBACK13-NEXT:    movq %rdi, 24(%rdx)
; FALLBACK13-NEXT:    movq %rax, 32(%rdx)
; FALLBACK13-NEXT:    movq %rsi, 40(%rdx)
; FALLBACK13-NEXT:    movq %r9, (%rdx)
; FALLBACK13-NEXT:    popq %rbx
; FALLBACK13-NEXT:    popq %r14
; FALLBACK13-NEXT:    popq %r15
; FALLBACK13-NEXT:    vzeroupper
; FALLBACK13-NEXT:    retq
;
; FALLBACK14-LABEL: shl_64bytes:
; FALLBACK14:       # %bb.0:
; FALLBACK14-NEXT:    pushq %rbp
; FALLBACK14-NEXT:    pushq %r15
; FALLBACK14-NEXT:    pushq %r14
; FALLBACK14-NEXT:    pushq %r13
; FALLBACK14-NEXT:    pushq %r12
; FALLBACK14-NEXT:    pushq %rbx
; FALLBACK14-NEXT:    subq $24, %rsp
; FALLBACK14-NEXT:    vmovups (%rdi), %zmm0
; FALLBACK14-NEXT:    movl (%rsi), %eax
; FALLBACK14-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; FALLBACK14-NEXT:    vmovups %zmm1, -{{[0-9]+}}(%rsp)
; FALLBACK14-NEXT:    vmovups %zmm0, -{{[0-9]+}}(%rsp)
; FALLBACK14-NEXT:    leal (,%rax,8), %ecx
; FALLBACK14-NEXT:    andl $56, %ecx
; FALLBACK14-NEXT:    andl $56, %eax
; FALLBACK14-NEXT:    negl %eax
; FALLBACK14-NEXT:    movslq %eax, %rsi
; FALLBACK14-NEXT:    movq -8(%rsp,%rsi), %rax
; FALLBACK14-NEXT:    shlxq %rcx, %rax, %r12
; FALLBACK14-NEXT:    movq -16(%rsp,%rsi), %rdi
; FALLBACK14-NEXT:    shlxq %rcx, %rdi, %r15
; FALLBACK14-NEXT:    movq -24(%rsp,%rsi), %r13
; FALLBACK14-NEXT:    shlxq %rcx, %r13, %r8
; FALLBACK14-NEXT:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; FALLBACK14-NEXT:    movq -32(%rsp,%rsi), %r11
; FALLBACK14-NEXT:    shlxq %rcx, %r11, %r10
; FALLBACK14-NEXT:    movq -40(%rsp,%rsi), %r14
; FALLBACK14-NEXT:    shlxq %rcx, %r14, %rbx
; FALLBACK14-NEXT:    movl %ecx, %r9d
; FALLBACK14-NEXT:    notb %r9b
; FALLBACK14-NEXT:    shrq %rdi
; FALLBACK14-NEXT:    shrxq %r9, %rdi, %rdi
; FALLBACK14-NEXT:    orq %r12, %rdi
; FALLBACK14-NEXT:    movq (%rsp,%rsi), %rbp
; FALLBACK14-NEXT:    shlxq %rcx, %rbp, %r8
; FALLBACK14-NEXT:    shrq %r13
; FALLBACK14-NEXT:    shrxq %r9, %r13, %r12
; FALLBACK14-NEXT:    orq %r15, %r12
; FALLBACK14-NEXT:    shlxq %rcx, 8(%rsp,%rsi), %r15
; FALLBACK14-NEXT:    movq -48(%rsp,%rsi), %rsi
; FALLBACK14-NEXT:    shlxq %rcx, %rsi, %rcx
; FALLBACK14-NEXT:    shrq %r11
; FALLBACK14-NEXT:    shrxq %r9, %r11, %r11
; FALLBACK14-NEXT:    orq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload
; FALLBACK14-NEXT:    shrq %r14
; FALLBACK14-NEXT:    shrxq %r9, %r14, %r14
; FALLBACK14-NEXT:    orq %r10, %r14
; FALLBACK14-NEXT:    shrq %rsi
; FALLBACK14-NEXT:    shrxq %r9, %rsi, %rsi
; FALLBACK14-NEXT:    orq %rbx, %rsi
; FALLBACK14-NEXT:    shrq %rax
; FALLBACK14-NEXT:    shrxq %r9, %rax, %rax
; FALLBACK14-NEXT:    orq %r8, %rax
; FALLBACK14-NEXT:    shrq %rbp
; FALLBACK14-NEXT:    shrxq %r9, %rbp, %r8
; FALLBACK14-NEXT:    orq %r15, %r8
; FALLBACK14-NEXT:    movq %rcx, (%rdx)
; FALLBACK14-NEXT:    movq %r8, 56(%rdx)
; FALLBACK14-NEXT:    movq %rax, 48(%rdx)
; FALLBACK14-NEXT:    movq %rsi, 8(%rdx)
; FALLBACK14-NEXT:    movq %r14, 16(%rdx)
; FALLBACK14-NEXT:    movq %r11, 24(%rdx)
; FALLBACK14-NEXT:    movq %r12, 32(%rdx)
; FALLBACK14-NEXT:    movq %rdi, 40(%rdx)
; FALLBACK14-NEXT:    addq $24, %rsp
; FALLBACK14-NEXT:    popq %rbx
; FALLBACK14-NEXT:    popq %r12
; FALLBACK14-NEXT:    popq %r13
; FALLBACK14-NEXT:    popq %r14
; FALLBACK14-NEXT:    popq %r15
; FALLBACK14-NEXT:    popq %rbp
; FALLBACK14-NEXT:    vzeroupper
; FALLBACK14-NEXT:    retq
;
; FALLBACK15-LABEL: shl_64bytes:
; FALLBACK15:       # %bb.0:
; FALLBACK15-NEXT:    pushq %r15
; FALLBACK15-NEXT:    pushq %r14
; FALLBACK15-NEXT:    pushq %rbx
; FALLBACK15-NEXT:    vmovups (%rdi), %zmm0
; FALLBACK15-NEXT:    movl (%rsi), %eax
; FALLBACK15-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; FALLBACK15-NEXT:    vmovups %zmm1, -{{[0-9]+}}(%rsp)
; FALLBACK15-NEXT:    vmovups %zmm0, -{{[0-9]+}}(%rsp)
; FALLBACK15-NEXT:    leal (,%rax,8), %ecx
; FALLBACK15-NEXT:    andl $56, %ecx
; FALLBACK15-NEXT:    andl $56, %eax
; FALLBACK15-NEXT:    negl %eax
; FALLBACK15-NEXT:    movslq %eax, %r8
; FALLBACK15-NEXT:    movq -32(%rsp,%r8), %rax
; FALLBACK15-NEXT:    movq -24(%rsp,%r8), %r9
; FALLBACK15-NEXT:    movq %r9, %rsi
; FALLBACK15-NEXT:    shldq %cl, %rax, %rsi
; FALLBACK15-NEXT:    movq -40(%rsp,%r8), %rdi
; FALLBACK15-NEXT:    shldq %cl, %rdi, %rax
; FALLBACK15-NEXT:    movq -48(%rsp,%r8), %r10
; FALLBACK15-NEXT:    shldq %cl, %r10, %rdi
; FALLBACK15-NEXT:    movq -64(%rsp,%r8), %r11
; FALLBACK15-NEXT:    movq -56(%rsp,%r8), %rbx
; FALLBACK15-NEXT:    shldq %cl, %rbx, %r10
; FALLBACK15-NEXT:    movq -16(%rsp,%r8), %r14
; FALLBACK15-NEXT:    movq %r14, %r15
; FALLBACK15-NEXT:    shldq %cl, %r9, %r15
; FALLBACK15-NEXT:    movq -8(%rsp,%r8), %r8
; FALLBACK15-NEXT:    shldq %cl, %r14, %r8
; FALLBACK15-NEXT:    shlxq %rcx, %r11, %r9
; FALLBACK15-NEXT:    # kill: def $cl killed $cl killed $rcx
; FALLBACK15-NEXT:    shldq %cl, %r11, %rbx
; FALLBACK15-NEXT:    movq %r8, 56(%rdx)
; FALLBACK15-NEXT:    movq %r15, 48(%rdx)
; FALLBACK15-NEXT:    movq %rbx, 8(%rdx)
; FALLBACK15-NEXT:    movq %r10, 16(%rdx)
; FALLBACK15-NEXT:    movq %rdi, 24(%rdx)
; FALLBACK15-NEXT:    movq %rax, 32(%rdx)
; FALLBACK15-NEXT:    movq %rsi, 40(%rdx)
; FALLBACK15-NEXT:    movq %r9, (%rdx)
; FALLBACK15-NEXT:    popq %rbx
; FALLBACK15-NEXT:    popq %r14
; FALLBACK15-NEXT:    popq %r15
; FALLBACK15-NEXT:    vzeroupper
; FALLBACK15-NEXT:    retq
;
; FALLBACK16-LABEL: shl_64bytes:
; FALLBACK16:       # %bb.0:
; FALLBACK16-NEXT:    pushl %ebp
; FALLBACK16-NEXT:    pushl %ebx
; FALLBACK16-NEXT:    pushl %edi
; FALLBACK16-NEXT:    pushl %esi
; FALLBACK16-NEXT:    subl $204, %esp
; FALLBACK16-NEXT:    movl {{[0-9]+}}(%esp), %eax
; FALLBACK16-NEXT:    movl (%eax), %ecx
; FALLBACK16-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK16-NEXT:    movl 4(%eax), %ecx
; FALLBACK16-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK16-NEXT:    movl 8(%eax), %ecx
; FALLBACK16-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK16-NEXT:    movl 12(%eax), %ecx
; FALLBACK16-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK16-NEXT:    movl 16(%eax), %ecx
; FALLBACK16-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK16-NEXT:    movl 20(%eax), %ecx
; FALLBACK16-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK16-NEXT:    movl 24(%eax), %ecx
; FALLBACK16-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK16-NEXT:    movl 28(%eax), %ecx
; FALLBACK16-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK16-NEXT:    movl 32(%eax), %ecx
; FALLBACK16-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK16-NEXT:    movl 36(%eax), %ecx
; FALLBACK16-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK16-NEXT:    movl 40(%eax), %ebp
; FALLBACK16-NEXT:    movl 44(%eax), %ebx
; FALLBACK16-NEXT:    movl 48(%eax), %edi
; FALLBACK16-NEXT:    movl 52(%eax), %esi
; FALLBACK16-NEXT:    movl 56(%eax), %edx
; FALLBACK16-NEXT:    movl 60(%eax), %ecx
; FALLBACK16-NEXT:    movl {{[0-9]+}}(%esp), %eax
; FALLBACK16-NEXT:    movl (%eax), %eax
; FALLBACK16-NEXT:    xorps %xmm0, %xmm0
; FALLBACK16-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK16-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK16-NEXT:    movl %esi, {{[0-9]+}}(%esp)
; FALLBACK16-NEXT:    movl %edi, {{[0-9]+}}(%esp)
; FALLBACK16-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
; FALLBACK16-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK16-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
; FALLBACK16-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
; FALLBACK16-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK16-NEXT:    movl %eax, %edx
; FALLBACK16-NEXT:    andl $60, %edx
; FALLBACK16-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK16-NEXT:    leal {{[0-9]+}}(%esp), %ecx
; FALLBACK16-NEXT:    subl %edx, %ecx
; FALLBACK16-NEXT:    movl (%ecx), %edi
; FALLBACK16-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK16-NEXT:    movl 4(%ecx), %edx
; FALLBACK16-NEXT:    movl %ecx, %ebp
; FALLBACK16-NEXT:    shll $3, %eax
; FALLBACK16-NEXT:    andl $24, %eax
; FALLBACK16-NEXT:    movl %edx, %esi
; FALLBACK16-NEXT:    movl %eax, %ecx
; FALLBACK16-NEXT:    shll %cl, %esi
; FALLBACK16-NEXT:    shrl %edi
; FALLBACK16-NEXT:    movb %al, %ch
; FALLBACK16-NEXT:    notb %ch
; FALLBACK16-NEXT:    movb %ch, %cl
; FALLBACK16-NEXT:    shrl %cl, %edi
; FALLBACK16-NEXT:    orl %esi, %edi
; FALLBACK16-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK16-NEXT:    movl 12(%ebp), %ebx
; FALLBACK16-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK16-NEXT:    movb %al, %cl
; FALLBACK16-NEXT:    shll %cl, %ebx
; FALLBACK16-NEXT:    movl 8(%ebp), %esi
; FALLBACK16-NEXT:    movl %ebp, %edi
; FALLBACK16-NEXT:    movl %esi, %ebp
; FALLBACK16-NEXT:    shrl %ebp
; FALLBACK16-NEXT:    movb %ch, %cl
; FALLBACK16-NEXT:    shrl %cl, %ebp
; FALLBACK16-NEXT:    orl %ebx, %ebp
; FALLBACK16-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK16-NEXT:    movb %al, %cl
; FALLBACK16-NEXT:    shll %cl, %esi
; FALLBACK16-NEXT:    shrl %edx
; FALLBACK16-NEXT:    movb %ch, %cl
; FALLBACK16-NEXT:    shrl %cl, %edx
; FALLBACK16-NEXT:    orl %esi, %edx
; FALLBACK16-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK16-NEXT:    movl %edi, %ebp
; FALLBACK16-NEXT:    movl 20(%edi), %ebx
; FALLBACK16-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK16-NEXT:    movb %al, %cl
; FALLBACK16-NEXT:    shll %cl, %ebx
; FALLBACK16-NEXT:    movl 16(%edi), %esi
; FALLBACK16-NEXT:    movl %esi, %edx
; FALLBACK16-NEXT:    shrl %edx
; FALLBACK16-NEXT:    movb %ch, %cl
; FALLBACK16-NEXT:    shrl %cl, %edx
; FALLBACK16-NEXT:    orl %ebx, %edx
; FALLBACK16-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK16-NEXT:    movb %al, %cl
; FALLBACK16-NEXT:    shll %cl, %esi
; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
; FALLBACK16-NEXT:    shrl %edi
; FALLBACK16-NEXT:    movb %ch, %cl
; FALLBACK16-NEXT:    shrl %cl, %edi
; FALLBACK16-NEXT:    orl %esi, %edi
; FALLBACK16-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK16-NEXT:    movl %ebp, %edx
; FALLBACK16-NEXT:    movl 28(%ebp), %ebx
; FALLBACK16-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK16-NEXT:    movb %al, %cl
; FALLBACK16-NEXT:    shll %cl, %ebx
; FALLBACK16-NEXT:    movl 24(%ebp), %esi
; FALLBACK16-NEXT:    movl %esi, %edi
; FALLBACK16-NEXT:    shrl %edi
; FALLBACK16-NEXT:    movb %ch, %cl
; FALLBACK16-NEXT:    shrl %cl, %edi
; FALLBACK16-NEXT:    orl %ebx, %edi
; FALLBACK16-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK16-NEXT:    movb %al, %cl
; FALLBACK16-NEXT:    shll %cl, %esi
; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
; FALLBACK16-NEXT:    shrl %ebp
; FALLBACK16-NEXT:    movb %ch, %cl
; FALLBACK16-NEXT:    shrl %cl, %ebp
; FALLBACK16-NEXT:    orl %esi, %ebp
; FALLBACK16-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK16-NEXT:    movl 36(%edx), %ebx
; FALLBACK16-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK16-NEXT:    movb %al, %cl
; FALLBACK16-NEXT:    shll %cl, %ebx
; FALLBACK16-NEXT:    movl 32(%edx), %esi
; FALLBACK16-NEXT:    movl %edx, %ebp
; FALLBACK16-NEXT:    movl %esi, %edi
; FALLBACK16-NEXT:    shrl %edi
; FALLBACK16-NEXT:    movb %ch, %cl
; FALLBACK16-NEXT:    shrl %cl, %edi
; FALLBACK16-NEXT:    orl %ebx, %edi
; FALLBACK16-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK16-NEXT:    movb %al, %cl
; FALLBACK16-NEXT:    shll %cl, %esi
; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; FALLBACK16-NEXT:    shrl %edx
; FALLBACK16-NEXT:    movb %ch, %cl
; FALLBACK16-NEXT:    shrl %cl, %edx
; FALLBACK16-NEXT:    orl %esi, %edx
; FALLBACK16-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK16-NEXT:    movl 44(%ebp), %ebx
; FALLBACK16-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK16-NEXT:    movb %al, %cl
; FALLBACK16-NEXT:    shll %cl, %ebx
; FALLBACK16-NEXT:    movl 40(%ebp), %esi
; FALLBACK16-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK16-NEXT:    movl %esi, %edx
; FALLBACK16-NEXT:    shrl %edx
; FALLBACK16-NEXT:    movb %ch, %cl
; FALLBACK16-NEXT:    shrl %cl, %edx
; FALLBACK16-NEXT:    orl %ebx, %edx
; FALLBACK16-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK16-NEXT:    movb %al, %cl
; FALLBACK16-NEXT:    shll %cl, %esi
; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; FALLBACK16-NEXT:    shrl %edx
; FALLBACK16-NEXT:    movb %ch, %cl
; FALLBACK16-NEXT:    shrl %cl, %edx
; FALLBACK16-NEXT:    orl %esi, %edx
; FALLBACK16-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK16-NEXT:    movl 52(%ebp), %esi
; FALLBACK16-NEXT:    movl %esi, %edi
; FALLBACK16-NEXT:    movb %al, %cl
; FALLBACK16-NEXT:    shll %cl, %edi
; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; FALLBACK16-NEXT:    negl %edx
; FALLBACK16-NEXT:    movl 176(%esp,%edx), %ebx
; FALLBACK16-NEXT:    movl %ebx, %ebp
; FALLBACK16-NEXT:    shrl %ebp
; FALLBACK16-NEXT:    movb %ch, %cl
; FALLBACK16-NEXT:    shrl %cl, %ebp
; FALLBACK16-NEXT:    orl %edi, %ebp
; FALLBACK16-NEXT:    movb %al, %cl
; FALLBACK16-NEXT:    shll %cl, %ebx
; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; FALLBACK16-NEXT:    shrl %edx
; FALLBACK16-NEXT:    movb %ch, %cl
; FALLBACK16-NEXT:    shrl %cl, %edx
; FALLBACK16-NEXT:    orl %ebx, %edx
; FALLBACK16-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
; FALLBACK16-NEXT:    movl 60(%edi), %edx
; FALLBACK16-NEXT:    movb %al, %cl
; FALLBACK16-NEXT:    shll %cl, %edx
; FALLBACK16-NEXT:    movl 56(%edi), %ebx
; FALLBACK16-NEXT:    movl %ebx, %edi
; FALLBACK16-NEXT:    shrl %edi
; FALLBACK16-NEXT:    movb %ch, %cl
; FALLBACK16-NEXT:    shrl %cl, %edi
; FALLBACK16-NEXT:    orl %edx, %edi
; FALLBACK16-NEXT:    movb %al, %cl
; FALLBACK16-NEXT:    shll %cl, %ebx
; FALLBACK16-NEXT:    shrl %esi
; FALLBACK16-NEXT:    movb %ch, %cl
; FALLBACK16-NEXT:    shrl %cl, %esi
; FALLBACK16-NEXT:    orl %ebx, %esi
; FALLBACK16-NEXT:    movl %eax, %ecx
; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; FALLBACK16-NEXT:    shll %cl, %edx
; FALLBACK16-NEXT:    movl {{[0-9]+}}(%esp), %eax
; FALLBACK16-NEXT:    movl %edx, (%eax)
; FALLBACK16-NEXT:    movl %esi, 56(%eax)
; FALLBACK16-NEXT:    movl %edi, 60(%eax)
; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK16-NEXT:    movl %ecx, 48(%eax)
; FALLBACK16-NEXT:    movl %ebp, 52(%eax)
; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK16-NEXT:    movl %ecx, 40(%eax)
; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK16-NEXT:    movl %ecx, 44(%eax)
; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK16-NEXT:    movl %ecx, 32(%eax)
; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK16-NEXT:    movl %ecx, 36(%eax)
; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK16-NEXT:    movl %ecx, 24(%eax)
; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK16-NEXT:    movl %ecx, 28(%eax)
; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK16-NEXT:    movl %ecx, 16(%eax)
; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK16-NEXT:    movl %ecx, 20(%eax)
; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK16-NEXT:    movl %ecx, 8(%eax)
; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK16-NEXT:    movl %ecx, 12(%eax)
; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK16-NEXT:    movl %ecx, 4(%eax)
; FALLBACK16-NEXT:    addl $204, %esp
; FALLBACK16-NEXT:    popl %esi
; FALLBACK16-NEXT:    popl %edi
; FALLBACK16-NEXT:    popl %ebx
; FALLBACK16-NEXT:    popl %ebp
; FALLBACK16-NEXT:    retl
;
; FALLBACK17-LABEL: shl_64bytes:
; FALLBACK17:       # %bb.0:
; FALLBACK17-NEXT:    pushl %ebp
; FALLBACK17-NEXT:    pushl %ebx
; FALLBACK17-NEXT:    pushl %edi
; FALLBACK17-NEXT:    pushl %esi
; FALLBACK17-NEXT:    subl $188, %esp
; FALLBACK17-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; FALLBACK17-NEXT:    movl (%ecx), %eax
; FALLBACK17-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK17-NEXT:    movl 4(%ecx), %eax
; FALLBACK17-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK17-NEXT:    movl 8(%ecx), %eax
; FALLBACK17-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK17-NEXT:    movl 12(%ecx), %eax
; FALLBACK17-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK17-NEXT:    movl 16(%ecx), %eax
; FALLBACK17-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK17-NEXT:    movl 20(%ecx), %eax
; FALLBACK17-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK17-NEXT:    movl 24(%ecx), %eax
; FALLBACK17-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK17-NEXT:    movl 28(%ecx), %eax
; FALLBACK17-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK17-NEXT:    movl 32(%ecx), %eax
; FALLBACK17-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK17-NEXT:    movl 36(%ecx), %eax
; FALLBACK17-NEXT:    movl %eax, (%esp) # 4-byte Spill
; FALLBACK17-NEXT:    movl 40(%ecx), %ebp
; FALLBACK17-NEXT:    movl 44(%ecx), %ebx
; FALLBACK17-NEXT:    movl 48(%ecx), %edi
; FALLBACK17-NEXT:    movl 52(%ecx), %esi
; FALLBACK17-NEXT:    movl 56(%ecx), %edx
; FALLBACK17-NEXT:    movl 60(%ecx), %eax
; FALLBACK17-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; FALLBACK17-NEXT:    movl (%ecx), %ecx
; FALLBACK17-NEXT:    xorps %xmm0, %xmm0
; FALLBACK17-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
; FALLBACK17-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK17-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK17-NEXT:    movl %esi, {{[0-9]+}}(%esp)
; FALLBACK17-NEXT:    movl %edi, {{[0-9]+}}(%esp)
; FALLBACK17-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
; FALLBACK17-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
; FALLBACK17-NEXT:    movl (%esp), %eax # 4-byte Reload
; FALLBACK17-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK17-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK17-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK17-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK17-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
; FALLBACK17-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
; FALLBACK17-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK17-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK17-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK17-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK17-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK17-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK17-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK17-NEXT:    movl %ecx, %ebp
; FALLBACK17-NEXT:    andl $60, %ebp
; FALLBACK17-NEXT:    leal {{[0-9]+}}(%esp), %eax
; FALLBACK17-NEXT:    subl %ebp, %eax
; FALLBACK17-NEXT:    movl 8(%eax), %esi
; FALLBACK17-NEXT:    movl 12(%eax), %edx
; FALLBACK17-NEXT:    shll $3, %ecx
; FALLBACK17-NEXT:    andl $24, %ecx
; FALLBACK17-NEXT:    movl %edx, %edi
; FALLBACK17-NEXT:    shldl %cl, %esi, %edi
; FALLBACK17-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK17-NEXT:    movl 4(%eax), %edi
; FALLBACK17-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK17-NEXT:    shldl %cl, %edi, %esi
; FALLBACK17-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK17-NEXT:    movl 16(%eax), %edi
; FALLBACK17-NEXT:    movl 20(%eax), %esi
; FALLBACK17-NEXT:    movl %esi, %ebx
; FALLBACK17-NEXT:    shldl %cl, %edi, %ebx
; FALLBACK17-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK17-NEXT:    shldl %cl, %edx, %edi
; FALLBACK17-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK17-NEXT:    movl 24(%eax), %edi
; FALLBACK17-NEXT:    movl 28(%eax), %edx
; FALLBACK17-NEXT:    movl %edx, %ebx
; FALLBACK17-NEXT:    shldl %cl, %edi, %ebx
; FALLBACK17-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK17-NEXT:    shldl %cl, %esi, %edi
; FALLBACK17-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK17-NEXT:    movl 32(%eax), %edi
; FALLBACK17-NEXT:    movl 36(%eax), %esi
; FALLBACK17-NEXT:    movl %esi, %ebx
; FALLBACK17-NEXT:    shldl %cl, %edi, %ebx
; FALLBACK17-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK17-NEXT:    shldl %cl, %edx, %edi
; FALLBACK17-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK17-NEXT:    movl 40(%eax), %edx
; FALLBACK17-NEXT:    movl 44(%eax), %edi
; FALLBACK17-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK17-NEXT:    shldl %cl, %edx, %edi
; FALLBACK17-NEXT:    movl %edi, (%esp) # 4-byte Spill
; FALLBACK17-NEXT:    shldl %cl, %esi, %edx
; FALLBACK17-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK17-NEXT:    movl 56(%eax), %edx
; FALLBACK17-NEXT:    movl 60(%eax), %edi
; FALLBACK17-NEXT:    shldl %cl, %edx, %edi
; FALLBACK17-NEXT:    movl (%eax), %ebx
; FALLBACK17-NEXT:    movl 52(%eax), %esi
; FALLBACK17-NEXT:    shldl %cl, %esi, %edx
; FALLBACK17-NEXT:    negl %ebp
; FALLBACK17-NEXT:    movl 160(%esp,%ebp), %eax
; FALLBACK17-NEXT:    movl {{[0-9]+}}(%esp), %ebp
; FALLBACK17-NEXT:    movl %edx, 56(%ebp)
; FALLBACK17-NEXT:    movl %edi, 60(%ebp)
; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; FALLBACK17-NEXT:    shldl %cl, %ebx, %edx
; FALLBACK17-NEXT:    shll %cl, %ebx
; FALLBACK17-NEXT:    shldl %cl, %eax, %esi
; FALLBACK17-NEXT:    # kill: def $cl killed $cl killed $ecx
; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
; FALLBACK17-NEXT:    shldl %cl, %edi, %eax
; FALLBACK17-NEXT:    movl %eax, 48(%ebp)
; FALLBACK17-NEXT:    movl %esi, 52(%ebp)
; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK17-NEXT:    movl %eax, 40(%ebp)
; FALLBACK17-NEXT:    movl (%esp), %eax # 4-byte Reload
; FALLBACK17-NEXT:    movl %eax, 44(%ebp)
; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK17-NEXT:    movl %eax, 32(%ebp)
; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK17-NEXT:    movl %eax, 36(%ebp)
; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK17-NEXT:    movl %eax, 24(%ebp)
; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK17-NEXT:    movl %eax, 28(%ebp)
; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK17-NEXT:    movl %eax, 16(%ebp)
; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK17-NEXT:    movl %eax, 20(%ebp)
; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK17-NEXT:    movl %eax, 8(%ebp)
; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK17-NEXT:    movl %eax, 12(%ebp)
; FALLBACK17-NEXT:    movl %ebx, (%ebp)
; FALLBACK17-NEXT:    movl %edx, 4(%ebp)
; FALLBACK17-NEXT:    addl $188, %esp
; FALLBACK17-NEXT:    popl %esi
; FALLBACK17-NEXT:    popl %edi
; FALLBACK17-NEXT:    popl %ebx
; FALLBACK17-NEXT:    popl %ebp
; FALLBACK17-NEXT:    retl
;
; FALLBACK18-LABEL: shl_64bytes:
; FALLBACK18:       # %bb.0:
; FALLBACK18-NEXT:    pushl %ebp
; FALLBACK18-NEXT:    pushl %ebx
; FALLBACK18-NEXT:    pushl %edi
; FALLBACK18-NEXT:    pushl %esi
; FALLBACK18-NEXT:    subl $204, %esp
; FALLBACK18-NEXT:    movl {{[0-9]+}}(%esp), %eax
; FALLBACK18-NEXT:    movl (%eax), %ecx
; FALLBACK18-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK18-NEXT:    movl 4(%eax), %ecx
; FALLBACK18-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK18-NEXT:    movl 8(%eax), %ecx
; FALLBACK18-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK18-NEXT:    movl 12(%eax), %ecx
; FALLBACK18-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK18-NEXT:    movl 16(%eax), %ecx
; FALLBACK18-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK18-NEXT:    movl 20(%eax), %ecx
; FALLBACK18-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK18-NEXT:    movl 24(%eax), %ecx
; FALLBACK18-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK18-NEXT:    movl 28(%eax), %ecx
; FALLBACK18-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK18-NEXT:    movl 32(%eax), %ecx
; FALLBACK18-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK18-NEXT:    movl 36(%eax), %ecx
; FALLBACK18-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK18-NEXT:    movl 40(%eax), %ebx
; FALLBACK18-NEXT:    movl 44(%eax), %edi
; FALLBACK18-NEXT:    movl 48(%eax), %esi
; FALLBACK18-NEXT:    movl 52(%eax), %edx
; FALLBACK18-NEXT:    movl 56(%eax), %ecx
; FALLBACK18-NEXT:    movl 60(%eax), %eax
; FALLBACK18-NEXT:    movl {{[0-9]+}}(%esp), %ebp
; FALLBACK18-NEXT:    movl (%ebp), %ebp
; FALLBACK18-NEXT:    xorps %xmm0, %xmm0
; FALLBACK18-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
; FALLBACK18-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK18-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK18-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK18-NEXT:    movl %esi, {{[0-9]+}}(%esp)
; FALLBACK18-NEXT:    movl %edi, {{[0-9]+}}(%esp)
; FALLBACK18-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK18-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK18-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK18-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK18-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK18-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
; FALLBACK18-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
; FALLBACK18-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK18-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK18-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK18-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK18-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK18-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK18-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK18-NEXT:    leal (,%ebp,8), %edx
; FALLBACK18-NEXT:    andl $24, %edx
; FALLBACK18-NEXT:    andl $60, %ebp
; FALLBACK18-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK18-NEXT:    leal {{[0-9]+}}(%esp), %edi
; FALLBACK18-NEXT:    subl %ebp, %edi
; FALLBACK18-NEXT:    movl (%edi), %ecx
; FALLBACK18-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK18-NEXT:    movl 4(%edi), %eax
; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK18-NEXT:    movl %edx, %ebx
; FALLBACK18-NEXT:    notb %bl
; FALLBACK18-NEXT:    shrl %ecx
; FALLBACK18-NEXT:    shrxl %ebx, %ecx, %esi
; FALLBACK18-NEXT:    shlxl %edx, %eax, %ecx
; FALLBACK18-NEXT:    orl %ecx, %esi
; FALLBACK18-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK18-NEXT:    movl 8(%edi), %esi
; FALLBACK18-NEXT:    movl %esi, %ecx
; FALLBACK18-NEXT:    shrl %ecx
; FALLBACK18-NEXT:    shrxl %ebx, %ecx, %eax
; FALLBACK18-NEXT:    movl 12(%edi), %ecx
; FALLBACK18-NEXT:    shlxl %edx, %ecx, %ebp
; FALLBACK18-NEXT:    orl %ebp, %eax
; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK18-NEXT:    shlxl %edx, %esi, %esi
; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK18-NEXT:    shrl %eax
; FALLBACK18-NEXT:    shrxl %ebx, %eax, %eax
; FALLBACK18-NEXT:    orl %esi, %eax
; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK18-NEXT:    movl 16(%edi), %eax
; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK18-NEXT:    shrl %eax
; FALLBACK18-NEXT:    shrxl %ebx, %eax, %eax
; FALLBACK18-NEXT:    movl 20(%edi), %esi
; FALLBACK18-NEXT:    shlxl %edx, %esi, %ebp
; FALLBACK18-NEXT:    orl %ebp, %eax
; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK18-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; FALLBACK18-NEXT:    shrl %ecx
; FALLBACK18-NEXT:    shrxl %ebx, %ecx, %ecx
; FALLBACK18-NEXT:    orl %eax, %ecx
; FALLBACK18-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK18-NEXT:    movl 24(%edi), %ecx
; FALLBACK18-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK18-NEXT:    shrl %ecx
; FALLBACK18-NEXT:    shrxl %ebx, %ecx, %eax
; FALLBACK18-NEXT:    movl 28(%edi), %ecx
; FALLBACK18-NEXT:    shlxl %edx, %ecx, %ebp
; FALLBACK18-NEXT:    orl %ebp, %eax
; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK18-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; FALLBACK18-NEXT:    shrl %esi
; FALLBACK18-NEXT:    shrxl %ebx, %esi, %esi
; FALLBACK18-NEXT:    orl %eax, %esi
; FALLBACK18-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK18-NEXT:    movl 32(%edi), %eax
; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK18-NEXT:    shrl %eax
; FALLBACK18-NEXT:    shrxl %ebx, %eax, %eax
; FALLBACK18-NEXT:    movl 36(%edi), %esi
; FALLBACK18-NEXT:    shlxl %edx, %esi, %ebp
; FALLBACK18-NEXT:    orl %ebp, %eax
; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK18-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; FALLBACK18-NEXT:    shrl %ecx
; FALLBACK18-NEXT:    shrxl %ebx, %ecx, %ecx
; FALLBACK18-NEXT:    orl %eax, %ecx
; FALLBACK18-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK18-NEXT:    movl 40(%edi), %ecx
; FALLBACK18-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK18-NEXT:    shrl %ecx
; FALLBACK18-NEXT:    shrxl %ebx, %ecx, %eax
; FALLBACK18-NEXT:    movl 44(%edi), %ecx
; FALLBACK18-NEXT:    shlxl %edx, %ecx, %ebp
; FALLBACK18-NEXT:    orl %ebp, %eax
; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK18-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; FALLBACK18-NEXT:    shrl %esi
; FALLBACK18-NEXT:    shrxl %ebx, %esi, %esi
; FALLBACK18-NEXT:    orl %eax, %esi
; FALLBACK18-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK18-NEXT:    movl 48(%edi), %esi
; FALLBACK18-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK18-NEXT:    shrl %esi
; FALLBACK18-NEXT:    shrxl %ebx, %esi, %eax
; FALLBACK18-NEXT:    movl 52(%edi), %esi
; FALLBACK18-NEXT:    shlxl %edx, %esi, %ebp
; FALLBACK18-NEXT:    orl %ebp, %eax
; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK18-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; FALLBACK18-NEXT:    shrl %ecx
; FALLBACK18-NEXT:    shrxl %ebx, %ecx, %ebp
; FALLBACK18-NEXT:    orl %eax, %ebp
; FALLBACK18-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK18-NEXT:    negl %eax
; FALLBACK18-NEXT:    shlxl %edx, 188(%esp,%eax), %ecx
; FALLBACK18-NEXT:    movl 56(%edi), %eax
; FALLBACK18-NEXT:    shlxl %edx, %eax, %edx
; FALLBACK18-NEXT:    shrl %esi
; FALLBACK18-NEXT:    shrxl %ebx, %esi, %esi
; FALLBACK18-NEXT:    orl %edx, %esi
; FALLBACK18-NEXT:    shrl %eax
; FALLBACK18-NEXT:    shrxl %ebx, %eax, %eax
; FALLBACK18-NEXT:    orl %eax, %ecx
; FALLBACK18-NEXT:    movl {{[0-9]+}}(%esp), %eax
; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; FALLBACK18-NEXT:    movl %edx, (%eax)
; FALLBACK18-NEXT:    movl %esi, 56(%eax)
; FALLBACK18-NEXT:    movl %ecx, 60(%eax)
; FALLBACK18-NEXT:    movl %ebp, 48(%eax)
; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK18-NEXT:    movl %ecx, 52(%eax)
; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK18-NEXT:    movl %ecx, 40(%eax)
; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK18-NEXT:    movl %ecx, 44(%eax)
; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK18-NEXT:    movl %ecx, 32(%eax)
; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK18-NEXT:    movl %ecx, 36(%eax)
; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK18-NEXT:    movl %ecx, 24(%eax)
; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK18-NEXT:    movl %ecx, 28(%eax)
; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK18-NEXT:    movl %ecx, 16(%eax)
; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK18-NEXT:    movl %ecx, 20(%eax)
; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK18-NEXT:    movl %ecx, 8(%eax)
; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK18-NEXT:    movl %ecx, 12(%eax)
; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK18-NEXT:    movl %ecx, 4(%eax)
; FALLBACK18-NEXT:    addl $204, %esp
; FALLBACK18-NEXT:    popl %esi
; FALLBACK18-NEXT:    popl %edi
; FALLBACK18-NEXT:    popl %ebx
; FALLBACK18-NEXT:    popl %ebp
; FALLBACK18-NEXT:    retl
;
; FALLBACK19-LABEL: shl_64bytes:
; FALLBACK19:       # %bb.0:
; FALLBACK19-NEXT:    pushl %ebp
; FALLBACK19-NEXT:    pushl %ebx
; FALLBACK19-NEXT:    pushl %edi
; FALLBACK19-NEXT:    pushl %esi
; FALLBACK19-NEXT:    subl $204, %esp
; FALLBACK19-NEXT:    movl {{[0-9]+}}(%esp), %ebp
; FALLBACK19-NEXT:    movl (%ebp), %eax
; FALLBACK19-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK19-NEXT:    movl 4(%ebp), %eax
; FALLBACK19-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK19-NEXT:    movl 8(%ebp), %eax
; FALLBACK19-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK19-NEXT:    movl 12(%ebp), %eax
; FALLBACK19-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK19-NEXT:    movl 16(%ebp), %eax
; FALLBACK19-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK19-NEXT:    movl 20(%ebp), %eax
; FALLBACK19-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK19-NEXT:    movl 24(%ebp), %eax
; FALLBACK19-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK19-NEXT:    movl 28(%ebp), %eax
; FALLBACK19-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK19-NEXT:    movl 32(%ebp), %eax
; FALLBACK19-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK19-NEXT:    movl 36(%ebp), %eax
; FALLBACK19-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK19-NEXT:    movl 40(%ebp), %ebx
; FALLBACK19-NEXT:    movl 44(%ebp), %edi
; FALLBACK19-NEXT:    movl 48(%ebp), %esi
; FALLBACK19-NEXT:    movl 52(%ebp), %edx
; FALLBACK19-NEXT:    movl 56(%ebp), %ecx
; FALLBACK19-NEXT:    movl 60(%ebp), %eax
; FALLBACK19-NEXT:    movl {{[0-9]+}}(%esp), %ebp
; FALLBACK19-NEXT:    movl (%ebp), %ebp
; FALLBACK19-NEXT:    xorps %xmm0, %xmm0
; FALLBACK19-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
; FALLBACK19-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK19-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK19-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK19-NEXT:    movl %esi, {{[0-9]+}}(%esp)
; FALLBACK19-NEXT:    movl %edi, {{[0-9]+}}(%esp)
; FALLBACK19-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK19-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK19-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK19-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK19-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK19-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
; FALLBACK19-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
; FALLBACK19-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK19-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK19-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK19-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK19-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK19-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK19-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK19-NEXT:    leal (,%ebp,8), %ecx
; FALLBACK19-NEXT:    andl $24, %ecx
; FALLBACK19-NEXT:    andl $60, %ebp
; FALLBACK19-NEXT:    leal {{[0-9]+}}(%esp), %eax
; FALLBACK19-NEXT:    subl %ebp, %eax
; FALLBACK19-NEXT:    movl 4(%eax), %esi
; FALLBACK19-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK19-NEXT:    movl 8(%eax), %edi
; FALLBACK19-NEXT:    movl 12(%eax), %edx
; FALLBACK19-NEXT:    movl %edx, %ebx
; FALLBACK19-NEXT:    shldl %cl, %edi, %ebx
; FALLBACK19-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK19-NEXT:    shldl %cl, %esi, %edi
; FALLBACK19-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK19-NEXT:    movl 16(%eax), %edi
; FALLBACK19-NEXT:    movl 20(%eax), %esi
; FALLBACK19-NEXT:    movl %esi, %ebx
; FALLBACK19-NEXT:    shldl %cl, %edi, %ebx
; FALLBACK19-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK19-NEXT:    shldl %cl, %edx, %edi
; FALLBACK19-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK19-NEXT:    movl 24(%eax), %edi
; FALLBACK19-NEXT:    movl 28(%eax), %edx
; FALLBACK19-NEXT:    movl %edx, %ebx
; FALLBACK19-NEXT:    shldl %cl, %edi, %ebx
; FALLBACK19-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK19-NEXT:    shldl %cl, %esi, %edi
; FALLBACK19-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK19-NEXT:    movl 32(%eax), %edi
; FALLBACK19-NEXT:    movl 36(%eax), %esi
; FALLBACK19-NEXT:    movl %esi, %ebx
; FALLBACK19-NEXT:    shldl %cl, %edi, %ebx
; FALLBACK19-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK19-NEXT:    shldl %cl, %edx, %edi
; FALLBACK19-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK19-NEXT:    movl 40(%eax), %ebx
; FALLBACK19-NEXT:    movl 44(%eax), %edx
; FALLBACK19-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK19-NEXT:    shldl %cl, %ebx, %edx
; FALLBACK19-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK19-NEXT:    shldl %cl, %esi, %ebx
; FALLBACK19-NEXT:    movl 56(%eax), %edx
; FALLBACK19-NEXT:    movl 60(%eax), %edi
; FALLBACK19-NEXT:    shldl %cl, %edx, %edi
; FALLBACK19-NEXT:    movl (%eax), %esi
; FALLBACK19-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK19-NEXT:    movl 52(%eax), %esi
; FALLBACK19-NEXT:    shldl %cl, %esi, %edx
; FALLBACK19-NEXT:    negl %ebp
; FALLBACK19-NEXT:    movl 176(%esp,%ebp), %ebp
; FALLBACK19-NEXT:    movl {{[0-9]+}}(%esp), %eax
; FALLBACK19-NEXT:    movl %edx, 56(%eax)
; FALLBACK19-NEXT:    movl %edi, 60(%eax)
; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; FALLBACK19-NEXT:    shlxl %ecx, %edx, %edi
; FALLBACK19-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
; FALLBACK19-NEXT:    shldl %cl, %edx, %edi
; FALLBACK19-NEXT:    shldl %cl, %ebp, %esi
; FALLBACK19-NEXT:    # kill: def $cl killed $cl killed $ecx
; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; FALLBACK19-NEXT:    shldl %cl, %edx, %ebp
; FALLBACK19-NEXT:    movl %ebp, 48(%eax)
; FALLBACK19-NEXT:    movl %esi, 52(%eax)
; FALLBACK19-NEXT:    movl %ebx, 40(%eax)
; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK19-NEXT:    movl %ecx, 44(%eax)
; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK19-NEXT:    movl %ecx, 32(%eax)
; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK19-NEXT:    movl %ecx, 36(%eax)
; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK19-NEXT:    movl %ecx, 24(%eax)
; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK19-NEXT:    movl %ecx, 28(%eax)
; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK19-NEXT:    movl %ecx, 16(%eax)
; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK19-NEXT:    movl %ecx, 20(%eax)
; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK19-NEXT:    movl %ecx, 8(%eax)
; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK19-NEXT:    movl %ecx, 12(%eax)
; FALLBACK19-NEXT:    movl %edi, 4(%eax)
; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK19-NEXT:    movl %ecx, (%eax)
; FALLBACK19-NEXT:    addl $204, %esp
; FALLBACK19-NEXT:    popl %esi
; FALLBACK19-NEXT:    popl %edi
; FALLBACK19-NEXT:    popl %ebx
; FALLBACK19-NEXT:    popl %ebp
; FALLBACK19-NEXT:    retl
;
; FALLBACK20-LABEL: shl_64bytes:
; FALLBACK20:       # %bb.0:
; FALLBACK20-NEXT:    pushl %ebp
; FALLBACK20-NEXT:    pushl %ebx
; FALLBACK20-NEXT:    pushl %edi
; FALLBACK20-NEXT:    pushl %esi
; FALLBACK20-NEXT:    subl $204, %esp
; FALLBACK20-NEXT:    movl {{[0-9]+}}(%esp), %eax
; FALLBACK20-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; FALLBACK20-NEXT:    movups (%ecx), %xmm0
; FALLBACK20-NEXT:    movups 16(%ecx), %xmm1
; FALLBACK20-NEXT:    movups 32(%ecx), %xmm2
; FALLBACK20-NEXT:    movups 48(%ecx), %xmm3
; FALLBACK20-NEXT:    movl (%eax), %eax
; FALLBACK20-NEXT:    xorps %xmm4, %xmm4
; FALLBACK20-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
; FALLBACK20-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
; FALLBACK20-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
; FALLBACK20-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
; FALLBACK20-NEXT:    movaps %xmm3, {{[0-9]+}}(%esp)
; FALLBACK20-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
; FALLBACK20-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
; FALLBACK20-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
; FALLBACK20-NEXT:    movl %eax, %edx
; FALLBACK20-NEXT:    andl $60, %edx
; FALLBACK20-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK20-NEXT:    leal {{[0-9]+}}(%esp), %ecx
; FALLBACK20-NEXT:    subl %edx, %ecx
; FALLBACK20-NEXT:    movl (%ecx), %edi
; FALLBACK20-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK20-NEXT:    movl 4(%ecx), %edx
; FALLBACK20-NEXT:    movl %ecx, %ebp
; FALLBACK20-NEXT:    shll $3, %eax
; FALLBACK20-NEXT:    andl $24, %eax
; FALLBACK20-NEXT:    movl %edx, %esi
; FALLBACK20-NEXT:    movl %eax, %ecx
; FALLBACK20-NEXT:    shll %cl, %esi
; FALLBACK20-NEXT:    shrl %edi
; FALLBACK20-NEXT:    movb %al, %ch
; FALLBACK20-NEXT:    notb %ch
; FALLBACK20-NEXT:    movb %ch, %cl
; FALLBACK20-NEXT:    shrl %cl, %edi
; FALLBACK20-NEXT:    orl %esi, %edi
; FALLBACK20-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK20-NEXT:    movl 12(%ebp), %ebx
; FALLBACK20-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK20-NEXT:    movb %al, %cl
; FALLBACK20-NEXT:    shll %cl, %ebx
; FALLBACK20-NEXT:    movl 8(%ebp), %esi
; FALLBACK20-NEXT:    movl %ebp, %edi
; FALLBACK20-NEXT:    movl %esi, %ebp
; FALLBACK20-NEXT:    shrl %ebp
; FALLBACK20-NEXT:    movb %ch, %cl
; FALLBACK20-NEXT:    shrl %cl, %ebp
; FALLBACK20-NEXT:    orl %ebx, %ebp
; FALLBACK20-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK20-NEXT:    movb %al, %cl
; FALLBACK20-NEXT:    shll %cl, %esi
; FALLBACK20-NEXT:    shrl %edx
; FALLBACK20-NEXT:    movb %ch, %cl
; FALLBACK20-NEXT:    shrl %cl, %edx
; FALLBACK20-NEXT:    orl %esi, %edx
; FALLBACK20-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK20-NEXT:    movl %edi, %ebp
; FALLBACK20-NEXT:    movl 20(%edi), %ebx
; FALLBACK20-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK20-NEXT:    movb %al, %cl
; FALLBACK20-NEXT:    shll %cl, %ebx
; FALLBACK20-NEXT:    movl 16(%edi), %esi
; FALLBACK20-NEXT:    movl %esi, %edx
; FALLBACK20-NEXT:    shrl %edx
; FALLBACK20-NEXT:    movb %ch, %cl
; FALLBACK20-NEXT:    shrl %cl, %edx
; FALLBACK20-NEXT:    orl %ebx, %edx
; FALLBACK20-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK20-NEXT:    movb %al, %cl
; FALLBACK20-NEXT:    shll %cl, %esi
; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
; FALLBACK20-NEXT:    shrl %edi
; FALLBACK20-NEXT:    movb %ch, %cl
; FALLBACK20-NEXT:    shrl %cl, %edi
; FALLBACK20-NEXT:    orl %esi, %edi
; FALLBACK20-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK20-NEXT:    movl %ebp, %edx
; FALLBACK20-NEXT:    movl 28(%ebp), %ebx
; FALLBACK20-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK20-NEXT:    movb %al, %cl
; FALLBACK20-NEXT:    shll %cl, %ebx
; FALLBACK20-NEXT:    movl 24(%ebp), %esi
; FALLBACK20-NEXT:    movl %esi, %edi
; FALLBACK20-NEXT:    shrl %edi
; FALLBACK20-NEXT:    movb %ch, %cl
; FALLBACK20-NEXT:    shrl %cl, %edi
; FALLBACK20-NEXT:    orl %ebx, %edi
; FALLBACK20-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK20-NEXT:    movb %al, %cl
; FALLBACK20-NEXT:    shll %cl, %esi
; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
; FALLBACK20-NEXT:    shrl %ebp
; FALLBACK20-NEXT:    movb %ch, %cl
; FALLBACK20-NEXT:    shrl %cl, %ebp
; FALLBACK20-NEXT:    orl %esi, %ebp
; FALLBACK20-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK20-NEXT:    movl 36(%edx), %ebx
; FALLBACK20-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK20-NEXT:    movb %al, %cl
; FALLBACK20-NEXT:    shll %cl, %ebx
; FALLBACK20-NEXT:    movl 32(%edx), %esi
; FALLBACK20-NEXT:    movl %edx, %ebp
; FALLBACK20-NEXT:    movl %esi, %edi
; FALLBACK20-NEXT:    shrl %edi
; FALLBACK20-NEXT:    movb %ch, %cl
; FALLBACK20-NEXT:    shrl %cl, %edi
; FALLBACK20-NEXT:    orl %ebx, %edi
; FALLBACK20-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK20-NEXT:    movb %al, %cl
; FALLBACK20-NEXT:    shll %cl, %esi
; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; FALLBACK20-NEXT:    shrl %edx
; FALLBACK20-NEXT:    movb %ch, %cl
; FALLBACK20-NEXT:    shrl %cl, %edx
; FALLBACK20-NEXT:    orl %esi, %edx
; FALLBACK20-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK20-NEXT:    movl 44(%ebp), %ebx
; FALLBACK20-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK20-NEXT:    movb %al, %cl
; FALLBACK20-NEXT:    shll %cl, %ebx
; FALLBACK20-NEXT:    movl 40(%ebp), %esi
; FALLBACK20-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK20-NEXT:    movl %esi, %edx
; FALLBACK20-NEXT:    shrl %edx
; FALLBACK20-NEXT:    movb %ch, %cl
; FALLBACK20-NEXT:    shrl %cl, %edx
; FALLBACK20-NEXT:    orl %ebx, %edx
; FALLBACK20-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK20-NEXT:    movb %al, %cl
; FALLBACK20-NEXT:    shll %cl, %esi
; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; FALLBACK20-NEXT:    shrl %edx
; FALLBACK20-NEXT:    movb %ch, %cl
; FALLBACK20-NEXT:    shrl %cl, %edx
; FALLBACK20-NEXT:    orl %esi, %edx
; FALLBACK20-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK20-NEXT:    movl 52(%ebp), %esi
; FALLBACK20-NEXT:    movl %esi, %edi
; FALLBACK20-NEXT:    movb %al, %cl
; FALLBACK20-NEXT:    shll %cl, %edi
; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; FALLBACK20-NEXT:    negl %edx
; FALLBACK20-NEXT:    movl 176(%esp,%edx), %ebx
; FALLBACK20-NEXT:    movl %ebx, %ebp
; FALLBACK20-NEXT:    shrl %ebp
; FALLBACK20-NEXT:    movb %ch, %cl
; FALLBACK20-NEXT:    shrl %cl, %ebp
; FALLBACK20-NEXT:    orl %edi, %ebp
; FALLBACK20-NEXT:    movb %al, %cl
; FALLBACK20-NEXT:    shll %cl, %ebx
; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; FALLBACK20-NEXT:    shrl %edx
; FALLBACK20-NEXT:    movb %ch, %cl
; FALLBACK20-NEXT:    shrl %cl, %edx
; FALLBACK20-NEXT:    orl %ebx, %edx
; FALLBACK20-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
; FALLBACK20-NEXT:    movl 60(%edi), %edx
; FALLBACK20-NEXT:    movb %al, %cl
; FALLBACK20-NEXT:    shll %cl, %edx
; FALLBACK20-NEXT:    movl 56(%edi), %ebx
; FALLBACK20-NEXT:    movl %ebx, %edi
; FALLBACK20-NEXT:    shrl %edi
; FALLBACK20-NEXT:    movb %ch, %cl
; FALLBACK20-NEXT:    shrl %cl, %edi
; FALLBACK20-NEXT:    orl %edx, %edi
; FALLBACK20-NEXT:    movb %al, %cl
; FALLBACK20-NEXT:    shll %cl, %ebx
; FALLBACK20-NEXT:    shrl %esi
; FALLBACK20-NEXT:    movb %ch, %cl
; FALLBACK20-NEXT:    shrl %cl, %esi
; FALLBACK20-NEXT:    orl %ebx, %esi
; FALLBACK20-NEXT:    movl %eax, %ecx
; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; FALLBACK20-NEXT:    shll %cl, %edx
; FALLBACK20-NEXT:    movl {{[0-9]+}}(%esp), %eax
; FALLBACK20-NEXT:    movl %edx, (%eax)
; FALLBACK20-NEXT:    movl %esi, 56(%eax)
; FALLBACK20-NEXT:    movl %edi, 60(%eax)
; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK20-NEXT:    movl %ecx, 48(%eax)
; FALLBACK20-NEXT:    movl %ebp, 52(%eax)
; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK20-NEXT:    movl %ecx, 40(%eax)
; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK20-NEXT:    movl %ecx, 44(%eax)
; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK20-NEXT:    movl %ecx, 32(%eax)
; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK20-NEXT:    movl %ecx, 36(%eax)
; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK20-NEXT:    movl %ecx, 24(%eax)
; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK20-NEXT:    movl %ecx, 28(%eax)
; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK20-NEXT:    movl %ecx, 16(%eax)
; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK20-NEXT:    movl %ecx, 20(%eax)
; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK20-NEXT:    movl %ecx, 8(%eax)
; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK20-NEXT:    movl %ecx, 12(%eax)
; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK20-NEXT:    movl %ecx, 4(%eax)
; FALLBACK20-NEXT:    addl $204, %esp
; FALLBACK20-NEXT:    popl %esi
; FALLBACK20-NEXT:    popl %edi
; FALLBACK20-NEXT:    popl %ebx
; FALLBACK20-NEXT:    popl %ebp
; FALLBACK20-NEXT:    retl
;
; FALLBACK21-LABEL: shl_64bytes:
; FALLBACK21:       # %bb.0:
; FALLBACK21-NEXT:    pushl %ebp
; FALLBACK21-NEXT:    pushl %ebx
; FALLBACK21-NEXT:    pushl %edi
; FALLBACK21-NEXT:    pushl %esi
; FALLBACK21-NEXT:    subl $188, %esp
; FALLBACK21-NEXT:    movl {{[0-9]+}}(%esp), %eax
; FALLBACK21-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; FALLBACK21-NEXT:    movups (%ecx), %xmm0
; FALLBACK21-NEXT:    movups 16(%ecx), %xmm1
; FALLBACK21-NEXT:    movups 32(%ecx), %xmm2
; FALLBACK21-NEXT:    movups 48(%ecx), %xmm3
; FALLBACK21-NEXT:    movl (%eax), %ecx
; FALLBACK21-NEXT:    xorps %xmm4, %xmm4
; FALLBACK21-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
; FALLBACK21-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
; FALLBACK21-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
; FALLBACK21-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
; FALLBACK21-NEXT:    movaps %xmm3, {{[0-9]+}}(%esp)
; FALLBACK21-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
; FALLBACK21-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
; FALLBACK21-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
; FALLBACK21-NEXT:    movl %ecx, %ebp
; FALLBACK21-NEXT:    andl $60, %ebp
; FALLBACK21-NEXT:    leal {{[0-9]+}}(%esp), %eax
; FALLBACK21-NEXT:    subl %ebp, %eax
; FALLBACK21-NEXT:    movl 8(%eax), %esi
; FALLBACK21-NEXT:    movl 12(%eax), %edx
; FALLBACK21-NEXT:    shll $3, %ecx
; FALLBACK21-NEXT:    andl $24, %ecx
; FALLBACK21-NEXT:    movl %edx, %edi
; FALLBACK21-NEXT:    shldl %cl, %esi, %edi
; FALLBACK21-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK21-NEXT:    movl 4(%eax), %edi
; FALLBACK21-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK21-NEXT:    shldl %cl, %edi, %esi
; FALLBACK21-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK21-NEXT:    movl 16(%eax), %edi
; FALLBACK21-NEXT:    movl 20(%eax), %esi
; FALLBACK21-NEXT:    movl %esi, %ebx
; FALLBACK21-NEXT:    shldl %cl, %edi, %ebx
; FALLBACK21-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK21-NEXT:    shldl %cl, %edx, %edi
; FALLBACK21-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK21-NEXT:    movl 24(%eax), %edi
; FALLBACK21-NEXT:    movl 28(%eax), %edx
; FALLBACK21-NEXT:    movl %edx, %ebx
; FALLBACK21-NEXT:    shldl %cl, %edi, %ebx
; FALLBACK21-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK21-NEXT:    shldl %cl, %esi, %edi
; FALLBACK21-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK21-NEXT:    movl 32(%eax), %edi
; FALLBACK21-NEXT:    movl 36(%eax), %esi
; FALLBACK21-NEXT:    movl %esi, %ebx
; FALLBACK21-NEXT:    shldl %cl, %edi, %ebx
; FALLBACK21-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK21-NEXT:    shldl %cl, %edx, %edi
; FALLBACK21-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK21-NEXT:    movl 40(%eax), %edx
; FALLBACK21-NEXT:    movl 44(%eax), %edi
; FALLBACK21-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK21-NEXT:    shldl %cl, %edx, %edi
; FALLBACK21-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK21-NEXT:    shldl %cl, %esi, %edx
; FALLBACK21-NEXT:    movl %edx, (%esp) # 4-byte Spill
; FALLBACK21-NEXT:    movl 56(%eax), %edx
; FALLBACK21-NEXT:    movl 60(%eax), %edi
; FALLBACK21-NEXT:    shldl %cl, %edx, %edi
; FALLBACK21-NEXT:    movl (%eax), %ebx
; FALLBACK21-NEXT:    movl 52(%eax), %esi
; FALLBACK21-NEXT:    shldl %cl, %esi, %edx
; FALLBACK21-NEXT:    negl %ebp
; FALLBACK21-NEXT:    movl 160(%esp,%ebp), %eax
; FALLBACK21-NEXT:    movl {{[0-9]+}}(%esp), %ebp
; FALLBACK21-NEXT:    movl %edx, 56(%ebp)
; FALLBACK21-NEXT:    movl %edi, 60(%ebp)
; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; FALLBACK21-NEXT:    shldl %cl, %ebx, %edx
; FALLBACK21-NEXT:    shll %cl, %ebx
; FALLBACK21-NEXT:    shldl %cl, %eax, %esi
; FALLBACK21-NEXT:    # kill: def $cl killed $cl killed $ecx
; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
; FALLBACK21-NEXT:    shldl %cl, %edi, %eax
; FALLBACK21-NEXT:    movl %eax, 48(%ebp)
; FALLBACK21-NEXT:    movl %esi, 52(%ebp)
; FALLBACK21-NEXT:    movl (%esp), %eax # 4-byte Reload
; FALLBACK21-NEXT:    movl %eax, 40(%ebp)
; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK21-NEXT:    movl %eax, 44(%ebp)
; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK21-NEXT:    movl %eax, 32(%ebp)
; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK21-NEXT:    movl %eax, 36(%ebp)
; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK21-NEXT:    movl %eax, 24(%ebp)
; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK21-NEXT:    movl %eax, 28(%ebp)
; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK21-NEXT:    movl %eax, 16(%ebp)
; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK21-NEXT:    movl %eax, 20(%ebp)
; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK21-NEXT:    movl %eax, 8(%ebp)
; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK21-NEXT:    movl %eax, 12(%ebp)
; FALLBACK21-NEXT:    movl %ebx, (%ebp)
; FALLBACK21-NEXT:    movl %edx, 4(%ebp)
; FALLBACK21-NEXT:    addl $188, %esp
; FALLBACK21-NEXT:    popl %esi
; FALLBACK21-NEXT:    popl %edi
; FALLBACK21-NEXT:    popl %ebx
; FALLBACK21-NEXT:    popl %ebp
; FALLBACK21-NEXT:    retl
;
; FALLBACK22-LABEL: shl_64bytes:
; FALLBACK22:       # %bb.0:
; FALLBACK22-NEXT:    pushl %ebp
; FALLBACK22-NEXT:    pushl %ebx
; FALLBACK22-NEXT:    pushl %edi
; FALLBACK22-NEXT:    pushl %esi
; FALLBACK22-NEXT:    subl $204, %esp
; FALLBACK22-NEXT:    movl {{[0-9]+}}(%esp), %eax
; FALLBACK22-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; FALLBACK22-NEXT:    movups (%ecx), %xmm0
; FALLBACK22-NEXT:    movups 16(%ecx), %xmm1
; FALLBACK22-NEXT:    movups 32(%ecx), %xmm2
; FALLBACK22-NEXT:    movups 48(%ecx), %xmm3
; FALLBACK22-NEXT:    movl (%eax), %eax
; FALLBACK22-NEXT:    xorps %xmm4, %xmm4
; FALLBACK22-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
; FALLBACK22-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
; FALLBACK22-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
; FALLBACK22-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
; FALLBACK22-NEXT:    movaps %xmm3, {{[0-9]+}}(%esp)
; FALLBACK22-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
; FALLBACK22-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
; FALLBACK22-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
; FALLBACK22-NEXT:    leal (,%eax,8), %edx
; FALLBACK22-NEXT:    andl $24, %edx
; FALLBACK22-NEXT:    andl $60, %eax
; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK22-NEXT:    leal {{[0-9]+}}(%esp), %edi
; FALLBACK22-NEXT:    subl %eax, %edi
; FALLBACK22-NEXT:    movl (%edi), %ecx
; FALLBACK22-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK22-NEXT:    movl 4(%edi), %eax
; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK22-NEXT:    movl %edx, %ebx
; FALLBACK22-NEXT:    notb %bl
; FALLBACK22-NEXT:    shrl %ecx
; FALLBACK22-NEXT:    shrxl %ebx, %ecx, %esi
; FALLBACK22-NEXT:    shlxl %edx, %eax, %ecx
; FALLBACK22-NEXT:    orl %ecx, %esi
; FALLBACK22-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK22-NEXT:    movl 8(%edi), %esi
; FALLBACK22-NEXT:    movl %esi, %ecx
; FALLBACK22-NEXT:    shrl %ecx
; FALLBACK22-NEXT:    shrxl %ebx, %ecx, %eax
; FALLBACK22-NEXT:    movl 12(%edi), %ecx
; FALLBACK22-NEXT:    shlxl %edx, %ecx, %ebp
; FALLBACK22-NEXT:    orl %ebp, %eax
; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK22-NEXT:    shlxl %edx, %esi, %esi
; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK22-NEXT:    shrl %eax
; FALLBACK22-NEXT:    shrxl %ebx, %eax, %eax
; FALLBACK22-NEXT:    orl %esi, %eax
; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK22-NEXT:    movl 16(%edi), %eax
; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK22-NEXT:    shrl %eax
; FALLBACK22-NEXT:    shrxl %ebx, %eax, %eax
; FALLBACK22-NEXT:    movl 20(%edi), %esi
; FALLBACK22-NEXT:    shlxl %edx, %esi, %ebp
; FALLBACK22-NEXT:    orl %ebp, %eax
; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK22-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; FALLBACK22-NEXT:    shrl %ecx
; FALLBACK22-NEXT:    shrxl %ebx, %ecx, %ecx
; FALLBACK22-NEXT:    orl %eax, %ecx
; FALLBACK22-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK22-NEXT:    movl 24(%edi), %ecx
; FALLBACK22-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK22-NEXT:    shrl %ecx
; FALLBACK22-NEXT:    shrxl %ebx, %ecx, %eax
; FALLBACK22-NEXT:    movl 28(%edi), %ecx
; FALLBACK22-NEXT:    shlxl %edx, %ecx, %ebp
; FALLBACK22-NEXT:    orl %ebp, %eax
; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK22-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; FALLBACK22-NEXT:    shrl %esi
; FALLBACK22-NEXT:    shrxl %ebx, %esi, %esi
; FALLBACK22-NEXT:    orl %eax, %esi
; FALLBACK22-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK22-NEXT:    movl 32(%edi), %eax
; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK22-NEXT:    shrl %eax
; FALLBACK22-NEXT:    shrxl %ebx, %eax, %eax
; FALLBACK22-NEXT:    movl 36(%edi), %esi
; FALLBACK22-NEXT:    shlxl %edx, %esi, %ebp
; FALLBACK22-NEXT:    orl %ebp, %eax
; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK22-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; FALLBACK22-NEXT:    shrl %ecx
; FALLBACK22-NEXT:    shrxl %ebx, %ecx, %ecx
; FALLBACK22-NEXT:    orl %eax, %ecx
; FALLBACK22-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK22-NEXT:    movl 40(%edi), %ecx
; FALLBACK22-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK22-NEXT:    shrl %ecx
; FALLBACK22-NEXT:    shrxl %ebx, %ecx, %eax
; FALLBACK22-NEXT:    movl 44(%edi), %ecx
; FALLBACK22-NEXT:    shlxl %edx, %ecx, %ebp
; FALLBACK22-NEXT:    orl %ebp, %eax
; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK22-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; FALLBACK22-NEXT:    shrl %esi
; FALLBACK22-NEXT:    shrxl %ebx, %esi, %esi
; FALLBACK22-NEXT:    orl %eax, %esi
; FALLBACK22-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK22-NEXT:    movl 48(%edi), %esi
; FALLBACK22-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK22-NEXT:    shrl %esi
; FALLBACK22-NEXT:    shrxl %ebx, %esi, %eax
; FALLBACK22-NEXT:    movl 52(%edi), %esi
; FALLBACK22-NEXT:    shlxl %edx, %esi, %ebp
; FALLBACK22-NEXT:    orl %ebp, %eax
; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK22-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; FALLBACK22-NEXT:    shrl %ecx
; FALLBACK22-NEXT:    shrxl %ebx, %ecx, %ebp
; FALLBACK22-NEXT:    orl %eax, %ebp
; FALLBACK22-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK22-NEXT:    negl %eax
; FALLBACK22-NEXT:    shlxl %edx, 188(%esp,%eax), %ecx
; FALLBACK22-NEXT:    movl 56(%edi), %eax
; FALLBACK22-NEXT:    shlxl %edx, %eax, %edx
; FALLBACK22-NEXT:    shrl %esi
; FALLBACK22-NEXT:    shrxl %ebx, %esi, %esi
; FALLBACK22-NEXT:    orl %edx, %esi
; FALLBACK22-NEXT:    shrl %eax
; FALLBACK22-NEXT:    shrxl %ebx, %eax, %eax
; FALLBACK22-NEXT:    orl %eax, %ecx
; FALLBACK22-NEXT:    movl {{[0-9]+}}(%esp), %eax
; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; FALLBACK22-NEXT:    movl %edx, (%eax)
; FALLBACK22-NEXT:    movl %esi, 56(%eax)
; FALLBACK22-NEXT:    movl %ecx, 60(%eax)
; FALLBACK22-NEXT:    movl %ebp, 48(%eax)
; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK22-NEXT:    movl %ecx, 52(%eax)
; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK22-NEXT:    movl %ecx, 40(%eax)
; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK22-NEXT:    movl %ecx, 44(%eax)
; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK22-NEXT:    movl %ecx, 32(%eax)
; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK22-NEXT:    movl %ecx, 36(%eax)
; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK22-NEXT:    movl %ecx, 24(%eax)
; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK22-NEXT:    movl %ecx, 28(%eax)
; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK22-NEXT:    movl %ecx, 16(%eax)
; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK22-NEXT:    movl %ecx, 20(%eax)
; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK22-NEXT:    movl %ecx, 8(%eax)
; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK22-NEXT:    movl %ecx, 12(%eax)
; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK22-NEXT:    movl %ecx, 4(%eax)
; FALLBACK22-NEXT:    addl $204, %esp
; FALLBACK22-NEXT:    popl %esi
; FALLBACK22-NEXT:    popl %edi
; FALLBACK22-NEXT:    popl %ebx
; FALLBACK22-NEXT:    popl %ebp
; FALLBACK22-NEXT:    retl
;
; FALLBACK23-LABEL: shl_64bytes:
; FALLBACK23:       # %bb.0:
; FALLBACK23-NEXT:    pushl %ebp
; FALLBACK23-NEXT:    pushl %ebx
; FALLBACK23-NEXT:    pushl %edi
; FALLBACK23-NEXT:    pushl %esi
; FALLBACK23-NEXT:    subl $204, %esp
; FALLBACK23-NEXT:    movl {{[0-9]+}}(%esp), %eax
; FALLBACK23-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; FALLBACK23-NEXT:    movups (%ecx), %xmm0
; FALLBACK23-NEXT:    movups 16(%ecx), %xmm1
; FALLBACK23-NEXT:    movups 32(%ecx), %xmm2
; FALLBACK23-NEXT:    movups 48(%ecx), %xmm3
; FALLBACK23-NEXT:    movl (%eax), %ebp
; FALLBACK23-NEXT:    xorps %xmm4, %xmm4
; FALLBACK23-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
; FALLBACK23-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
; FALLBACK23-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
; FALLBACK23-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
; FALLBACK23-NEXT:    movaps %xmm3, {{[0-9]+}}(%esp)
; FALLBACK23-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
; FALLBACK23-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
; FALLBACK23-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
; FALLBACK23-NEXT:    leal (,%ebp,8), %ecx
; FALLBACK23-NEXT:    andl $24, %ecx
; FALLBACK23-NEXT:    andl $60, %ebp
; FALLBACK23-NEXT:    leal {{[0-9]+}}(%esp), %eax
; FALLBACK23-NEXT:    subl %ebp, %eax
; FALLBACK23-NEXT:    movl 4(%eax), %esi
; FALLBACK23-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK23-NEXT:    movl 8(%eax), %edi
; FALLBACK23-NEXT:    movl 12(%eax), %edx
; FALLBACK23-NEXT:    movl %edx, %ebx
; FALLBACK23-NEXT:    shldl %cl, %edi, %ebx
; FALLBACK23-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK23-NEXT:    shldl %cl, %esi, %edi
; FALLBACK23-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK23-NEXT:    movl 16(%eax), %edi
; FALLBACK23-NEXT:    movl 20(%eax), %esi
; FALLBACK23-NEXT:    movl %esi, %ebx
; FALLBACK23-NEXT:    shldl %cl, %edi, %ebx
; FALLBACK23-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK23-NEXT:    shldl %cl, %edx, %edi
; FALLBACK23-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK23-NEXT:    movl 24(%eax), %edi
; FALLBACK23-NEXT:    movl 28(%eax), %edx
; FALLBACK23-NEXT:    movl %edx, %ebx
; FALLBACK23-NEXT:    shldl %cl, %edi, %ebx
; FALLBACK23-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK23-NEXT:    shldl %cl, %esi, %edi
; FALLBACK23-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK23-NEXT:    movl 32(%eax), %edi
; FALLBACK23-NEXT:    movl 36(%eax), %esi
; FALLBACK23-NEXT:    movl %esi, %ebx
; FALLBACK23-NEXT:    shldl %cl, %edi, %ebx
; FALLBACK23-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK23-NEXT:    shldl %cl, %edx, %edi
; FALLBACK23-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK23-NEXT:    movl 40(%eax), %ebx
; FALLBACK23-NEXT:    movl 44(%eax), %edx
; FALLBACK23-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK23-NEXT:    shldl %cl, %ebx, %edx
; FALLBACK23-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK23-NEXT:    shldl %cl, %esi, %ebx
; FALLBACK23-NEXT:    movl 56(%eax), %edx
; FALLBACK23-NEXT:    movl 60(%eax), %edi
; FALLBACK23-NEXT:    shldl %cl, %edx, %edi
; FALLBACK23-NEXT:    movl (%eax), %esi
; FALLBACK23-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK23-NEXT:    movl 52(%eax), %esi
; FALLBACK23-NEXT:    shldl %cl, %esi, %edx
; FALLBACK23-NEXT:    negl %ebp
; FALLBACK23-NEXT:    movl 176(%esp,%ebp), %ebp
; FALLBACK23-NEXT:    movl {{[0-9]+}}(%esp), %eax
; FALLBACK23-NEXT:    movl %edx, 56(%eax)
; FALLBACK23-NEXT:    movl %edi, 60(%eax)
; FALLBACK23-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; FALLBACK23-NEXT:    shlxl %ecx, %edx, %edi
; FALLBACK23-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK23-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
; FALLBACK23-NEXT:    shldl %cl, %edx, %edi
; FALLBACK23-NEXT:    shldl %cl, %ebp, %esi
; FALLBACK23-NEXT:    # kill: def $cl killed $cl killed $ecx
; FALLBACK23-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; FALLBACK23-NEXT:    shldl %cl, %edx, %ebp
; FALLBACK23-NEXT:    movl %ebp, 48(%eax)
; FALLBACK23-NEXT:    movl %esi, 52(%eax)
; FALLBACK23-NEXT:    movl %ebx, 40(%eax)
; FALLBACK23-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK23-NEXT:    movl %ecx, 44(%eax)
; FALLBACK23-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK23-NEXT:    movl %ecx, 32(%eax)
; FALLBACK23-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK23-NEXT:    movl %ecx, 36(%eax)
; FALLBACK23-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK23-NEXT:    movl %ecx, 24(%eax)
; FALLBACK23-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK23-NEXT:    movl %ecx, 28(%eax)
; FALLBACK23-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK23-NEXT:    movl %ecx, 16(%eax)
; FALLBACK23-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK23-NEXT:    movl %ecx, 20(%eax)
; FALLBACK23-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK23-NEXT:    movl %ecx, 8(%eax)
; FALLBACK23-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK23-NEXT:    movl %ecx, 12(%eax)
; FALLBACK23-NEXT:    movl %edi, 4(%eax)
; FALLBACK23-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK23-NEXT:    movl %ecx, (%eax)
; FALLBACK23-NEXT:    addl $204, %esp
; FALLBACK23-NEXT:    popl %esi
; FALLBACK23-NEXT:    popl %edi
; FALLBACK23-NEXT:    popl %ebx
; FALLBACK23-NEXT:    popl %ebp
; FALLBACK23-NEXT:    retl
;
; FALLBACK24-LABEL: shl_64bytes:
; FALLBACK24:       # %bb.0:
; FALLBACK24-NEXT:    pushl %ebp
; FALLBACK24-NEXT:    pushl %ebx
; FALLBACK24-NEXT:    pushl %edi
; FALLBACK24-NEXT:    pushl %esi
; FALLBACK24-NEXT:    subl $204, %esp
; FALLBACK24-NEXT:    movl {{[0-9]+}}(%esp), %eax
; FALLBACK24-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; FALLBACK24-NEXT:    vmovups (%ecx), %ymm0
; FALLBACK24-NEXT:    vmovups 32(%ecx), %ymm1
; FALLBACK24-NEXT:    movl (%eax), %eax
; FALLBACK24-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; FALLBACK24-NEXT:    vmovups %ymm2, {{[0-9]+}}(%esp)
; FALLBACK24-NEXT:    vmovups %ymm2, {{[0-9]+}}(%esp)
; FALLBACK24-NEXT:    vmovups %ymm1, {{[0-9]+}}(%esp)
; FALLBACK24-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
; FALLBACK24-NEXT:    movl %eax, %edx
; FALLBACK24-NEXT:    andl $60, %edx
; FALLBACK24-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK24-NEXT:    leal {{[0-9]+}}(%esp), %ecx
; FALLBACK24-NEXT:    subl %edx, %ecx
; FALLBACK24-NEXT:    movl (%ecx), %edi
; FALLBACK24-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK24-NEXT:    movl 4(%ecx), %edx
; FALLBACK24-NEXT:    movl %ecx, %ebp
; FALLBACK24-NEXT:    shll $3, %eax
; FALLBACK24-NEXT:    andl $24, %eax
; FALLBACK24-NEXT:    movl %edx, %esi
; FALLBACK24-NEXT:    movl %eax, %ecx
; FALLBACK24-NEXT:    shll %cl, %esi
; FALLBACK24-NEXT:    shrl %edi
; FALLBACK24-NEXT:    movb %al, %ch
; FALLBACK24-NEXT:    notb %ch
; FALLBACK24-NEXT:    movb %ch, %cl
; FALLBACK24-NEXT:    shrl %cl, %edi
; FALLBACK24-NEXT:    orl %esi, %edi
; FALLBACK24-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK24-NEXT:    movl 12(%ebp), %ebx
; FALLBACK24-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK24-NEXT:    movb %al, %cl
; FALLBACK24-NEXT:    shll %cl, %ebx
; FALLBACK24-NEXT:    movl 8(%ebp), %esi
; FALLBACK24-NEXT:    movl %ebp, %edi
; FALLBACK24-NEXT:    movl %esi, %ebp
; FALLBACK24-NEXT:    shrl %ebp
; FALLBACK24-NEXT:    movb %ch, %cl
; FALLBACK24-NEXT:    shrl %cl, %ebp
; FALLBACK24-NEXT:    orl %ebx, %ebp
; FALLBACK24-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK24-NEXT:    movb %al, %cl
; FALLBACK24-NEXT:    shll %cl, %esi
; FALLBACK24-NEXT:    shrl %edx
; FALLBACK24-NEXT:    movb %ch, %cl
; FALLBACK24-NEXT:    shrl %cl, %edx
; FALLBACK24-NEXT:    orl %esi, %edx
; FALLBACK24-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK24-NEXT:    movl %edi, %ebp
; FALLBACK24-NEXT:    movl 20(%edi), %ebx
; FALLBACK24-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK24-NEXT:    movb %al, %cl
; FALLBACK24-NEXT:    shll %cl, %ebx
; FALLBACK24-NEXT:    movl 16(%edi), %esi
; FALLBACK24-NEXT:    movl %esi, %edx
; FALLBACK24-NEXT:    shrl %edx
; FALLBACK24-NEXT:    movb %ch, %cl
; FALLBACK24-NEXT:    shrl %cl, %edx
; FALLBACK24-NEXT:    orl %ebx, %edx
; FALLBACK24-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK24-NEXT:    movb %al, %cl
; FALLBACK24-NEXT:    shll %cl, %esi
; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
; FALLBACK24-NEXT:    shrl %edi
; FALLBACK24-NEXT:    movb %ch, %cl
; FALLBACK24-NEXT:    shrl %cl, %edi
; FALLBACK24-NEXT:    orl %esi, %edi
; FALLBACK24-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK24-NEXT:    movl %ebp, %edx
; FALLBACK24-NEXT:    movl 28(%ebp), %ebx
; FALLBACK24-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK24-NEXT:    movb %al, %cl
; FALLBACK24-NEXT:    shll %cl, %ebx
; FALLBACK24-NEXT:    movl 24(%ebp), %esi
; FALLBACK24-NEXT:    movl %esi, %edi
; FALLBACK24-NEXT:    shrl %edi
; FALLBACK24-NEXT:    movb %ch, %cl
; FALLBACK24-NEXT:    shrl %cl, %edi
; FALLBACK24-NEXT:    orl %ebx, %edi
; FALLBACK24-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK24-NEXT:    movb %al, %cl
; FALLBACK24-NEXT:    shll %cl, %esi
; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
; FALLBACK24-NEXT:    shrl %ebp
; FALLBACK24-NEXT:    movb %ch, %cl
; FALLBACK24-NEXT:    shrl %cl, %ebp
; FALLBACK24-NEXT:    orl %esi, %ebp
; FALLBACK24-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK24-NEXT:    movl 36(%edx), %ebx
; FALLBACK24-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK24-NEXT:    movb %al, %cl
; FALLBACK24-NEXT:    shll %cl, %ebx
; FALLBACK24-NEXT:    movl 32(%edx), %esi
; FALLBACK24-NEXT:    movl %edx, %ebp
; FALLBACK24-NEXT:    movl %esi, %edi
; FALLBACK24-NEXT:    shrl %edi
; FALLBACK24-NEXT:    movb %ch, %cl
; FALLBACK24-NEXT:    shrl %cl, %edi
; FALLBACK24-NEXT:    orl %ebx, %edi
; FALLBACK24-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK24-NEXT:    movb %al, %cl
; FALLBACK24-NEXT:    shll %cl, %esi
; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; FALLBACK24-NEXT:    shrl %edx
; FALLBACK24-NEXT:    movb %ch, %cl
; FALLBACK24-NEXT:    shrl %cl, %edx
; FALLBACK24-NEXT:    orl %esi, %edx
; FALLBACK24-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK24-NEXT:    movl 44(%ebp), %ebx
; FALLBACK24-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK24-NEXT:    movb %al, %cl
; FALLBACK24-NEXT:    shll %cl, %ebx
; FALLBACK24-NEXT:    movl 40(%ebp), %esi
; FALLBACK24-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK24-NEXT:    movl %esi, %edx
; FALLBACK24-NEXT:    shrl %edx
; FALLBACK24-NEXT:    movb %ch, %cl
; FALLBACK24-NEXT:    shrl %cl, %edx
; FALLBACK24-NEXT:    orl %ebx, %edx
; FALLBACK24-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK24-NEXT:    movb %al, %cl
; FALLBACK24-NEXT:    shll %cl, %esi
; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; FALLBACK24-NEXT:    shrl %edx
; FALLBACK24-NEXT:    movb %ch, %cl
; FALLBACK24-NEXT:    shrl %cl, %edx
; FALLBACK24-NEXT:    orl %esi, %edx
; FALLBACK24-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK24-NEXT:    movl 52(%ebp), %esi
; FALLBACK24-NEXT:    movl %esi, %edi
; FALLBACK24-NEXT:    movb %al, %cl
; FALLBACK24-NEXT:    shll %cl, %edi
; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; FALLBACK24-NEXT:    negl %edx
; FALLBACK24-NEXT:    movl 176(%esp,%edx), %ebx
; FALLBACK24-NEXT:    movl %ebx, %ebp
; FALLBACK24-NEXT:    shrl %ebp
; FALLBACK24-NEXT:    movb %ch, %cl
; FALLBACK24-NEXT:    shrl %cl, %ebp
; FALLBACK24-NEXT:    orl %edi, %ebp
; FALLBACK24-NEXT:    movb %al, %cl
; FALLBACK24-NEXT:    shll %cl, %ebx
; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; FALLBACK24-NEXT:    shrl %edx
; FALLBACK24-NEXT:    movb %ch, %cl
; FALLBACK24-NEXT:    shrl %cl, %edx
; FALLBACK24-NEXT:    orl %ebx, %edx
; FALLBACK24-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
; FALLBACK24-NEXT:    movl 60(%edi), %edx
; FALLBACK24-NEXT:    movb %al, %cl
; FALLBACK24-NEXT:    shll %cl, %edx
; FALLBACK24-NEXT:    movl 56(%edi), %ebx
; FALLBACK24-NEXT:    movl %ebx, %edi
; FALLBACK24-NEXT:    shrl %edi
; FALLBACK24-NEXT:    movb %ch, %cl
; FALLBACK24-NEXT:    shrl %cl, %edi
; FALLBACK24-NEXT:    orl %edx, %edi
; FALLBACK24-NEXT:    movb %al, %cl
; FALLBACK24-NEXT:    shll %cl, %ebx
; FALLBACK24-NEXT:    shrl %esi
; FALLBACK24-NEXT:    movb %ch, %cl
; FALLBACK24-NEXT:    shrl %cl, %esi
; FALLBACK24-NEXT:    orl %ebx, %esi
; FALLBACK24-NEXT:    movl %eax, %ecx
; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; FALLBACK24-NEXT:    shll %cl, %edx
; FALLBACK24-NEXT:    movl {{[0-9]+}}(%esp), %eax
; FALLBACK24-NEXT:    movl %edx, (%eax)
; FALLBACK24-NEXT:    movl %esi, 56(%eax)
; FALLBACK24-NEXT:    movl %edi, 60(%eax)
; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK24-NEXT:    movl %ecx, 48(%eax)
; FALLBACK24-NEXT:    movl %ebp, 52(%eax)
; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK24-NEXT:    movl %ecx, 40(%eax)
; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK24-NEXT:    movl %ecx, 44(%eax)
; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK24-NEXT:    movl %ecx, 32(%eax)
; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK24-NEXT:    movl %ecx, 36(%eax)
; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK24-NEXT:    movl %ecx, 24(%eax)
; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK24-NEXT:    movl %ecx, 28(%eax)
; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK24-NEXT:    movl %ecx, 16(%eax)
; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK24-NEXT:    movl %ecx, 20(%eax)
; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK24-NEXT:    movl %ecx, 8(%eax)
; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK24-NEXT:    movl %ecx, 12(%eax)
; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK24-NEXT:    movl %ecx, 4(%eax)
; FALLBACK24-NEXT:    addl $204, %esp
; FALLBACK24-NEXT:    popl %esi
; FALLBACK24-NEXT:    popl %edi
; FALLBACK24-NEXT:    popl %ebx
; FALLBACK24-NEXT:    popl %ebp
; FALLBACK24-NEXT:    vzeroupper
; FALLBACK24-NEXT:    retl
;
; FALLBACK25-LABEL: shl_64bytes:
; FALLBACK25:       # %bb.0:
; FALLBACK25-NEXT:    pushl %ebp
; FALLBACK25-NEXT:    pushl %ebx
; FALLBACK25-NEXT:    pushl %edi
; FALLBACK25-NEXT:    pushl %esi
; FALLBACK25-NEXT:    subl $188, %esp
; FALLBACK25-NEXT:    movl {{[0-9]+}}(%esp), %eax
; FALLBACK25-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; FALLBACK25-NEXT:    vmovups (%ecx), %ymm0
; FALLBACK25-NEXT:    vmovups 32(%ecx), %ymm1
; FALLBACK25-NEXT:    movl (%eax), %ecx
; FALLBACK25-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; FALLBACK25-NEXT:    vmovups %ymm2, {{[0-9]+}}(%esp)
; FALLBACK25-NEXT:    vmovups %ymm2, {{[0-9]+}}(%esp)
; FALLBACK25-NEXT:    vmovups %ymm1, {{[0-9]+}}(%esp)
; FALLBACK25-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
; FALLBACK25-NEXT:    movl %ecx, %ebp
; FALLBACK25-NEXT:    andl $60, %ebp
; FALLBACK25-NEXT:    leal {{[0-9]+}}(%esp), %eax
; FALLBACK25-NEXT:    subl %ebp, %eax
; FALLBACK25-NEXT:    movl 8(%eax), %esi
; FALLBACK25-NEXT:    movl 12(%eax), %edx
; FALLBACK25-NEXT:    shll $3, %ecx
; FALLBACK25-NEXT:    andl $24, %ecx
; FALLBACK25-NEXT:    movl %edx, %edi
; FALLBACK25-NEXT:    shldl %cl, %esi, %edi
; FALLBACK25-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK25-NEXT:    movl 4(%eax), %edi
; FALLBACK25-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK25-NEXT:    shldl %cl, %edi, %esi
; FALLBACK25-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK25-NEXT:    movl 16(%eax), %edi
; FALLBACK25-NEXT:    movl 20(%eax), %esi
; FALLBACK25-NEXT:    movl %esi, %ebx
; FALLBACK25-NEXT:    shldl %cl, %edi, %ebx
; FALLBACK25-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK25-NEXT:    shldl %cl, %edx, %edi
; FALLBACK25-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK25-NEXT:    movl 24(%eax), %edi
; FALLBACK25-NEXT:    movl 28(%eax), %edx
; FALLBACK25-NEXT:    movl %edx, %ebx
; FALLBACK25-NEXT:    shldl %cl, %edi, %ebx
; FALLBACK25-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK25-NEXT:    shldl %cl, %esi, %edi
; FALLBACK25-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK25-NEXT:    movl 32(%eax), %edi
; FALLBACK25-NEXT:    movl 36(%eax), %esi
; FALLBACK25-NEXT:    movl %esi, %ebx
; FALLBACK25-NEXT:    shldl %cl, %edi, %ebx
; FALLBACK25-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK25-NEXT:    shldl %cl, %edx, %edi
; FALLBACK25-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK25-NEXT:    movl 40(%eax), %edx
; FALLBACK25-NEXT:    movl 44(%eax), %edi
; FALLBACK25-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK25-NEXT:    shldl %cl, %edx, %edi
; FALLBACK25-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK25-NEXT:    shldl %cl, %esi, %edx
; FALLBACK25-NEXT:    movl %edx, (%esp) # 4-byte Spill
; FALLBACK25-NEXT:    movl 56(%eax), %edx
; FALLBACK25-NEXT:    movl 60(%eax), %edi
; FALLBACK25-NEXT:    shldl %cl, %edx, %edi
; FALLBACK25-NEXT:    movl (%eax), %ebx
; FALLBACK25-NEXT:    movl 52(%eax), %esi
; FALLBACK25-NEXT:    shldl %cl, %esi, %edx
; FALLBACK25-NEXT:    negl %ebp
; FALLBACK25-NEXT:    movl 160(%esp,%ebp), %eax
; FALLBACK25-NEXT:    movl {{[0-9]+}}(%esp), %ebp
; FALLBACK25-NEXT:    movl %edx, 56(%ebp)
; FALLBACK25-NEXT:    movl %edi, 60(%ebp)
; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; FALLBACK25-NEXT:    shldl %cl, %ebx, %edx
; FALLBACK25-NEXT:    shll %cl, %ebx
; FALLBACK25-NEXT:    shldl %cl, %eax, %esi
; FALLBACK25-NEXT:    # kill: def $cl killed $cl killed $ecx
; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
; FALLBACK25-NEXT:    shldl %cl, %edi, %eax
; FALLBACK25-NEXT:    movl %eax, 48(%ebp)
; FALLBACK25-NEXT:    movl %esi, 52(%ebp)
; FALLBACK25-NEXT:    movl (%esp), %eax # 4-byte Reload
; FALLBACK25-NEXT:    movl %eax, 40(%ebp)
; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK25-NEXT:    movl %eax, 44(%ebp)
; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK25-NEXT:    movl %eax, 32(%ebp)
; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK25-NEXT:    movl %eax, 36(%ebp)
; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK25-NEXT:    movl %eax, 24(%ebp)
; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK25-NEXT:    movl %eax, 28(%ebp)
; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK25-NEXT:    movl %eax, 16(%ebp)
; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK25-NEXT:    movl %eax, 20(%ebp)
; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK25-NEXT:    movl %eax, 8(%ebp)
; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK25-NEXT:    movl %eax, 12(%ebp)
; FALLBACK25-NEXT:    movl %ebx, (%ebp)
; FALLBACK25-NEXT:    movl %edx, 4(%ebp)
; FALLBACK25-NEXT:    addl $188, %esp
; FALLBACK25-NEXT:    popl %esi
; FALLBACK25-NEXT:    popl %edi
; FALLBACK25-NEXT:    popl %ebx
; FALLBACK25-NEXT:    popl %ebp
; FALLBACK25-NEXT:    vzeroupper
; FALLBACK25-NEXT:    retl
;
; FALLBACK26-LABEL: shl_64bytes:
; FALLBACK26:       # %bb.0:
; FALLBACK26-NEXT:    pushl %ebp
; FALLBACK26-NEXT:    pushl %ebx
; FALLBACK26-NEXT:    pushl %edi
; FALLBACK26-NEXT:    pushl %esi
; FALLBACK26-NEXT:    subl $204, %esp
; FALLBACK26-NEXT:    movl {{[0-9]+}}(%esp), %eax
; FALLBACK26-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; FALLBACK26-NEXT:    vmovups (%ecx), %ymm0
; FALLBACK26-NEXT:    vmovups 32(%ecx), %ymm1
; FALLBACK26-NEXT:    movl (%eax), %eax
; FALLBACK26-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; FALLBACK26-NEXT:    vmovups %ymm2, {{[0-9]+}}(%esp)
; FALLBACK26-NEXT:    vmovups %ymm2, {{[0-9]+}}(%esp)
; FALLBACK26-NEXT:    vmovups %ymm1, {{[0-9]+}}(%esp)
; FALLBACK26-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
; FALLBACK26-NEXT:    leal (,%eax,8), %edx
; FALLBACK26-NEXT:    andl $24, %edx
; FALLBACK26-NEXT:    andl $60, %eax
; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK26-NEXT:    leal {{[0-9]+}}(%esp), %edi
; FALLBACK26-NEXT:    subl %eax, %edi
; FALLBACK26-NEXT:    movl (%edi), %ecx
; FALLBACK26-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK26-NEXT:    movl 4(%edi), %eax
; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK26-NEXT:    movl %edx, %ebx
; FALLBACK26-NEXT:    notb %bl
; FALLBACK26-NEXT:    shrl %ecx
; FALLBACK26-NEXT:    shrxl %ebx, %ecx, %esi
; FALLBACK26-NEXT:    shlxl %edx, %eax, %ecx
; FALLBACK26-NEXT:    orl %ecx, %esi
; FALLBACK26-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK26-NEXT:    movl 8(%edi), %esi
; FALLBACK26-NEXT:    movl %esi, %ecx
; FALLBACK26-NEXT:    shrl %ecx
; FALLBACK26-NEXT:    shrxl %ebx, %ecx, %eax
; FALLBACK26-NEXT:    movl 12(%edi), %ecx
; FALLBACK26-NEXT:    shlxl %edx, %ecx, %ebp
; FALLBACK26-NEXT:    orl %ebp, %eax
; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK26-NEXT:    shlxl %edx, %esi, %esi
; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK26-NEXT:    shrl %eax
; FALLBACK26-NEXT:    shrxl %ebx, %eax, %eax
; FALLBACK26-NEXT:    orl %esi, %eax
; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK26-NEXT:    movl 16(%edi), %eax
; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK26-NEXT:    shrl %eax
; FALLBACK26-NEXT:    shrxl %ebx, %eax, %eax
; FALLBACK26-NEXT:    movl 20(%edi), %esi
; FALLBACK26-NEXT:    shlxl %edx, %esi, %ebp
; FALLBACK26-NEXT:    orl %ebp, %eax
; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK26-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; FALLBACK26-NEXT:    shrl %ecx
; FALLBACK26-NEXT:    shrxl %ebx, %ecx, %ecx
; FALLBACK26-NEXT:    orl %eax, %ecx
; FALLBACK26-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK26-NEXT:    movl 24(%edi), %ecx
; FALLBACK26-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK26-NEXT:    shrl %ecx
; FALLBACK26-NEXT:    shrxl %ebx, %ecx, %eax
; FALLBACK26-NEXT:    movl 28(%edi), %ecx
; FALLBACK26-NEXT:    shlxl %edx, %ecx, %ebp
; FALLBACK26-NEXT:    orl %ebp, %eax
; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK26-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; FALLBACK26-NEXT:    shrl %esi
; FALLBACK26-NEXT:    shrxl %ebx, %esi, %esi
; FALLBACK26-NEXT:    orl %eax, %esi
; FALLBACK26-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK26-NEXT:    movl 32(%edi), %eax
; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK26-NEXT:    shrl %eax
; FALLBACK26-NEXT:    shrxl %ebx, %eax, %eax
; FALLBACK26-NEXT:    movl 36(%edi), %esi
; FALLBACK26-NEXT:    shlxl %edx, %esi, %ebp
; FALLBACK26-NEXT:    orl %ebp, %eax
; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK26-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; FALLBACK26-NEXT:    shrl %ecx
; FALLBACK26-NEXT:    shrxl %ebx, %ecx, %ecx
; FALLBACK26-NEXT:    orl %eax, %ecx
; FALLBACK26-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK26-NEXT:    movl 40(%edi), %ecx
; FALLBACK26-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK26-NEXT:    shrl %ecx
; FALLBACK26-NEXT:    shrxl %ebx, %ecx, %eax
; FALLBACK26-NEXT:    movl 44(%edi), %ecx
; FALLBACK26-NEXT:    shlxl %edx, %ecx, %ebp
; FALLBACK26-NEXT:    orl %ebp, %eax
; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK26-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; FALLBACK26-NEXT:    shrl %esi
; FALLBACK26-NEXT:    shrxl %ebx, %esi, %esi
; FALLBACK26-NEXT:    orl %eax, %esi
; FALLBACK26-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK26-NEXT:    movl 48(%edi), %esi
; FALLBACK26-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK26-NEXT:    shrl %esi
; FALLBACK26-NEXT:    shrxl %ebx, %esi, %eax
; FALLBACK26-NEXT:    movl 52(%edi), %esi
; FALLBACK26-NEXT:    shlxl %edx, %esi, %ebp
; FALLBACK26-NEXT:    orl %ebp, %eax
; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK26-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; FALLBACK26-NEXT:    shrl %ecx
; FALLBACK26-NEXT:    shrxl %ebx, %ecx, %ebp
; FALLBACK26-NEXT:    orl %eax, %ebp
; FALLBACK26-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK26-NEXT:    negl %eax
; FALLBACK26-NEXT:    shlxl %edx, 188(%esp,%eax), %ecx
; FALLBACK26-NEXT:    movl 56(%edi), %eax
; FALLBACK26-NEXT:    shlxl %edx, %eax, %edx
; FALLBACK26-NEXT:    shrl %esi
; FALLBACK26-NEXT:    shrxl %ebx, %esi, %esi
; FALLBACK26-NEXT:    orl %edx, %esi
; FALLBACK26-NEXT:    shrl %eax
; FALLBACK26-NEXT:    shrxl %ebx, %eax, %eax
; FALLBACK26-NEXT:    orl %eax, %ecx
; FALLBACK26-NEXT:    movl {{[0-9]+}}(%esp), %eax
; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; FALLBACK26-NEXT:    movl %edx, (%eax)
; FALLBACK26-NEXT:    movl %esi, 56(%eax)
; FALLBACK26-NEXT:    movl %ecx, 60(%eax)
; FALLBACK26-NEXT:    movl %ebp, 48(%eax)
; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK26-NEXT:    movl %ecx, 52(%eax)
; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK26-NEXT:    movl %ecx, 40(%eax)
; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK26-NEXT:    movl %ecx, 44(%eax)
; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK26-NEXT:    movl %ecx, 32(%eax)
; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK26-NEXT:    movl %ecx, 36(%eax)
; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK26-NEXT:    movl %ecx, 24(%eax)
; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK26-NEXT:    movl %ecx, 28(%eax)
; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK26-NEXT:    movl %ecx, 16(%eax)
; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK26-NEXT:    movl %ecx, 20(%eax)
; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK26-NEXT:    movl %ecx, 8(%eax)
; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK26-NEXT:    movl %ecx, 12(%eax)
; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK26-NEXT:    movl %ecx, 4(%eax)
; FALLBACK26-NEXT:    addl $204, %esp
; FALLBACK26-NEXT:    popl %esi
; FALLBACK26-NEXT:    popl %edi
; FALLBACK26-NEXT:    popl %ebx
; FALLBACK26-NEXT:    popl %ebp
; FALLBACK26-NEXT:    vzeroupper
; FALLBACK26-NEXT:    retl
;
; FALLBACK27-LABEL: shl_64bytes:
; FALLBACK27:       # %bb.0:
; FALLBACK27-NEXT:    pushl %ebp
; FALLBACK27-NEXT:    pushl %ebx
; FALLBACK27-NEXT:    pushl %edi
; FALLBACK27-NEXT:    pushl %esi
; FALLBACK27-NEXT:    subl $204, %esp
; FALLBACK27-NEXT:    movl {{[0-9]+}}(%esp), %eax
; FALLBACK27-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; FALLBACK27-NEXT:    vmovups (%ecx), %ymm0
; FALLBACK27-NEXT:    vmovups 32(%ecx), %ymm1
; FALLBACK27-NEXT:    movl (%eax), %ebx
; FALLBACK27-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; FALLBACK27-NEXT:    vmovups %ymm2, {{[0-9]+}}(%esp)
; FALLBACK27-NEXT:    vmovups %ymm2, {{[0-9]+}}(%esp)
; FALLBACK27-NEXT:    vmovups %ymm1, {{[0-9]+}}(%esp)
; FALLBACK27-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
; FALLBACK27-NEXT:    leal (,%ebx,8), %ecx
; FALLBACK27-NEXT:    andl $24, %ecx
; FALLBACK27-NEXT:    andl $60, %ebx
; FALLBACK27-NEXT:    leal {{[0-9]+}}(%esp), %eax
; FALLBACK27-NEXT:    subl %ebx, %eax
; FALLBACK27-NEXT:    movl 4(%eax), %esi
; FALLBACK27-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK27-NEXT:    movl 8(%eax), %edi
; FALLBACK27-NEXT:    movl 12(%eax), %edx
; FALLBACK27-NEXT:    movl %edx, %ebp
; FALLBACK27-NEXT:    shldl %cl, %edi, %ebp
; FALLBACK27-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK27-NEXT:    shldl %cl, %esi, %edi
; FALLBACK27-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK27-NEXT:    movl 16(%eax), %edi
; FALLBACK27-NEXT:    movl 20(%eax), %esi
; FALLBACK27-NEXT:    movl %esi, %ebp
; FALLBACK27-NEXT:    shldl %cl, %edi, %ebp
; FALLBACK27-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK27-NEXT:    shldl %cl, %edx, %edi
; FALLBACK27-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK27-NEXT:    movl 24(%eax), %edi
; FALLBACK27-NEXT:    movl 28(%eax), %edx
; FALLBACK27-NEXT:    movl %edx, %ebp
; FALLBACK27-NEXT:    shldl %cl, %edi, %ebp
; FALLBACK27-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK27-NEXT:    shldl %cl, %esi, %edi
; FALLBACK27-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK27-NEXT:    movl 32(%eax), %edi
; FALLBACK27-NEXT:    movl 36(%eax), %esi
; FALLBACK27-NEXT:    movl %esi, %ebp
; FALLBACK27-NEXT:    shldl %cl, %edi, %ebp
; FALLBACK27-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK27-NEXT:    shldl %cl, %edx, %edi
; FALLBACK27-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK27-NEXT:    movl 40(%eax), %ebp
; FALLBACK27-NEXT:    movl 44(%eax), %edx
; FALLBACK27-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK27-NEXT:    shldl %cl, %ebp, %edx
; FALLBACK27-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK27-NEXT:    shldl %cl, %esi, %ebp
; FALLBACK27-NEXT:    movl 56(%eax), %edx
; FALLBACK27-NEXT:    movl 60(%eax), %edi
; FALLBACK27-NEXT:    shldl %cl, %edx, %edi
; FALLBACK27-NEXT:    movl (%eax), %esi
; FALLBACK27-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK27-NEXT:    movl 52(%eax), %esi
; FALLBACK27-NEXT:    shldl %cl, %esi, %edx
; FALLBACK27-NEXT:    negl %ebx
; FALLBACK27-NEXT:    movl 176(%esp,%ebx), %ebx
; FALLBACK27-NEXT:    movl {{[0-9]+}}(%esp), %eax
; FALLBACK27-NEXT:    movl %edx, 56(%eax)
; FALLBACK27-NEXT:    movl %edi, 60(%eax)
; FALLBACK27-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; FALLBACK27-NEXT:    shlxl %ecx, %edx, %edi
; FALLBACK27-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK27-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
; FALLBACK27-NEXT:    shldl %cl, %edx, %edi
; FALLBACK27-NEXT:    shldl %cl, %ebx, %esi
; FALLBACK27-NEXT:    # kill: def $cl killed $cl killed $ecx
; FALLBACK27-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; FALLBACK27-NEXT:    shldl %cl, %edx, %ebx
; FALLBACK27-NEXT:    movl %ebx, 48(%eax)
; FALLBACK27-NEXT:    movl %esi, 52(%eax)
; FALLBACK27-NEXT:    movl %ebp, 40(%eax)
; FALLBACK27-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK27-NEXT:    movl %ecx, 44(%eax)
; FALLBACK27-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK27-NEXT:    movl %ecx, 32(%eax)
; FALLBACK27-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK27-NEXT:    movl %ecx, 36(%eax)
; FALLBACK27-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK27-NEXT:    movl %ecx, 24(%eax)
; FALLBACK27-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK27-NEXT:    movl %ecx, 28(%eax)
; FALLBACK27-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK27-NEXT:    movl %ecx, 16(%eax)
; FALLBACK27-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK27-NEXT:    movl %ecx, 20(%eax)
; FALLBACK27-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK27-NEXT:    movl %ecx, 8(%eax)
; FALLBACK27-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK27-NEXT:    movl %ecx, 12(%eax)
; FALLBACK27-NEXT:    movl %edi, 4(%eax)
; FALLBACK27-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK27-NEXT:    movl %ecx, (%eax)
; FALLBACK27-NEXT:    addl $204, %esp
; FALLBACK27-NEXT:    popl %esi
; FALLBACK27-NEXT:    popl %edi
; FALLBACK27-NEXT:    popl %ebx
; FALLBACK27-NEXT:    popl %ebp
; FALLBACK27-NEXT:    vzeroupper
; FALLBACK27-NEXT:    retl
;
; FALLBACK28-LABEL: shl_64bytes:
; FALLBACK28:       # %bb.0:
; FALLBACK28-NEXT:    pushl %ebp
; FALLBACK28-NEXT:    pushl %ebx
; FALLBACK28-NEXT:    pushl %edi
; FALLBACK28-NEXT:    pushl %esi
; FALLBACK28-NEXT:    subl $204, %esp
; FALLBACK28-NEXT:    movl {{[0-9]+}}(%esp), %eax
; FALLBACK28-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; FALLBACK28-NEXT:    vmovups (%ecx), %zmm0
; FALLBACK28-NEXT:    movl (%eax), %eax
; FALLBACK28-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; FALLBACK28-NEXT:    vmovups %zmm1, {{[0-9]+}}(%esp)
; FALLBACK28-NEXT:    vmovups %zmm0, {{[0-9]+}}(%esp)
; FALLBACK28-NEXT:    movl %eax, %edx
; FALLBACK28-NEXT:    andl $60, %edx
; FALLBACK28-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK28-NEXT:    leal {{[0-9]+}}(%esp), %ecx
; FALLBACK28-NEXT:    subl %edx, %ecx
; FALLBACK28-NEXT:    movl (%ecx), %edi
; FALLBACK28-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK28-NEXT:    movl 4(%ecx), %edx
; FALLBACK28-NEXT:    movl %ecx, %ebp
; FALLBACK28-NEXT:    shll $3, %eax
; FALLBACK28-NEXT:    andl $24, %eax
; FALLBACK28-NEXT:    movl %edx, %esi
; FALLBACK28-NEXT:    movl %eax, %ecx
; FALLBACK28-NEXT:    shll %cl, %esi
; FALLBACK28-NEXT:    shrl %edi
; FALLBACK28-NEXT:    movb %al, %ch
; FALLBACK28-NEXT:    notb %ch
; FALLBACK28-NEXT:    movb %ch, %cl
; FALLBACK28-NEXT:    shrl %cl, %edi
; FALLBACK28-NEXT:    orl %esi, %edi
; FALLBACK28-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK28-NEXT:    movl 12(%ebp), %ebx
; FALLBACK28-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK28-NEXT:    movb %al, %cl
; FALLBACK28-NEXT:    shll %cl, %ebx
; FALLBACK28-NEXT:    movl 8(%ebp), %esi
; FALLBACK28-NEXT:    movl %ebp, %edi
; FALLBACK28-NEXT:    movl %esi, %ebp
; FALLBACK28-NEXT:    shrl %ebp
; FALLBACK28-NEXT:    movb %ch, %cl
; FALLBACK28-NEXT:    shrl %cl, %ebp
; FALLBACK28-NEXT:    orl %ebx, %ebp
; FALLBACK28-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK28-NEXT:    movb %al, %cl
; FALLBACK28-NEXT:    shll %cl, %esi
; FALLBACK28-NEXT:    shrl %edx
; FALLBACK28-NEXT:    movb %ch, %cl
; FALLBACK28-NEXT:    shrl %cl, %edx
; FALLBACK28-NEXT:    orl %esi, %edx
; FALLBACK28-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK28-NEXT:    movl %edi, %ebp
; FALLBACK28-NEXT:    movl 20(%edi), %ebx
; FALLBACK28-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK28-NEXT:    movb %al, %cl
; FALLBACK28-NEXT:    shll %cl, %ebx
; FALLBACK28-NEXT:    movl 16(%edi), %esi
; FALLBACK28-NEXT:    movl %esi, %edx
; FALLBACK28-NEXT:    shrl %edx
; FALLBACK28-NEXT:    movb %ch, %cl
; FALLBACK28-NEXT:    shrl %cl, %edx
; FALLBACK28-NEXT:    orl %ebx, %edx
; FALLBACK28-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK28-NEXT:    movb %al, %cl
; FALLBACK28-NEXT:    shll %cl, %esi
; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
; FALLBACK28-NEXT:    shrl %edi
; FALLBACK28-NEXT:    movb %ch, %cl
; FALLBACK28-NEXT:    shrl %cl, %edi
; FALLBACK28-NEXT:    orl %esi, %edi
; FALLBACK28-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK28-NEXT:    movl %ebp, %edx
; FALLBACK28-NEXT:    movl 28(%ebp), %ebx
; FALLBACK28-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK28-NEXT:    movb %al, %cl
; FALLBACK28-NEXT:    shll %cl, %ebx
; FALLBACK28-NEXT:    movl 24(%ebp), %esi
; FALLBACK28-NEXT:    movl %esi, %edi
; FALLBACK28-NEXT:    shrl %edi
; FALLBACK28-NEXT:    movb %ch, %cl
; FALLBACK28-NEXT:    shrl %cl, %edi
; FALLBACK28-NEXT:    orl %ebx, %edi
; FALLBACK28-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK28-NEXT:    movb %al, %cl
; FALLBACK28-NEXT:    shll %cl, %esi
; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
; FALLBACK28-NEXT:    shrl %ebp
; FALLBACK28-NEXT:    movb %ch, %cl
; FALLBACK28-NEXT:    shrl %cl, %ebp
; FALLBACK28-NEXT:    orl %esi, %ebp
; FALLBACK28-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK28-NEXT:    movl 36(%edx), %ebx
; FALLBACK28-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK28-NEXT:    movb %al, %cl
; FALLBACK28-NEXT:    shll %cl, %ebx
; FALLBACK28-NEXT:    movl 32(%edx), %esi
; FALLBACK28-NEXT:    movl %edx, %ebp
; FALLBACK28-NEXT:    movl %esi, %edi
; FALLBACK28-NEXT:    shrl %edi
; FALLBACK28-NEXT:    movb %ch, %cl
; FALLBACK28-NEXT:    shrl %cl, %edi
; FALLBACK28-NEXT:    orl %ebx, %edi
; FALLBACK28-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK28-NEXT:    movb %al, %cl
; FALLBACK28-NEXT:    shll %cl, %esi
; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; FALLBACK28-NEXT:    shrl %edx
; FALLBACK28-NEXT:    movb %ch, %cl
; FALLBACK28-NEXT:    shrl %cl, %edx
; FALLBACK28-NEXT:    orl %esi, %edx
; FALLBACK28-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK28-NEXT:    movl 44(%ebp), %ebx
; FALLBACK28-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK28-NEXT:    movb %al, %cl
; FALLBACK28-NEXT:    shll %cl, %ebx
; FALLBACK28-NEXT:    movl 40(%ebp), %esi
; FALLBACK28-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK28-NEXT:    movl %esi, %edx
; FALLBACK28-NEXT:    shrl %edx
; FALLBACK28-NEXT:    movb %ch, %cl
; FALLBACK28-NEXT:    shrl %cl, %edx
; FALLBACK28-NEXT:    orl %ebx, %edx
; FALLBACK28-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK28-NEXT:    movb %al, %cl
; FALLBACK28-NEXT:    shll %cl, %esi
; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; FALLBACK28-NEXT:    shrl %edx
; FALLBACK28-NEXT:    movb %ch, %cl
; FALLBACK28-NEXT:    shrl %cl, %edx
; FALLBACK28-NEXT:    orl %esi, %edx
; FALLBACK28-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK28-NEXT:    movl 52(%ebp), %esi
; FALLBACK28-NEXT:    movl %esi, %edi
; FALLBACK28-NEXT:    movb %al, %cl
; FALLBACK28-NEXT:    shll %cl, %edi
; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; FALLBACK28-NEXT:    negl %edx
; FALLBACK28-NEXT:    movl 176(%esp,%edx), %ebx
; FALLBACK28-NEXT:    movl %ebx, %ebp
; FALLBACK28-NEXT:    shrl %ebp
; FALLBACK28-NEXT:    movb %ch, %cl
; FALLBACK28-NEXT:    shrl %cl, %ebp
; FALLBACK28-NEXT:    orl %edi, %ebp
; FALLBACK28-NEXT:    movb %al, %cl
; FALLBACK28-NEXT:    shll %cl, %ebx
; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; FALLBACK28-NEXT:    shrl %edx
; FALLBACK28-NEXT:    movb %ch, %cl
; FALLBACK28-NEXT:    shrl %cl, %edx
; FALLBACK28-NEXT:    orl %ebx, %edx
; FALLBACK28-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
; FALLBACK28-NEXT:    movl 60(%edi), %edx
; FALLBACK28-NEXT:    movb %al, %cl
; FALLBACK28-NEXT:    shll %cl, %edx
; FALLBACK28-NEXT:    movl 56(%edi), %ebx
; FALLBACK28-NEXT:    movl %ebx, %edi
; FALLBACK28-NEXT:    shrl %edi
; FALLBACK28-NEXT:    movb %ch, %cl
; FALLBACK28-NEXT:    shrl %cl, %edi
; FALLBACK28-NEXT:    orl %edx, %edi
; FALLBACK28-NEXT:    movb %al, %cl
; FALLBACK28-NEXT:    shll %cl, %ebx
; FALLBACK28-NEXT:    shrl %esi
; FALLBACK28-NEXT:    movb %ch, %cl
; FALLBACK28-NEXT:    shrl %cl, %esi
; FALLBACK28-NEXT:    orl %ebx, %esi
; FALLBACK28-NEXT:    movl %eax, %ecx
; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; FALLBACK28-NEXT:    shll %cl, %edx
; FALLBACK28-NEXT:    movl {{[0-9]+}}(%esp), %eax
; FALLBACK28-NEXT:    movl %edx, (%eax)
; FALLBACK28-NEXT:    movl %esi, 56(%eax)
; FALLBACK28-NEXT:    movl %edi, 60(%eax)
; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK28-NEXT:    movl %ecx, 48(%eax)
; FALLBACK28-NEXT:    movl %ebp, 52(%eax)
; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK28-NEXT:    movl %ecx, 40(%eax)
; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK28-NEXT:    movl %ecx, 44(%eax)
; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK28-NEXT:    movl %ecx, 32(%eax)
; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK28-NEXT:    movl %ecx, 36(%eax)
; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK28-NEXT:    movl %ecx, 24(%eax)
; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK28-NEXT:    movl %ecx, 28(%eax)
; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK28-NEXT:    movl %ecx, 16(%eax)
; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK28-NEXT:    movl %ecx, 20(%eax)
; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK28-NEXT:    movl %ecx, 8(%eax)
; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK28-NEXT:    movl %ecx, 12(%eax)
; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK28-NEXT:    movl %ecx, 4(%eax)
; FALLBACK28-NEXT:    addl $204, %esp
; FALLBACK28-NEXT:    popl %esi
; FALLBACK28-NEXT:    popl %edi
; FALLBACK28-NEXT:    popl %ebx
; FALLBACK28-NEXT:    popl %ebp
; FALLBACK28-NEXT:    vzeroupper
; FALLBACK28-NEXT:    retl
;
; FALLBACK29-LABEL: shl_64bytes:
; FALLBACK29:       # %bb.0:
; FALLBACK29-NEXT:    pushl %ebp
; FALLBACK29-NEXT:    pushl %ebx
; FALLBACK29-NEXT:    pushl %edi
; FALLBACK29-NEXT:    pushl %esi
; FALLBACK29-NEXT:    subl $188, %esp
; FALLBACK29-NEXT:    movl {{[0-9]+}}(%esp), %eax
; FALLBACK29-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; FALLBACK29-NEXT:    vmovups (%ecx), %zmm0
; FALLBACK29-NEXT:    movl (%eax), %ecx
; FALLBACK29-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; FALLBACK29-NEXT:    vmovups %zmm1, {{[0-9]+}}(%esp)
; FALLBACK29-NEXT:    vmovups %zmm0, {{[0-9]+}}(%esp)
; FALLBACK29-NEXT:    movl %ecx, %ebp
; FALLBACK29-NEXT:    andl $60, %ebp
; FALLBACK29-NEXT:    leal {{[0-9]+}}(%esp), %eax
; FALLBACK29-NEXT:    subl %ebp, %eax
; FALLBACK29-NEXT:    movl 8(%eax), %esi
; FALLBACK29-NEXT:    movl 12(%eax), %edx
; FALLBACK29-NEXT:    shll $3, %ecx
; FALLBACK29-NEXT:    andl $24, %ecx
; FALLBACK29-NEXT:    movl %edx, %edi
; FALLBACK29-NEXT:    shldl %cl, %esi, %edi
; FALLBACK29-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK29-NEXT:    movl 4(%eax), %edi
; FALLBACK29-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK29-NEXT:    shldl %cl, %edi, %esi
; FALLBACK29-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK29-NEXT:    movl 16(%eax), %edi
; FALLBACK29-NEXT:    movl 20(%eax), %esi
; FALLBACK29-NEXT:    movl %esi, %ebx
; FALLBACK29-NEXT:    shldl %cl, %edi, %ebx
; FALLBACK29-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK29-NEXT:    shldl %cl, %edx, %edi
; FALLBACK29-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK29-NEXT:    movl 24(%eax), %edi
; FALLBACK29-NEXT:    movl 28(%eax), %edx
; FALLBACK29-NEXT:    movl %edx, %ebx
; FALLBACK29-NEXT:    shldl %cl, %edi, %ebx
; FALLBACK29-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK29-NEXT:    shldl %cl, %esi, %edi
; FALLBACK29-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK29-NEXT:    movl 32(%eax), %edi
; FALLBACK29-NEXT:    movl 36(%eax), %esi
; FALLBACK29-NEXT:    movl %esi, %ebx
; FALLBACK29-NEXT:    shldl %cl, %edi, %ebx
; FALLBACK29-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK29-NEXT:    shldl %cl, %edx, %edi
; FALLBACK29-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK29-NEXT:    movl 40(%eax), %edx
; FALLBACK29-NEXT:    movl 44(%eax), %edi
; FALLBACK29-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK29-NEXT:    shldl %cl, %edx, %edi
; FALLBACK29-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK29-NEXT:    shldl %cl, %esi, %edx
; FALLBACK29-NEXT:    movl %edx, (%esp) # 4-byte Spill
; FALLBACK29-NEXT:    movl 56(%eax), %edx
; FALLBACK29-NEXT:    movl 60(%eax), %edi
; FALLBACK29-NEXT:    shldl %cl, %edx, %edi
; FALLBACK29-NEXT:    movl (%eax), %ebx
; FALLBACK29-NEXT:    movl 52(%eax), %esi
; FALLBACK29-NEXT:    shldl %cl, %esi, %edx
; FALLBACK29-NEXT:    negl %ebp
; FALLBACK29-NEXT:    movl 160(%esp,%ebp), %eax
; FALLBACK29-NEXT:    movl {{[0-9]+}}(%esp), %ebp
; FALLBACK29-NEXT:    movl %edx, 56(%ebp)
; FALLBACK29-NEXT:    movl %edi, 60(%ebp)
; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; FALLBACK29-NEXT:    shldl %cl, %ebx, %edx
; FALLBACK29-NEXT:    shll %cl, %ebx
; FALLBACK29-NEXT:    shldl %cl, %eax, %esi
; FALLBACK29-NEXT:    # kill: def $cl killed $cl killed $ecx
; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
; FALLBACK29-NEXT:    shldl %cl, %edi, %eax
; FALLBACK29-NEXT:    movl %eax, 48(%ebp)
; FALLBACK29-NEXT:    movl %esi, 52(%ebp)
; FALLBACK29-NEXT:    movl (%esp), %eax # 4-byte Reload
; FALLBACK29-NEXT:    movl %eax, 40(%ebp)
; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK29-NEXT:    movl %eax, 44(%ebp)
; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK29-NEXT:    movl %eax, 32(%ebp)
; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK29-NEXT:    movl %eax, 36(%ebp)
; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK29-NEXT:    movl %eax, 24(%ebp)
; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK29-NEXT:    movl %eax, 28(%ebp)
; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK29-NEXT:    movl %eax, 16(%ebp)
; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK29-NEXT:    movl %eax, 20(%ebp)
; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK29-NEXT:    movl %eax, 8(%ebp)
; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK29-NEXT:    movl %eax, 12(%ebp)
; FALLBACK29-NEXT:    movl %ebx, (%ebp)
; FALLBACK29-NEXT:    movl %edx, 4(%ebp)
; FALLBACK29-NEXT:    addl $188, %esp
; FALLBACK29-NEXT:    popl %esi
; FALLBACK29-NEXT:    popl %edi
; FALLBACK29-NEXT:    popl %ebx
; FALLBACK29-NEXT:    popl %ebp
; FALLBACK29-NEXT:    vzeroupper
; FALLBACK29-NEXT:    retl
;
; FALLBACK30-LABEL: shl_64bytes:
; FALLBACK30:       # %bb.0:
; FALLBACK30-NEXT:    pushl %ebp
; FALLBACK30-NEXT:    pushl %ebx
; FALLBACK30-NEXT:    pushl %edi
; FALLBACK30-NEXT:    pushl %esi
; FALLBACK30-NEXT:    subl $204, %esp
; FALLBACK30-NEXT:    movl {{[0-9]+}}(%esp), %eax
; FALLBACK30-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; FALLBACK30-NEXT:    vmovups (%ecx), %zmm0
; FALLBACK30-NEXT:    movl (%eax), %eax
; FALLBACK30-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; FALLBACK30-NEXT:    vmovups %zmm1, {{[0-9]+}}(%esp)
; FALLBACK30-NEXT:    vmovups %zmm0, {{[0-9]+}}(%esp)
; FALLBACK30-NEXT:    leal (,%eax,8), %edx
; FALLBACK30-NEXT:    andl $24, %edx
; FALLBACK30-NEXT:    andl $60, %eax
; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK30-NEXT:    leal {{[0-9]+}}(%esp), %edi
; FALLBACK30-NEXT:    subl %eax, %edi
; FALLBACK30-NEXT:    movl (%edi), %ecx
; FALLBACK30-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK30-NEXT:    movl 4(%edi), %eax
; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK30-NEXT:    movl %edx, %ebx
; FALLBACK30-NEXT:    notb %bl
; FALLBACK30-NEXT:    shrl %ecx
; FALLBACK30-NEXT:    shrxl %ebx, %ecx, %esi
; FALLBACK30-NEXT:    shlxl %edx, %eax, %ecx
; FALLBACK30-NEXT:    orl %ecx, %esi
; FALLBACK30-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK30-NEXT:    movl 8(%edi), %esi
; FALLBACK30-NEXT:    movl %esi, %ecx
; FALLBACK30-NEXT:    shrl %ecx
; FALLBACK30-NEXT:    shrxl %ebx, %ecx, %eax
; FALLBACK30-NEXT:    movl 12(%edi), %ecx
; FALLBACK30-NEXT:    shlxl %edx, %ecx, %ebp
; FALLBACK30-NEXT:    orl %ebp, %eax
; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK30-NEXT:    shlxl %edx, %esi, %esi
; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK30-NEXT:    shrl %eax
; FALLBACK30-NEXT:    shrxl %ebx, %eax, %eax
; FALLBACK30-NEXT:    orl %esi, %eax
; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK30-NEXT:    movl 16(%edi), %eax
; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK30-NEXT:    shrl %eax
; FALLBACK30-NEXT:    shrxl %ebx, %eax, %eax
; FALLBACK30-NEXT:    movl 20(%edi), %esi
; FALLBACK30-NEXT:    shlxl %edx, %esi, %ebp
; FALLBACK30-NEXT:    orl %ebp, %eax
; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK30-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; FALLBACK30-NEXT:    shrl %ecx
; FALLBACK30-NEXT:    shrxl %ebx, %ecx, %ecx
; FALLBACK30-NEXT:    orl %eax, %ecx
; FALLBACK30-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK30-NEXT:    movl 24(%edi), %ecx
; FALLBACK30-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK30-NEXT:    shrl %ecx
; FALLBACK30-NEXT:    shrxl %ebx, %ecx, %eax
; FALLBACK30-NEXT:    movl 28(%edi), %ecx
; FALLBACK30-NEXT:    shlxl %edx, %ecx, %ebp
; FALLBACK30-NEXT:    orl %ebp, %eax
; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK30-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; FALLBACK30-NEXT:    shrl %esi
; FALLBACK30-NEXT:    shrxl %ebx, %esi, %esi
; FALLBACK30-NEXT:    orl %eax, %esi
; FALLBACK30-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK30-NEXT:    movl 32(%edi), %eax
; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK30-NEXT:    shrl %eax
; FALLBACK30-NEXT:    shrxl %ebx, %eax, %eax
; FALLBACK30-NEXT:    movl 36(%edi), %esi
; FALLBACK30-NEXT:    shlxl %edx, %esi, %ebp
; FALLBACK30-NEXT:    orl %ebp, %eax
; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK30-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; FALLBACK30-NEXT:    shrl %ecx
; FALLBACK30-NEXT:    shrxl %ebx, %ecx, %ecx
; FALLBACK30-NEXT:    orl %eax, %ecx
; FALLBACK30-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK30-NEXT:    movl 40(%edi), %ecx
; FALLBACK30-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK30-NEXT:    shrl %ecx
; FALLBACK30-NEXT:    shrxl %ebx, %ecx, %eax
; FALLBACK30-NEXT:    movl 44(%edi), %ecx
; FALLBACK30-NEXT:    shlxl %edx, %ecx, %ebp
; FALLBACK30-NEXT:    orl %ebp, %eax
; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK30-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; FALLBACK30-NEXT:    shrl %esi
; FALLBACK30-NEXT:    shrxl %ebx, %esi, %esi
; FALLBACK30-NEXT:    orl %eax, %esi
; FALLBACK30-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK30-NEXT:    movl 48(%edi), %esi
; FALLBACK30-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK30-NEXT:    shrl %esi
; FALLBACK30-NEXT:    shrxl %ebx, %esi, %eax
; FALLBACK30-NEXT:    movl 52(%edi), %esi
; FALLBACK30-NEXT:    shlxl %edx, %esi, %ebp
; FALLBACK30-NEXT:    orl %ebp, %eax
; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK30-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; FALLBACK30-NEXT:    shrl %ecx
; FALLBACK30-NEXT:    shrxl %ebx, %ecx, %ebp
; FALLBACK30-NEXT:    orl %eax, %ebp
; FALLBACK30-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK30-NEXT:    negl %eax
; FALLBACK30-NEXT:    shlxl %edx, 188(%esp,%eax), %ecx
; FALLBACK30-NEXT:    movl 56(%edi), %eax
; FALLBACK30-NEXT:    shlxl %edx, %eax, %edx
; FALLBACK30-NEXT:    shrl %esi
; FALLBACK30-NEXT:    shrxl %ebx, %esi, %esi
; FALLBACK30-NEXT:    orl %edx, %esi
; FALLBACK30-NEXT:    shrl %eax
; FALLBACK30-NEXT:    shrxl %ebx, %eax, %eax
; FALLBACK30-NEXT:    orl %eax, %ecx
; FALLBACK30-NEXT:    movl {{[0-9]+}}(%esp), %eax
; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; FALLBACK30-NEXT:    movl %edx, (%eax)
; FALLBACK30-NEXT:    movl %esi, 56(%eax)
; FALLBACK30-NEXT:    movl %ecx, 60(%eax)
; FALLBACK30-NEXT:    movl %ebp, 48(%eax)
; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK30-NEXT:    movl %ecx, 52(%eax)
; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK30-NEXT:    movl %ecx, 40(%eax)
; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK30-NEXT:    movl %ecx, 44(%eax)
; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK30-NEXT:    movl %ecx, 32(%eax)
; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK30-NEXT:    movl %ecx, 36(%eax)
; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK30-NEXT:    movl %ecx, 24(%eax)
; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK30-NEXT:    movl %ecx, 28(%eax)
; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK30-NEXT:    movl %ecx, 16(%eax)
; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK30-NEXT:    movl %ecx, 20(%eax)
; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK30-NEXT:    movl %ecx, 8(%eax)
; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK30-NEXT:    movl %ecx, 12(%eax)
; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK30-NEXT:    movl %ecx, 4(%eax)
; FALLBACK30-NEXT:    addl $204, %esp
; FALLBACK30-NEXT:    popl %esi
; FALLBACK30-NEXT:    popl %edi
; FALLBACK30-NEXT:    popl %ebx
; FALLBACK30-NEXT:    popl %ebp
; FALLBACK30-NEXT:    vzeroupper
; FALLBACK30-NEXT:    retl
;
; FALLBACK31-LABEL: shl_64bytes:
; FALLBACK31:       # %bb.0:
; FALLBACK31-NEXT:    pushl %ebp
; FALLBACK31-NEXT:    pushl %ebx
; FALLBACK31-NEXT:    pushl %edi
; FALLBACK31-NEXT:    pushl %esi
; FALLBACK31-NEXT:    subl $204, %esp
; FALLBACK31-NEXT:    movl {{[0-9]+}}(%esp), %eax
; FALLBACK31-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; FALLBACK31-NEXT:    vmovups (%ecx), %zmm0
; FALLBACK31-NEXT:    movl (%eax), %ebx
; FALLBACK31-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; FALLBACK31-NEXT:    vmovups %zmm1, {{[0-9]+}}(%esp)
; FALLBACK31-NEXT:    vmovups %zmm0, {{[0-9]+}}(%esp)
; FALLBACK31-NEXT:    leal (,%ebx,8), %ecx
; FALLBACK31-NEXT:    andl $24, %ecx
; FALLBACK31-NEXT:    andl $60, %ebx
; FALLBACK31-NEXT:    leal {{[0-9]+}}(%esp), %eax
; FALLBACK31-NEXT:    subl %ebx, %eax
; FALLBACK31-NEXT:    movl 4(%eax), %esi
; FALLBACK31-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK31-NEXT:    movl 8(%eax), %edi
; FALLBACK31-NEXT:    movl 12(%eax), %edx
; FALLBACK31-NEXT:    movl %edx, %ebp
; FALLBACK31-NEXT:    shldl %cl, %edi, %ebp
; FALLBACK31-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK31-NEXT:    shldl %cl, %esi, %edi
; FALLBACK31-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK31-NEXT:    movl 16(%eax), %edi
; FALLBACK31-NEXT:    movl 20(%eax), %esi
; FALLBACK31-NEXT:    movl %esi, %ebp
; FALLBACK31-NEXT:    shldl %cl, %edi, %ebp
; FALLBACK31-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK31-NEXT:    shldl %cl, %edx, %edi
; FALLBACK31-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK31-NEXT:    movl 24(%eax), %edi
; FALLBACK31-NEXT:    movl 28(%eax), %edx
; FALLBACK31-NEXT:    movl %edx, %ebp
; FALLBACK31-NEXT:    shldl %cl, %edi, %ebp
; FALLBACK31-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK31-NEXT:    shldl %cl, %esi, %edi
; FALLBACK31-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK31-NEXT:    movl 32(%eax), %edi
; FALLBACK31-NEXT:    movl 36(%eax), %esi
; FALLBACK31-NEXT:    movl %esi, %ebp
; FALLBACK31-NEXT:    shldl %cl, %edi, %ebp
; FALLBACK31-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK31-NEXT:    shldl %cl, %edx, %edi
; FALLBACK31-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK31-NEXT:    movl 40(%eax), %ebp
; FALLBACK31-NEXT:    movl 44(%eax), %edx
; FALLBACK31-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK31-NEXT:    shldl %cl, %ebp, %edx
; FALLBACK31-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK31-NEXT:    shldl %cl, %esi, %ebp
; FALLBACK31-NEXT:    movl 56(%eax), %edx
; FALLBACK31-NEXT:    movl 60(%eax), %edi
; FALLBACK31-NEXT:    shldl %cl, %edx, %edi
; FALLBACK31-NEXT:    movl (%eax), %esi
; FALLBACK31-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK31-NEXT:    movl 52(%eax), %esi
; FALLBACK31-NEXT:    shldl %cl, %esi, %edx
; FALLBACK31-NEXT:    negl %ebx
; FALLBACK31-NEXT:    movl 176(%esp,%ebx), %ebx
; FALLBACK31-NEXT:    movl {{[0-9]+}}(%esp), %eax
; FALLBACK31-NEXT:    movl %edx, 56(%eax)
; FALLBACK31-NEXT:    movl %edi, 60(%eax)
; FALLBACK31-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; FALLBACK31-NEXT:    shlxl %ecx, %edx, %edi
; FALLBACK31-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK31-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
; FALLBACK31-NEXT:    shldl %cl, %edx, %edi
; FALLBACK31-NEXT:    shldl %cl, %ebx, %esi
; FALLBACK31-NEXT:    # kill: def $cl killed $cl killed $ecx
; FALLBACK31-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; FALLBACK31-NEXT:    shldl %cl, %edx, %ebx
; FALLBACK31-NEXT:    movl %ebx, 48(%eax)
; FALLBACK31-NEXT:    movl %esi, 52(%eax)
; FALLBACK31-NEXT:    movl %ebp, 40(%eax)
; FALLBACK31-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK31-NEXT:    movl %ecx, 44(%eax)
; FALLBACK31-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK31-NEXT:    movl %ecx, 32(%eax)
; FALLBACK31-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK31-NEXT:    movl %ecx, 36(%eax)
; FALLBACK31-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK31-NEXT:    movl %ecx, 24(%eax)
; FALLBACK31-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK31-NEXT:    movl %ecx, 28(%eax)
; FALLBACK31-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK31-NEXT:    movl %ecx, 16(%eax)
; FALLBACK31-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK31-NEXT:    movl %ecx, 20(%eax)
; FALLBACK31-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK31-NEXT:    movl %ecx, 8(%eax)
; FALLBACK31-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK31-NEXT:    movl %ecx, 12(%eax)
; FALLBACK31-NEXT:    movl %edi, 4(%eax)
; FALLBACK31-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK31-NEXT:    movl %ecx, (%eax)
; FALLBACK31-NEXT:    addl $204, %esp
; FALLBACK31-NEXT:    popl %esi
; FALLBACK31-NEXT:    popl %edi
; FALLBACK31-NEXT:    popl %ebx
; FALLBACK31-NEXT:    popl %ebp
; FALLBACK31-NEXT:    vzeroupper
; FALLBACK31-NEXT:    retl
  %src = load i512, ptr %src.ptr, align 1
  %byteOff = load i512, ptr %byteOff.ptr, align 1
  %bitOff = shl i512 %byteOff, 3
  %res = shl i512 %src, %bitOff
  store i512 %res, ptr %dst, align 1
  ret void
}

; Left-shifts a 512-bit (64-byte) value by a run-time amount expressed in
; qwords (8-byte units). The amount is scaled to bits with `shl ..., 6` before
; the wide shift, so the shift distance is always a whole number of 64-bit words.
; In the expected output below, the offset is reduced to a byte offset
; (`shll $3` followed by `andl $56`), the source is spilled next to a
; zero-filled region on the stack, and the result is read back with plain
; loads at a variable (negated) offset; no per-word shift instructions appear.
; The check lines inside the body are autogenerated by
; utils/update_llc_test_checks.py - regenerate them instead of editing by hand.
define void @shl_64bytes_qwordOff(ptr %src.ptr, ptr %qwordOff.ptr, ptr %dst) nounwind {
; X64-SSE2-LABEL: shl_64bytes_qwordOff:
; X64-SSE2:       # %bb.0:
; X64-SSE2-NEXT:    pushq %rbx
; X64-SSE2-NEXT:    movq (%rdi), %rax
; X64-SSE2-NEXT:    movq 8(%rdi), %rcx
; X64-SSE2-NEXT:    movq 16(%rdi), %r8
; X64-SSE2-NEXT:    movq 24(%rdi), %r9
; X64-SSE2-NEXT:    movq 32(%rdi), %r10
; X64-SSE2-NEXT:    movq 40(%rdi), %r11
; X64-SSE2-NEXT:    movq 48(%rdi), %rbx
; X64-SSE2-NEXT:    movq 56(%rdi), %rdi
; X64-SSE2-NEXT:    movl (%rsi), %esi
; X64-SSE2-NEXT:    xorps %xmm0, %xmm0
; X64-SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; X64-SSE2-NEXT:    movq %rbx, -{{[0-9]+}}(%rsp)
; X64-SSE2-NEXT:    movq %r11, -{{[0-9]+}}(%rsp)
; X64-SSE2-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
; X64-SSE2-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
; X64-SSE2-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
; X64-SSE2-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
; X64-SSE2-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
; X64-SSE2-NEXT:    shll $3, %esi
; X64-SSE2-NEXT:    andl $56, %esi
; X64-SSE2-NEXT:    negl %esi
; X64-SSE2-NEXT:    movslq %esi, %rax
; X64-SSE2-NEXT:    movq -64(%rsp,%rax), %rcx
; X64-SSE2-NEXT:    movq -56(%rsp,%rax), %rsi
; X64-SSE2-NEXT:    movq -40(%rsp,%rax), %rdi
; X64-SSE2-NEXT:    movq -48(%rsp,%rax), %r8
; X64-SSE2-NEXT:    movq -24(%rsp,%rax), %r9
; X64-SSE2-NEXT:    movq -32(%rsp,%rax), %r10
; X64-SSE2-NEXT:    movq -8(%rsp,%rax), %r11
; X64-SSE2-NEXT:    movq -16(%rsp,%rax), %rax
; X64-SSE2-NEXT:    movq %rax, 48(%rdx)
; X64-SSE2-NEXT:    movq %r11, 56(%rdx)
; X64-SSE2-NEXT:    movq %r10, 32(%rdx)
; X64-SSE2-NEXT:    movq %r9, 40(%rdx)
; X64-SSE2-NEXT:    movq %r8, 16(%rdx)
; X64-SSE2-NEXT:    movq %rdi, 24(%rdx)
; X64-SSE2-NEXT:    movq %rcx, (%rdx)
; X64-SSE2-NEXT:    movq %rsi, 8(%rdx)
; X64-SSE2-NEXT:    popq %rbx
; X64-SSE2-NEXT:    retq
;
; X64-SSE42-LABEL: shl_64bytes_qwordOff:
; X64-SSE42:       # %bb.0:
; X64-SSE42-NEXT:    pushq %rax
; X64-SSE42-NEXT:    movups (%rdi), %xmm0
; X64-SSE42-NEXT:    movups 16(%rdi), %xmm1
; X64-SSE42-NEXT:    movups 32(%rdi), %xmm2
; X64-SSE42-NEXT:    movups 48(%rdi), %xmm3
; X64-SSE42-NEXT:    movl (%rsi), %eax
; X64-SSE42-NEXT:    xorps %xmm4, %xmm4
; X64-SSE42-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
; X64-SSE42-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
; X64-SSE42-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
; X64-SSE42-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
; X64-SSE42-NEXT:    movaps %xmm3, -{{[0-9]+}}(%rsp)
; X64-SSE42-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
; X64-SSE42-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
; X64-SSE42-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-SSE42-NEXT:    shll $3, %eax
; X64-SSE42-NEXT:    andl $56, %eax
; X64-SSE42-NEXT:    negl %eax
; X64-SSE42-NEXT:    cltq
; X64-SSE42-NEXT:    movups -64(%rsp,%rax), %xmm0
; X64-SSE42-NEXT:    movups -48(%rsp,%rax), %xmm1
; X64-SSE42-NEXT:    movups -32(%rsp,%rax), %xmm2
; X64-SSE42-NEXT:    movups -16(%rsp,%rax), %xmm3
; X64-SSE42-NEXT:    movups %xmm3, 48(%rdx)
; X64-SSE42-NEXT:    movups %xmm1, 16(%rdx)
; X64-SSE42-NEXT:    movups %xmm2, 32(%rdx)
; X64-SSE42-NEXT:    movups %xmm0, (%rdx)
; X64-SSE42-NEXT:    popq %rax
; X64-SSE42-NEXT:    retq
;
; X64-AVX1-LABEL: shl_64bytes_qwordOff:
; X64-AVX1:       # %bb.0:
; X64-AVX1-NEXT:    pushq %rax
; X64-AVX1-NEXT:    vmovups (%rdi), %ymm0
; X64-AVX1-NEXT:    vmovups 32(%rdi), %ymm1
; X64-AVX1-NEXT:    movl (%rsi), %eax
; X64-AVX1-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; X64-AVX1-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
; X64-AVX1-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
; X64-AVX1-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
; X64-AVX1-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
; X64-AVX1-NEXT:    shll $3, %eax
; X64-AVX1-NEXT:    andl $56, %eax
; X64-AVX1-NEXT:    negl %eax
; X64-AVX1-NEXT:    cltq
; X64-AVX1-NEXT:    vmovups -64(%rsp,%rax), %xmm0
; X64-AVX1-NEXT:    vmovups -48(%rsp,%rax), %xmm1
; X64-AVX1-NEXT:    vmovups -32(%rsp,%rax), %xmm2
; X64-AVX1-NEXT:    vmovups -16(%rsp,%rax), %xmm3
; X64-AVX1-NEXT:    vmovups %xmm3, 48(%rdx)
; X64-AVX1-NEXT:    vmovups %xmm1, 16(%rdx)
; X64-AVX1-NEXT:    vmovups %xmm2, 32(%rdx)
; X64-AVX1-NEXT:    vmovups %xmm0, (%rdx)
; X64-AVX1-NEXT:    popq %rax
; X64-AVX1-NEXT:    vzeroupper
; X64-AVX1-NEXT:    retq
;
; X64-AVX512-LABEL: shl_64bytes_qwordOff:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    pushq %rax
; X64-AVX512-NEXT:    vmovups (%rdi), %zmm0
; X64-AVX512-NEXT:    movl (%rsi), %eax
; X64-AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X64-AVX512-NEXT:    vmovups %zmm1, -{{[0-9]+}}(%rsp)
; X64-AVX512-NEXT:    vmovups %zmm0, -{{[0-9]+}}(%rsp)
; X64-AVX512-NEXT:    shll $3, %eax
; X64-AVX512-NEXT:    andl $56, %eax
; X64-AVX512-NEXT:    negl %eax
; X64-AVX512-NEXT:    cltq
; X64-AVX512-NEXT:    vmovups -64(%rsp,%rax), %xmm0
; X64-AVX512-NEXT:    vmovups -48(%rsp,%rax), %xmm1
; X64-AVX512-NEXT:    vmovups -32(%rsp,%rax), %xmm2
; X64-AVX512-NEXT:    vmovups -16(%rsp,%rax), %xmm3
; X64-AVX512-NEXT:    vmovups %xmm3, 48(%rdx)
; X64-AVX512-NEXT:    vmovups %xmm1, 16(%rdx)
; X64-AVX512-NEXT:    vmovups %xmm2, 32(%rdx)
; X64-AVX512-NEXT:    vmovups %xmm0, (%rdx)
; X64-AVX512-NEXT:    popq %rax
; X64-AVX512-NEXT:    vzeroupper
; X64-AVX512-NEXT:    retq
;
; X86-SSE2-LABEL: shl_64bytes_qwordOff:
; X86-SSE2:       # %bb.0:
; X86-SSE2-NEXT:    pushl %ebp
; X86-SSE2-NEXT:    pushl %ebx
; X86-SSE2-NEXT:    pushl %edi
; X86-SSE2-NEXT:    pushl %esi
; X86-SSE2-NEXT:    subl $188, %esp
; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-SSE2-NEXT:    movl (%ecx), %eax
; X86-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-SSE2-NEXT:    movl 4(%ecx), %eax
; X86-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-SSE2-NEXT:    movl 8(%ecx), %eax
; X86-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-SSE2-NEXT:    movl 12(%ecx), %eax
; X86-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-SSE2-NEXT:    movl 16(%ecx), %eax
; X86-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-SSE2-NEXT:    movl 20(%ecx), %eax
; X86-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-SSE2-NEXT:    movl 24(%ecx), %eax
; X86-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-SSE2-NEXT:    movl 28(%ecx), %eax
; X86-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-SSE2-NEXT:    movl 32(%ecx), %eax
; X86-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-SSE2-NEXT:    movl 36(%ecx), %eax
; X86-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-SSE2-NEXT:    movl 40(%ecx), %ebp
; X86-SSE2-NEXT:    movl 44(%ecx), %ebx
; X86-SSE2-NEXT:    movl 48(%ecx), %edi
; X86-SSE2-NEXT:    movl 52(%ecx), %esi
; X86-SSE2-NEXT:    movl 56(%ecx), %edx
; X86-SSE2-NEXT:    movl 60(%ecx), %eax
; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-SSE2-NEXT:    movl (%ecx), %ecx
; X86-SSE2-NEXT:    xorps %xmm0, %xmm0
; X86-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT:    movl %esi, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT:    movl %edi, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-SSE2-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT:    shll $3, %ecx
; X86-SSE2-NEXT:    andl $56, %ecx
; X86-SSE2-NEXT:    leal {{[0-9]+}}(%esp), %eax
; X86-SSE2-NEXT:    subl %ecx, %eax
; X86-SSE2-NEXT:    movl (%eax), %edx
; X86-SSE2-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-SSE2-NEXT:    movl 4(%eax), %edx
; X86-SSE2-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-SSE2-NEXT:    movl 12(%eax), %edx
; X86-SSE2-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-SSE2-NEXT:    movl 8(%eax), %edx
; X86-SSE2-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-SSE2-NEXT:    movl 20(%eax), %edx
; X86-SSE2-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-SSE2-NEXT:    movl 16(%eax), %edx
; X86-SSE2-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-SSE2-NEXT:    movl 28(%eax), %edx
; X86-SSE2-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-SSE2-NEXT:    movl 24(%eax), %edx
; X86-SSE2-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-SSE2-NEXT:    movl 36(%eax), %edx
; X86-SSE2-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-SSE2-NEXT:    movl 32(%eax), %edx
; X86-SSE2-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-SSE2-NEXT:    movl 44(%eax), %ebp
; X86-SSE2-NEXT:    movl 40(%eax), %ebx
; X86-SSE2-NEXT:    movl 52(%eax), %edi
; X86-SSE2-NEXT:    movl 60(%eax), %esi
; X86-SSE2-NEXT:    movl 56(%eax), %edx
; X86-SSE2-NEXT:    negl %ecx
; X86-SSE2-NEXT:    movl 160(%esp,%ecx), %ecx
; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE2-NEXT:    movl %edx, 56(%eax)
; X86-SSE2-NEXT:    movl %esi, 60(%eax)
; X86-SSE2-NEXT:    movl %ecx, 48(%eax)
; X86-SSE2-NEXT:    movl %edi, 52(%eax)
; X86-SSE2-NEXT:    movl %ebx, 40(%eax)
; X86-SSE2-NEXT:    movl %ebp, 44(%eax)
; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-SSE2-NEXT:    movl %ecx, 32(%eax)
; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-SSE2-NEXT:    movl %ecx, 36(%eax)
; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-SSE2-NEXT:    movl %ecx, 24(%eax)
; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-SSE2-NEXT:    movl %ecx, 28(%eax)
; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-SSE2-NEXT:    movl %ecx, 16(%eax)
; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-SSE2-NEXT:    movl %ecx, 20(%eax)
; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-SSE2-NEXT:    movl %ecx, 8(%eax)
; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-SSE2-NEXT:    movl %ecx, 12(%eax)
; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-SSE2-NEXT:    movl %ecx, (%eax)
; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-SSE2-NEXT:    movl %ecx, 4(%eax)
; X86-SSE2-NEXT:    addl $188, %esp
; X86-SSE2-NEXT:    popl %esi
; X86-SSE2-NEXT:    popl %edi
; X86-SSE2-NEXT:    popl %ebx
; X86-SSE2-NEXT:    popl %ebp
; X86-SSE2-NEXT:    retl
;
; X86-SSE42-LABEL: shl_64bytes_qwordOff:
; X86-SSE42:       # %bb.0:
; X86-SSE42-NEXT:    subl $140, %esp
; X86-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-SSE42-NEXT:    movups (%edx), %xmm0
; X86-SSE42-NEXT:    movups 16(%edx), %xmm1
; X86-SSE42-NEXT:    movups 32(%edx), %xmm2
; X86-SSE42-NEXT:    movups 48(%edx), %xmm3
; X86-SSE42-NEXT:    movl (%ecx), %ecx
; X86-SSE42-NEXT:    xorps %xmm4, %xmm4
; X86-SSE42-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
; X86-SSE42-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
; X86-SSE42-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
; X86-SSE42-NEXT:    movaps %xmm4, (%esp)
; X86-SSE42-NEXT:    movaps %xmm3, {{[0-9]+}}(%esp)
; X86-SSE42-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
; X86-SSE42-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
; X86-SSE42-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
; X86-SSE42-NEXT:    shll $3, %ecx
; X86-SSE42-NEXT:    andl $56, %ecx
; X86-SSE42-NEXT:    leal {{[0-9]+}}(%esp), %edx
; X86-SSE42-NEXT:    subl %ecx, %edx
; X86-SSE42-NEXT:    movups (%edx), %xmm0
; X86-SSE42-NEXT:    movups 16(%edx), %xmm1
; X86-SSE42-NEXT:    movups 32(%edx), %xmm2
; X86-SSE42-NEXT:    negl %ecx
; X86-SSE42-NEXT:    movups 112(%esp,%ecx), %xmm3
; X86-SSE42-NEXT:    movups %xmm3, 48(%eax)
; X86-SSE42-NEXT:    movups %xmm2, 32(%eax)
; X86-SSE42-NEXT:    movups %xmm1, 16(%eax)
; X86-SSE42-NEXT:    movups %xmm0, (%eax)
; X86-SSE42-NEXT:    addl $140, %esp
; X86-SSE42-NEXT:    retl
;
; X86-AVX1-LABEL: shl_64bytes_qwordOff:
; X86-AVX1:       # %bb.0:
; X86-AVX1-NEXT:    subl $140, %esp
; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-AVX1-NEXT:    vmovups (%edx), %ymm0
; X86-AVX1-NEXT:    vmovups 32(%edx), %ymm1
; X86-AVX1-NEXT:    movl (%ecx), %ecx
; X86-AVX1-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; X86-AVX1-NEXT:    vmovups %ymm2, {{[0-9]+}}(%esp)
; X86-AVX1-NEXT:    vmovups %ymm2, (%esp)
; X86-AVX1-NEXT:    vmovups %ymm1, {{[0-9]+}}(%esp)
; X86-AVX1-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
; X86-AVX1-NEXT:    shll $3, %ecx
; X86-AVX1-NEXT:    andl $56, %ecx
; X86-AVX1-NEXT:    leal {{[0-9]+}}(%esp), %edx
; X86-AVX1-NEXT:    subl %ecx, %edx
; X86-AVX1-NEXT:    vmovups (%edx), %xmm0
; X86-AVX1-NEXT:    vmovups 16(%edx), %xmm1
; X86-AVX1-NEXT:    vmovups 32(%edx), %xmm2
; X86-AVX1-NEXT:    negl %ecx
; X86-AVX1-NEXT:    vmovups 112(%esp,%ecx), %xmm3
; X86-AVX1-NEXT:    vmovups %xmm3, 48(%eax)
; X86-AVX1-NEXT:    vmovups %xmm2, 32(%eax)
; X86-AVX1-NEXT:    vmovups %xmm1, 16(%eax)
; X86-AVX1-NEXT:    vmovups %xmm0, (%eax)
; X86-AVX1-NEXT:    addl $140, %esp
; X86-AVX1-NEXT:    vzeroupper
; X86-AVX1-NEXT:    retl
;
; X86-AVX512-LABEL: shl_64bytes_qwordOff:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    subl $140, %esp
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-AVX512-NEXT:    vmovups (%edx), %zmm0
; X86-AVX512-NEXT:    movl (%ecx), %ecx
; X86-AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X86-AVX512-NEXT:    vmovups %zmm1, (%esp)
; X86-AVX512-NEXT:    vmovups %zmm0, {{[0-9]+}}(%esp)
; X86-AVX512-NEXT:    shll $3, %ecx
; X86-AVX512-NEXT:    andl $56, %ecx
; X86-AVX512-NEXT:    leal {{[0-9]+}}(%esp), %edx
; X86-AVX512-NEXT:    subl %ecx, %edx
; X86-AVX512-NEXT:    vmovups (%edx), %xmm0
; X86-AVX512-NEXT:    vmovups 16(%edx), %xmm1
; X86-AVX512-NEXT:    vmovups 32(%edx), %xmm2
; X86-AVX512-NEXT:    negl %ecx
; X86-AVX512-NEXT:    vmovups 112(%esp,%ecx), %xmm3
; X86-AVX512-NEXT:    vmovups %xmm3, 48(%eax)
; X86-AVX512-NEXT:    vmovups %xmm2, 32(%eax)
; X86-AVX512-NEXT:    vmovups %xmm1, 16(%eax)
; X86-AVX512-NEXT:    vmovups %xmm0, (%eax)
; X86-AVX512-NEXT:    addl $140, %esp
; X86-AVX512-NEXT:    vzeroupper
; X86-AVX512-NEXT:    retl
  ; Both operands are read as i512 with align 1, so no alignment of the
  ; incoming pointers is assumed.
  %src = load i512, ptr %src.ptr, align 1
  %qwordOff = load i512, ptr %qwordOff.ptr, align 1
  ; Convert the qword count into a bit count (multiply by 64).
  %bitOff = shl i512 %qwordOff, 6
  ; NOTE(review): per the LLVM LangRef a shift amount >= the bit width (512)
  ; yields poison, which is presumably why the expected code may mask the
  ; offset down to a few low bits - confirm against the LangRef.
  %res = shl i512 %src, %bitOff
  ; The result is written back with align 1 as well.
  store i512 %res, ptr %dst, align 1
  ret void
}

define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; FALLBACK0-LABEL: ashr_64bytes:
; FALLBACK0:       # %bb.0:
; FALLBACK0-NEXT:    pushq %r15
; FALLBACK0-NEXT:    pushq %r14
; FALLBACK0-NEXT:    pushq %r13
; FALLBACK0-NEXT:    pushq %r12
; FALLBACK0-NEXT:    pushq %rbx
; FALLBACK0-NEXT:    movq (%rdi), %rax
; FALLBACK0-NEXT:    movq 8(%rdi), %rcx
; FALLBACK0-NEXT:    movq 16(%rdi), %r8
; FALLBACK0-NEXT:    movq 24(%rdi), %r9
; FALLBACK0-NEXT:    movq 32(%rdi), %r10
; FALLBACK0-NEXT:    movq 40(%rdi), %r11
; FALLBACK0-NEXT:    movq 48(%rdi), %rbx
; FALLBACK0-NEXT:    movq 56(%rdi), %r14
; FALLBACK0-NEXT:    movl (%rsi), %edi
; FALLBACK0-NEXT:    movq %r14, -{{[0-9]+}}(%rsp)
; FALLBACK0-NEXT:    movq %rbx, -{{[0-9]+}}(%rsp)
; FALLBACK0-NEXT:    movq %r11, -{{[0-9]+}}(%rsp)
; FALLBACK0-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
; FALLBACK0-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
; FALLBACK0-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
; FALLBACK0-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
; FALLBACK0-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
; FALLBACK0-NEXT:    sarq $63, %r14
; FALLBACK0-NEXT:    movq %r14, -{{[0-9]+}}(%rsp)
; FALLBACK0-NEXT:    movq %r14, -{{[0-9]+}}(%rsp)
; FALLBACK0-NEXT:    movq %r14, -{{[0-9]+}}(%rsp)
; FALLBACK0-NEXT:    movq %r14, -{{[0-9]+}}(%rsp)
; FALLBACK0-NEXT:    movq %r14, -{{[0-9]+}}(%rsp)
; FALLBACK0-NEXT:    movq %r14, -{{[0-9]+}}(%rsp)
; FALLBACK0-NEXT:    movq %r14, -{{[0-9]+}}(%rsp)
; FALLBACK0-NEXT:    movq %r14, -{{[0-9]+}}(%rsp)
; FALLBACK0-NEXT:    leal (,%rdi,8), %eax
; FALLBACK0-NEXT:    andl $56, %eax
; FALLBACK0-NEXT:    andl $56, %edi
; FALLBACK0-NEXT:    movq -128(%rsp,%rdi), %r10
; FALLBACK0-NEXT:    movq -120(%rsp,%rdi), %r8
; FALLBACK0-NEXT:    movq %r8, %r11
; FALLBACK0-NEXT:    movl %eax, %ecx
; FALLBACK0-NEXT:    shrq %cl, %r11
; FALLBACK0-NEXT:    movl %eax, %esi
; FALLBACK0-NEXT:    notb %sil
; FALLBACK0-NEXT:    movq -112(%rsp,%rdi), %rbx
; FALLBACK0-NEXT:    leaq (%rbx,%rbx), %r9
; FALLBACK0-NEXT:    movl %esi, %ecx
; FALLBACK0-NEXT:    shlq %cl, %r9
; FALLBACK0-NEXT:    orq %r11, %r9
; FALLBACK0-NEXT:    movl %eax, %ecx
; FALLBACK0-NEXT:    shrq %cl, %r10
; FALLBACK0-NEXT:    addq %r8, %r8
; FALLBACK0-NEXT:    movl %esi, %ecx
; FALLBACK0-NEXT:    shlq %cl, %r8
; FALLBACK0-NEXT:    orq %r10, %r8
; FALLBACK0-NEXT:    movq -104(%rsp,%rdi), %r10
; FALLBACK0-NEXT:    movq %r10, %r15
; FALLBACK0-NEXT:    movl %eax, %ecx
; FALLBACK0-NEXT:    shrq %cl, %r15
; FALLBACK0-NEXT:    movq -96(%rsp,%rdi), %r14
; FALLBACK0-NEXT:    leaq (%r14,%r14), %r11
; FALLBACK0-NEXT:    movl %esi, %ecx
; FALLBACK0-NEXT:    shlq %cl, %r11
; FALLBACK0-NEXT:    orq %r15, %r11
; FALLBACK0-NEXT:    movl %eax, %ecx
; FALLBACK0-NEXT:    shrq %cl, %rbx
; FALLBACK0-NEXT:    addq %r10, %r10
; FALLBACK0-NEXT:    movl %esi, %ecx
; FALLBACK0-NEXT:    shlq %cl, %r10
; FALLBACK0-NEXT:    orq %rbx, %r10
; FALLBACK0-NEXT:    movq -88(%rsp,%rdi), %rbx
; FALLBACK0-NEXT:    movq %rbx, %r12
; FALLBACK0-NEXT:    movl %eax, %ecx
; FALLBACK0-NEXT:    shrq %cl, %r12
; FALLBACK0-NEXT:    movq -80(%rsp,%rdi), %r13
; FALLBACK0-NEXT:    leaq (%r13,%r13), %r15
; FALLBACK0-NEXT:    movl %esi, %ecx
; FALLBACK0-NEXT:    shlq %cl, %r15
; FALLBACK0-NEXT:    orq %r12, %r15
; FALLBACK0-NEXT:    movl %eax, %ecx
; FALLBACK0-NEXT:    shrq %cl, %r14
; FALLBACK0-NEXT:    addq %rbx, %rbx
; FALLBACK0-NEXT:    movl %esi, %ecx
; FALLBACK0-NEXT:    shlq %cl, %rbx
; FALLBACK0-NEXT:    orq %r14, %rbx
; FALLBACK0-NEXT:    movl %eax, %ecx
; FALLBACK0-NEXT:    shrq %cl, %r13
; FALLBACK0-NEXT:    movq -72(%rsp,%rdi), %rdi
; FALLBACK0-NEXT:    leaq (%rdi,%rdi), %r14
; FALLBACK0-NEXT:    movl %esi, %ecx
; FALLBACK0-NEXT:    shlq %cl, %r14
; FALLBACK0-NEXT:    orq %r13, %r14
; FALLBACK0-NEXT:    movl %eax, %ecx
; FALLBACK0-NEXT:    sarq %cl, %rdi
; FALLBACK0-NEXT:    movq %rdi, 56(%rdx)
; FALLBACK0-NEXT:    movq %r14, 48(%rdx)
; FALLBACK0-NEXT:    movq %rbx, 32(%rdx)
; FALLBACK0-NEXT:    movq %r15, 40(%rdx)
; FALLBACK0-NEXT:    movq %r10, 16(%rdx)
; FALLBACK0-NEXT:    movq %r11, 24(%rdx)
; FALLBACK0-NEXT:    movq %r8, (%rdx)
; FALLBACK0-NEXT:    movq %r9, 8(%rdx)
; FALLBACK0-NEXT:    popq %rbx
; FALLBACK0-NEXT:    popq %r12
; FALLBACK0-NEXT:    popq %r13
; FALLBACK0-NEXT:    popq %r14
; FALLBACK0-NEXT:    popq %r15
; FALLBACK0-NEXT:    retq
;
; FALLBACK1-LABEL: ashr_64bytes:
; FALLBACK1:       # %bb.0:
; FALLBACK1-NEXT:    pushq %r15
; FALLBACK1-NEXT:    pushq %r14
; FALLBACK1-NEXT:    pushq %rbx
; FALLBACK1-NEXT:    movq (%rdi), %rcx
; FALLBACK1-NEXT:    movq 8(%rdi), %r8
; FALLBACK1-NEXT:    movq 16(%rdi), %r9
; FALLBACK1-NEXT:    movq 24(%rdi), %r10
; FALLBACK1-NEXT:    movq 32(%rdi), %r11
; FALLBACK1-NEXT:    movq 40(%rdi), %rbx
; FALLBACK1-NEXT:    movq 48(%rdi), %r14
; FALLBACK1-NEXT:    movq 56(%rdi), %rdi
; FALLBACK1-NEXT:    movl (%rsi), %eax
; FALLBACK1-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK1-NEXT:    movq %r14, -{{[0-9]+}}(%rsp)
; FALLBACK1-NEXT:    movq %rbx, -{{[0-9]+}}(%rsp)
; FALLBACK1-NEXT:    movq %r11, -{{[0-9]+}}(%rsp)
; FALLBACK1-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
; FALLBACK1-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
; FALLBACK1-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
; FALLBACK1-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
; FALLBACK1-NEXT:    sarq $63, %rdi
; FALLBACK1-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK1-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK1-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK1-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK1-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK1-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK1-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK1-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK1-NEXT:    leal (,%rax,8), %ecx
; FALLBACK1-NEXT:    andl $56, %ecx
; FALLBACK1-NEXT:    andl $56, %eax
; FALLBACK1-NEXT:    movq -112(%rsp,%rax), %rdi
; FALLBACK1-NEXT:    movq -128(%rsp,%rax), %rsi
; FALLBACK1-NEXT:    movq -120(%rsp,%rax), %r9
; FALLBACK1-NEXT:    movq %r9, %r8
; FALLBACK1-NEXT:    shrdq %cl, %rdi, %r8
; FALLBACK1-NEXT:    movq -96(%rsp,%rax), %r10
; FALLBACK1-NEXT:    movq -104(%rsp,%rax), %r11
; FALLBACK1-NEXT:    movq %r11, %rbx
; FALLBACK1-NEXT:    shrdq %cl, %r10, %rbx
; FALLBACK1-NEXT:    shrdq %cl, %r11, %rdi
; FALLBACK1-NEXT:    movq -80(%rsp,%rax), %r11
; FALLBACK1-NEXT:    movq -88(%rsp,%rax), %r14
; FALLBACK1-NEXT:    movq %r14, %r15
; FALLBACK1-NEXT:    shrdq %cl, %r11, %r15
; FALLBACK1-NEXT:    shrdq %cl, %r14, %r10
; FALLBACK1-NEXT:    movq -72(%rsp,%rax), %rax
; FALLBACK1-NEXT:    shrdq %cl, %rax, %r11
; FALLBACK1-NEXT:    shrdq %cl, %r9, %rsi
; FALLBACK1-NEXT:    # kill: def $cl killed $cl killed $ecx
; FALLBACK1-NEXT:    sarq %cl, %rax
; FALLBACK1-NEXT:    movq %r11, 48(%rdx)
; FALLBACK1-NEXT:    movq %rax, 56(%rdx)
; FALLBACK1-NEXT:    movq %r10, 32(%rdx)
; FALLBACK1-NEXT:    movq %r15, 40(%rdx)
; FALLBACK1-NEXT:    movq %rdi, 16(%rdx)
; FALLBACK1-NEXT:    movq %rbx, 24(%rdx)
; FALLBACK1-NEXT:    movq %rsi, (%rdx)
; FALLBACK1-NEXT:    movq %r8, 8(%rdx)
; FALLBACK1-NEXT:    popq %rbx
; FALLBACK1-NEXT:    popq %r14
; FALLBACK1-NEXT:    popq %r15
; FALLBACK1-NEXT:    retq
;
; FALLBACK2-LABEL: ashr_64bytes:
; FALLBACK2:       # %bb.0:
; FALLBACK2-NEXT:    pushq %rbp
; FALLBACK2-NEXT:    pushq %r15
; FALLBACK2-NEXT:    pushq %r14
; FALLBACK2-NEXT:    pushq %r13
; FALLBACK2-NEXT:    pushq %r12
; FALLBACK2-NEXT:    pushq %rbx
; FALLBACK2-NEXT:    pushq %rax
; FALLBACK2-NEXT:    movq (%rdi), %rcx
; FALLBACK2-NEXT:    movq 8(%rdi), %r8
; FALLBACK2-NEXT:    movq 16(%rdi), %r9
; FALLBACK2-NEXT:    movq 24(%rdi), %r10
; FALLBACK2-NEXT:    movq 32(%rdi), %r11
; FALLBACK2-NEXT:    movq 40(%rdi), %rbx
; FALLBACK2-NEXT:    movq 48(%rdi), %r14
; FALLBACK2-NEXT:    movq 56(%rdi), %rdi
; FALLBACK2-NEXT:    movl (%rsi), %eax
; FALLBACK2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK2-NEXT:    movq %r14, -{{[0-9]+}}(%rsp)
; FALLBACK2-NEXT:    movq %rbx, -{{[0-9]+}}(%rsp)
; FALLBACK2-NEXT:    movq %r11, -{{[0-9]+}}(%rsp)
; FALLBACK2-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
; FALLBACK2-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
; FALLBACK2-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
; FALLBACK2-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
; FALLBACK2-NEXT:    sarq $63, %rdi
; FALLBACK2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK2-NEXT:    leal (,%rax,8), %ecx
; FALLBACK2-NEXT:    andl $56, %ecx
; FALLBACK2-NEXT:    andl $56, %eax
; FALLBACK2-NEXT:    movq -120(%rsp,%rax), %rdi
; FALLBACK2-NEXT:    movq -112(%rsp,%rax), %r9
; FALLBACK2-NEXT:    shrxq %rcx, %rdi, %rbx
; FALLBACK2-NEXT:    shrxq %rcx, -128(%rsp,%rax), %r13
; FALLBACK2-NEXT:    movq -104(%rsp,%rax), %rsi
; FALLBACK2-NEXT:    shrxq %rcx, %rsi, %r8
; FALLBACK2-NEXT:    movq -96(%rsp,%rax), %r10
; FALLBACK2-NEXT:    shrxq %rcx, %r9, %r11
; FALLBACK2-NEXT:    movq -88(%rsp,%rax), %r14
; FALLBACK2-NEXT:    shrxq %rcx, %r14, %r15
; FALLBACK2-NEXT:    shrxq %rcx, %r10, %rbp
; FALLBACK2-NEXT:    movl %ecx, %r12d
; FALLBACK2-NEXT:    notb %r12b
; FALLBACK2-NEXT:    addq %r9, %r9
; FALLBACK2-NEXT:    shlxq %r12, %r9, %r9
; FALLBACK2-NEXT:    orq %rbx, %r9
; FALLBACK2-NEXT:    addq %rdi, %rdi
; FALLBACK2-NEXT:    shlxq %r12, %rdi, %rdi
; FALLBACK2-NEXT:    orq %r13, %rdi
; FALLBACK2-NEXT:    movq -80(%rsp,%rax), %rbx
; FALLBACK2-NEXT:    shrxq %rcx, %rbx, %r13
; FALLBACK2-NEXT:    movq -72(%rsp,%rax), %rax
; FALLBACK2-NEXT:    sarxq %rcx, %rax, %rcx
; FALLBACK2-NEXT:    addq %r10, %r10
; FALLBACK2-NEXT:    shlxq %r12, %r10, %r10
; FALLBACK2-NEXT:    orq %r8, %r10
; FALLBACK2-NEXT:    addq %rsi, %rsi
; FALLBACK2-NEXT:    shlxq %r12, %rsi, %rsi
; FALLBACK2-NEXT:    orq %r11, %rsi
; FALLBACK2-NEXT:    leaq (%rbx,%rbx), %r8
; FALLBACK2-NEXT:    shlxq %r12, %r8, %r8
; FALLBACK2-NEXT:    orq %r15, %r8
; FALLBACK2-NEXT:    addq %r14, %r14
; FALLBACK2-NEXT:    shlxq %r12, %r14, %r11
; FALLBACK2-NEXT:    orq %rbp, %r11
; FALLBACK2-NEXT:    addq %rax, %rax
; FALLBACK2-NEXT:    shlxq %r12, %rax, %rax
; FALLBACK2-NEXT:    orq %r13, %rax
; FALLBACK2-NEXT:    movq %rcx, 56(%rdx)
; FALLBACK2-NEXT:    movq %rax, 48(%rdx)
; FALLBACK2-NEXT:    movq %r11, 32(%rdx)
; FALLBACK2-NEXT:    movq %r8, 40(%rdx)
; FALLBACK2-NEXT:    movq %rsi, 16(%rdx)
; FALLBACK2-NEXT:    movq %r10, 24(%rdx)
; FALLBACK2-NEXT:    movq %rdi, (%rdx)
; FALLBACK2-NEXT:    movq %r9, 8(%rdx)
; FALLBACK2-NEXT:    addq $8, %rsp
; FALLBACK2-NEXT:    popq %rbx
; FALLBACK2-NEXT:    popq %r12
; FALLBACK2-NEXT:    popq %r13
; FALLBACK2-NEXT:    popq %r14
; FALLBACK2-NEXT:    popq %r15
; FALLBACK2-NEXT:    popq %rbp
; FALLBACK2-NEXT:    retq
;
; FALLBACK3-LABEL: ashr_64bytes:
; FALLBACK3:       # %bb.0:
; FALLBACK3-NEXT:    pushq %r15
; FALLBACK3-NEXT:    pushq %r14
; FALLBACK3-NEXT:    pushq %rbx
; FALLBACK3-NEXT:    movq (%rdi), %rcx
; FALLBACK3-NEXT:    movq 8(%rdi), %r8
; FALLBACK3-NEXT:    movq 16(%rdi), %r9
; FALLBACK3-NEXT:    movq 24(%rdi), %r10
; FALLBACK3-NEXT:    movq 32(%rdi), %r11
; FALLBACK3-NEXT:    movq 40(%rdi), %rbx
; FALLBACK3-NEXT:    movq 48(%rdi), %r14
; FALLBACK3-NEXT:    movq 56(%rdi), %rdi
; FALLBACK3-NEXT:    movl (%rsi), %eax
; FALLBACK3-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK3-NEXT:    movq %r14, -{{[0-9]+}}(%rsp)
; FALLBACK3-NEXT:    movq %rbx, -{{[0-9]+}}(%rsp)
; FALLBACK3-NEXT:    movq %r11, -{{[0-9]+}}(%rsp)
; FALLBACK3-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
; FALLBACK3-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
; FALLBACK3-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
; FALLBACK3-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
; FALLBACK3-NEXT:    sarq $63, %rdi
; FALLBACK3-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK3-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK3-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK3-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK3-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK3-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK3-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK3-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK3-NEXT:    leal (,%rax,8), %ecx
; FALLBACK3-NEXT:    andl $56, %ecx
; FALLBACK3-NEXT:    andl $56, %eax
; FALLBACK3-NEXT:    movq -112(%rsp,%rax), %rdi
; FALLBACK3-NEXT:    movq -128(%rsp,%rax), %rsi
; FALLBACK3-NEXT:    movq -120(%rsp,%rax), %r9
; FALLBACK3-NEXT:    movq %r9, %r8
; FALLBACK3-NEXT:    shrdq %cl, %rdi, %r8
; FALLBACK3-NEXT:    movq -96(%rsp,%rax), %r10
; FALLBACK3-NEXT:    movq -104(%rsp,%rax), %r11
; FALLBACK3-NEXT:    movq %r11, %rbx
; FALLBACK3-NEXT:    shrdq %cl, %r10, %rbx
; FALLBACK3-NEXT:    shrdq %cl, %r11, %rdi
; FALLBACK3-NEXT:    movq -80(%rsp,%rax), %r11
; FALLBACK3-NEXT:    movq -88(%rsp,%rax), %r14
; FALLBACK3-NEXT:    movq %r14, %r15
; FALLBACK3-NEXT:    shrdq %cl, %r11, %r15
; FALLBACK3-NEXT:    shrdq %cl, %r14, %r10
; FALLBACK3-NEXT:    movq -72(%rsp,%rax), %rax
; FALLBACK3-NEXT:    shrdq %cl, %rax, %r11
; FALLBACK3-NEXT:    sarxq %rcx, %rax, %rax
; FALLBACK3-NEXT:    # kill: def $cl killed $cl killed $rcx
; FALLBACK3-NEXT:    shrdq %cl, %r9, %rsi
; FALLBACK3-NEXT:    movq %r11, 48(%rdx)
; FALLBACK3-NEXT:    movq %r10, 32(%rdx)
; FALLBACK3-NEXT:    movq %r15, 40(%rdx)
; FALLBACK3-NEXT:    movq %rdi, 16(%rdx)
; FALLBACK3-NEXT:    movq %rbx, 24(%rdx)
; FALLBACK3-NEXT:    movq %rsi, (%rdx)
; FALLBACK3-NEXT:    movq %r8, 8(%rdx)
; FALLBACK3-NEXT:    movq %rax, 56(%rdx)
; FALLBACK3-NEXT:    popq %rbx
; FALLBACK3-NEXT:    popq %r14
; FALLBACK3-NEXT:    popq %r15
; FALLBACK3-NEXT:    retq
;
; FALLBACK4-LABEL: ashr_64bytes:
; FALLBACK4:       # %bb.0:
; FALLBACK4-NEXT:    pushq %rbp
; FALLBACK4-NEXT:    pushq %r15
; FALLBACK4-NEXT:    pushq %r14
; FALLBACK4-NEXT:    pushq %r13
; FALLBACK4-NEXT:    pushq %r12
; FALLBACK4-NEXT:    pushq %rbx
; FALLBACK4-NEXT:    pushq %rax
; FALLBACK4-NEXT:    movups (%rdi), %xmm0
; FALLBACK4-NEXT:    movups 16(%rdi), %xmm1
; FALLBACK4-NEXT:    movups 32(%rdi), %xmm2
; FALLBACK4-NEXT:    movq 48(%rdi), %rax
; FALLBACK4-NEXT:    movq 56(%rdi), %rcx
; FALLBACK4-NEXT:    movl (%rsi), %edi
; FALLBACK4-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
; FALLBACK4-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
; FALLBACK4-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
; FALLBACK4-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
; FALLBACK4-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; FALLBACK4-NEXT:    sarq $63, %rcx
; FALLBACK4-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
; FALLBACK4-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
; FALLBACK4-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
; FALLBACK4-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
; FALLBACK4-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
; FALLBACK4-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
; FALLBACK4-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
; FALLBACK4-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
; FALLBACK4-NEXT:    leal (,%rdi,8), %eax
; FALLBACK4-NEXT:    andl $56, %eax
; FALLBACK4-NEXT:    andl $56, %edi
; FALLBACK4-NEXT:    movq -128(%rsp,%rdi), %r10
; FALLBACK4-NEXT:    movq -120(%rsp,%rdi), %r9
; FALLBACK4-NEXT:    movl %eax, %ecx
; FALLBACK4-NEXT:    shrq %cl, %r10
; FALLBACK4-NEXT:    movl %eax, %esi
; FALLBACK4-NEXT:    notb %sil
; FALLBACK4-NEXT:    leaq (%r9,%r9), %r8
; FALLBACK4-NEXT:    movl %esi, %ecx
; FALLBACK4-NEXT:    shlq %cl, %r8
; FALLBACK4-NEXT:    orq %r10, %r8
; FALLBACK4-NEXT:    movq -104(%rsp,%rdi), %r10
; FALLBACK4-NEXT:    movq %r10, %rbx
; FALLBACK4-NEXT:    movl %eax, %ecx
; FALLBACK4-NEXT:    shrq %cl, %rbx
; FALLBACK4-NEXT:    movq -96(%rsp,%rdi), %r12
; FALLBACK4-NEXT:    leaq (%r12,%r12), %r11
; FALLBACK4-NEXT:    movl %esi, %ecx
; FALLBACK4-NEXT:    shlq %cl, %r11
; FALLBACK4-NEXT:    orq %rbx, %r11
; FALLBACK4-NEXT:    movq -112(%rsp,%rdi), %rbx
; FALLBACK4-NEXT:    movq %rbx, %r14
; FALLBACK4-NEXT:    movl %eax, %ecx
; FALLBACK4-NEXT:    shrq %cl, %r14
; FALLBACK4-NEXT:    addq %r10, %r10
; FALLBACK4-NEXT:    movl %esi, %ecx
; FALLBACK4-NEXT:    shlq %cl, %r10
; FALLBACK4-NEXT:    orq %r14, %r10
; FALLBACK4-NEXT:    movq -88(%rsp,%rdi), %r14
; FALLBACK4-NEXT:    movq %r14, %r13
; FALLBACK4-NEXT:    movl %eax, %ecx
; FALLBACK4-NEXT:    shrq %cl, %r13
; FALLBACK4-NEXT:    movq -80(%rsp,%rdi), %rbp
; FALLBACK4-NEXT:    leaq (%rbp,%rbp), %r15
; FALLBACK4-NEXT:    movl %esi, %ecx
; FALLBACK4-NEXT:    shlq %cl, %r15
; FALLBACK4-NEXT:    orq %r13, %r15
; FALLBACK4-NEXT:    movl %eax, %ecx
; FALLBACK4-NEXT:    shrq %cl, %r12
; FALLBACK4-NEXT:    addq %r14, %r14
; FALLBACK4-NEXT:    movl %esi, %ecx
; FALLBACK4-NEXT:    shlq %cl, %r14
; FALLBACK4-NEXT:    orq %r12, %r14
; FALLBACK4-NEXT:    movl %eax, %ecx
; FALLBACK4-NEXT:    shrq %cl, %rbp
; FALLBACK4-NEXT:    movq -72(%rsp,%rdi), %rdi
; FALLBACK4-NEXT:    leaq (%rdi,%rdi), %r12
; FALLBACK4-NEXT:    movl %esi, %ecx
; FALLBACK4-NEXT:    shlq %cl, %r12
; FALLBACK4-NEXT:    orq %rbp, %r12
; FALLBACK4-NEXT:    movl %eax, %ecx
; FALLBACK4-NEXT:    shrq %cl, %r9
; FALLBACK4-NEXT:    addq %rbx, %rbx
; FALLBACK4-NEXT:    movl %esi, %ecx
; FALLBACK4-NEXT:    shlq %cl, %rbx
; FALLBACK4-NEXT:    orq %r9, %rbx
; FALLBACK4-NEXT:    movl %eax, %ecx
; FALLBACK4-NEXT:    sarq %cl, %rdi
; FALLBACK4-NEXT:    movq %rdi, 56(%rdx)
; FALLBACK4-NEXT:    movq %rbx, 8(%rdx)
; FALLBACK4-NEXT:    movq %r12, 48(%rdx)
; FALLBACK4-NEXT:    movq %r14, 32(%rdx)
; FALLBACK4-NEXT:    movq %r15, 40(%rdx)
; FALLBACK4-NEXT:    movq %r10, 16(%rdx)
; FALLBACK4-NEXT:    movq %r11, 24(%rdx)
; FALLBACK4-NEXT:    movq %r8, (%rdx)
; FALLBACK4-NEXT:    addq $8, %rsp
; FALLBACK4-NEXT:    popq %rbx
; FALLBACK4-NEXT:    popq %r12
; FALLBACK4-NEXT:    popq %r13
; FALLBACK4-NEXT:    popq %r14
; FALLBACK4-NEXT:    popq %r15
; FALLBACK4-NEXT:    popq %rbp
; FALLBACK4-NEXT:    retq
;
; FALLBACK5-LABEL: ashr_64bytes:
; FALLBACK5:       # %bb.0:
; FALLBACK5-NEXT:    pushq %r15
; FALLBACK5-NEXT:    pushq %r14
; FALLBACK5-NEXT:    pushq %rbx
; FALLBACK5-NEXT:    movups (%rdi), %xmm0
; FALLBACK5-NEXT:    movups 16(%rdi), %xmm1
; FALLBACK5-NEXT:    movups 32(%rdi), %xmm2
; FALLBACK5-NEXT:    movq 48(%rdi), %rcx
; FALLBACK5-NEXT:    movq 56(%rdi), %rdi
; FALLBACK5-NEXT:    movl (%rsi), %eax
; FALLBACK5-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK5-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
; FALLBACK5-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
; FALLBACK5-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
; FALLBACK5-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; FALLBACK5-NEXT:    sarq $63, %rdi
; FALLBACK5-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK5-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK5-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK5-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK5-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK5-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK5-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK5-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK5-NEXT:    leal (,%rax,8), %ecx
; FALLBACK5-NEXT:    andl $56, %ecx
; FALLBACK5-NEXT:    andl $56, %eax
; FALLBACK5-NEXT:    movq -96(%rsp,%rax), %rdi
; FALLBACK5-NEXT:    movq -104(%rsp,%rax), %r9
; FALLBACK5-NEXT:    movq %r9, %rsi
; FALLBACK5-NEXT:    shrdq %cl, %rdi, %rsi
; FALLBACK5-NEXT:    movq -112(%rsp,%rax), %r10
; FALLBACK5-NEXT:    movq %r10, %r8
; FALLBACK5-NEXT:    shrdq %cl, %r9, %r8
; FALLBACK5-NEXT:    movq -80(%rsp,%rax), %r9
; FALLBACK5-NEXT:    movq -88(%rsp,%rax), %r11
; FALLBACK5-NEXT:    movq %r11, %rbx
; FALLBACK5-NEXT:    shrdq %cl, %r9, %rbx
; FALLBACK5-NEXT:    shrdq %cl, %r11, %rdi
; FALLBACK5-NEXT:    movq -72(%rsp,%rax), %r11
; FALLBACK5-NEXT:    shrdq %cl, %r11, %r9
; FALLBACK5-NEXT:    movq -128(%rsp,%rax), %r14
; FALLBACK5-NEXT:    movq -120(%rsp,%rax), %rax
; FALLBACK5-NEXT:    movq %rax, %r15
; FALLBACK5-NEXT:    shrdq %cl, %r10, %r15
; FALLBACK5-NEXT:    shrdq %cl, %rax, %r14
; FALLBACK5-NEXT:    # kill: def $cl killed $cl killed $ecx
; FALLBACK5-NEXT:    sarq %cl, %r11
; FALLBACK5-NEXT:    movq %r15, 8(%rdx)
; FALLBACK5-NEXT:    movq %r9, 48(%rdx)
; FALLBACK5-NEXT:    movq %r11, 56(%rdx)
; FALLBACK5-NEXT:    movq %rdi, 32(%rdx)
; FALLBACK5-NEXT:    movq %rbx, 40(%rdx)
; FALLBACK5-NEXT:    movq %r8, 16(%rdx)
; FALLBACK5-NEXT:    movq %rsi, 24(%rdx)
; FALLBACK5-NEXT:    movq %r14, (%rdx)
; FALLBACK5-NEXT:    popq %rbx
; FALLBACK5-NEXT:    popq %r14
; FALLBACK5-NEXT:    popq %r15
; FALLBACK5-NEXT:    retq
;
; FALLBACK6-LABEL: ashr_64bytes:
; FALLBACK6:       # %bb.0:
; FALLBACK6-NEXT:    pushq %rbp
; FALLBACK6-NEXT:    pushq %r15
; FALLBACK6-NEXT:    pushq %r14
; FALLBACK6-NEXT:    pushq %r13
; FALLBACK6-NEXT:    pushq %r12
; FALLBACK6-NEXT:    pushq %rbx
; FALLBACK6-NEXT:    pushq %rax
; FALLBACK6-NEXT:    movups (%rdi), %xmm0
; FALLBACK6-NEXT:    movups 16(%rdi), %xmm1
; FALLBACK6-NEXT:    movups 32(%rdi), %xmm2
; FALLBACK6-NEXT:    movq 48(%rdi), %rcx
; FALLBACK6-NEXT:    movq 56(%rdi), %rdi
; FALLBACK6-NEXT:    movl (%rsi), %eax
; FALLBACK6-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK6-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
; FALLBACK6-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
; FALLBACK6-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
; FALLBACK6-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; FALLBACK6-NEXT:    sarq $63, %rdi
; FALLBACK6-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK6-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK6-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK6-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK6-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK6-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK6-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK6-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK6-NEXT:    leal (,%rax,8), %esi
; FALLBACK6-NEXT:    andl $56, %esi
; FALLBACK6-NEXT:    andl $56, %eax
; FALLBACK6-NEXT:    shrxq %rsi, -128(%rsp,%rax), %r11
; FALLBACK6-NEXT:    movq -112(%rsp,%rax), %rcx
; FALLBACK6-NEXT:    movq -104(%rsp,%rax), %rdi
; FALLBACK6-NEXT:    shrxq %rsi, %rdi, %r12
; FALLBACK6-NEXT:    movq -96(%rsp,%rax), %r13
; FALLBACK6-NEXT:    shrxq %rsi, %rcx, %r9
; FALLBACK6-NEXT:    movq -88(%rsp,%rax), %r10
; FALLBACK6-NEXT:    shrxq %rsi, %r10, %r14
; FALLBACK6-NEXT:    shrxq %rsi, %r13, %r15
; FALLBACK6-NEXT:    movl %esi, %ebx
; FALLBACK6-NEXT:    notb %bl
; FALLBACK6-NEXT:    movq -120(%rsp,%rax), %rbp
; FALLBACK6-NEXT:    leaq (%rbp,%rbp), %r8
; FALLBACK6-NEXT:    shlxq %rbx, %r8, %r8
; FALLBACK6-NEXT:    orq %r11, %r8
; FALLBACK6-NEXT:    leaq (%r13,%r13), %r11
; FALLBACK6-NEXT:    shlxq %rbx, %r11, %r11
; FALLBACK6-NEXT:    orq %r12, %r11
; FALLBACK6-NEXT:    movq -80(%rsp,%rax), %r12
; FALLBACK6-NEXT:    shrxq %rsi, %r12, %r13
; FALLBACK6-NEXT:    shrxq %rsi, %rbp, %rbp
; FALLBACK6-NEXT:    movq -72(%rsp,%rax), %rax
; FALLBACK6-NEXT:    sarxq %rsi, %rax, %rsi
; FALLBACK6-NEXT:    addq %rdi, %rdi
; FALLBACK6-NEXT:    shlxq %rbx, %rdi, %rdi
; FALLBACK6-NEXT:    orq %r9, %rdi
; FALLBACK6-NEXT:    leaq (%r12,%r12), %r9
; FALLBACK6-NEXT:    shlxq %rbx, %r9, %r9
; FALLBACK6-NEXT:    orq %r14, %r9
; FALLBACK6-NEXT:    addq %r10, %r10
; FALLBACK6-NEXT:    shlxq %rbx, %r10, %r10
; FALLBACK6-NEXT:    orq %r15, %r10
; FALLBACK6-NEXT:    addq %rax, %rax
; FALLBACK6-NEXT:    shlxq %rbx, %rax, %rax
; FALLBACK6-NEXT:    orq %r13, %rax
; FALLBACK6-NEXT:    addq %rcx, %rcx
; FALLBACK6-NEXT:    shlxq %rbx, %rcx, %rcx
; FALLBACK6-NEXT:    orq %rbp, %rcx
; FALLBACK6-NEXT:    movq %rsi, 56(%rdx)
; FALLBACK6-NEXT:    movq %rcx, 8(%rdx)
; FALLBACK6-NEXT:    movq %rax, 48(%rdx)
; FALLBACK6-NEXT:    movq %r10, 32(%rdx)
; FALLBACK6-NEXT:    movq %r9, 40(%rdx)
; FALLBACK6-NEXT:    movq %rdi, 16(%rdx)
; FALLBACK6-NEXT:    movq %r11, 24(%rdx)
; FALLBACK6-NEXT:    movq %r8, (%rdx)
; FALLBACK6-NEXT:    addq $8, %rsp
; FALLBACK6-NEXT:    popq %rbx
; FALLBACK6-NEXT:    popq %r12
; FALLBACK6-NEXT:    popq %r13
; FALLBACK6-NEXT:    popq %r14
; FALLBACK6-NEXT:    popq %r15
; FALLBACK6-NEXT:    popq %rbp
; FALLBACK6-NEXT:    retq
;
; FALLBACK7-LABEL: ashr_64bytes:
; FALLBACK7:       # %bb.0:
; FALLBACK7-NEXT:    pushq %r15
; FALLBACK7-NEXT:    pushq %r14
; FALLBACK7-NEXT:    pushq %rbx
; FALLBACK7-NEXT:    movups (%rdi), %xmm0
; FALLBACK7-NEXT:    movups 16(%rdi), %xmm1
; FALLBACK7-NEXT:    movups 32(%rdi), %xmm2
; FALLBACK7-NEXT:    movq 48(%rdi), %rcx
; FALLBACK7-NEXT:    movq 56(%rdi), %rdi
; FALLBACK7-NEXT:    movl (%rsi), %eax
; FALLBACK7-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK7-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
; FALLBACK7-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
; FALLBACK7-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
; FALLBACK7-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; FALLBACK7-NEXT:    sarq $63, %rdi
; FALLBACK7-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK7-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK7-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK7-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK7-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK7-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK7-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK7-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK7-NEXT:    leal (,%rax,8), %ecx
; FALLBACK7-NEXT:    andl $56, %ecx
; FALLBACK7-NEXT:    andl $56, %eax
; FALLBACK7-NEXT:    movq -96(%rsp,%rax), %rdi
; FALLBACK7-NEXT:    movq -104(%rsp,%rax), %r9
; FALLBACK7-NEXT:    movq %r9, %rsi
; FALLBACK7-NEXT:    shrdq %cl, %rdi, %rsi
; FALLBACK7-NEXT:    movq -112(%rsp,%rax), %r10
; FALLBACK7-NEXT:    movq %r10, %r8
; FALLBACK7-NEXT:    shrdq %cl, %r9, %r8
; FALLBACK7-NEXT:    movq -80(%rsp,%rax), %r9
; FALLBACK7-NEXT:    movq -88(%rsp,%rax), %r11
; FALLBACK7-NEXT:    movq %r11, %rbx
; FALLBACK7-NEXT:    shrdq %cl, %r9, %rbx
; FALLBACK7-NEXT:    shrdq %cl, %r11, %rdi
; FALLBACK7-NEXT:    movq -72(%rsp,%rax), %r11
; FALLBACK7-NEXT:    shrdq %cl, %r11, %r9
; FALLBACK7-NEXT:    movq -128(%rsp,%rax), %r14
; FALLBACK7-NEXT:    movq -120(%rsp,%rax), %rax
; FALLBACK7-NEXT:    movq %rax, %r15
; FALLBACK7-NEXT:    shrdq %cl, %r10, %r15
; FALLBACK7-NEXT:    sarxq %rcx, %r11, %r10
; FALLBACK7-NEXT:    # kill: def $cl killed $cl killed $rcx
; FALLBACK7-NEXT:    shrdq %cl, %rax, %r14
; FALLBACK7-NEXT:    movq %r15, 8(%rdx)
; FALLBACK7-NEXT:    movq %r9, 48(%rdx)
; FALLBACK7-NEXT:    movq %rdi, 32(%rdx)
; FALLBACK7-NEXT:    movq %rbx, 40(%rdx)
; FALLBACK7-NEXT:    movq %r8, 16(%rdx)
; FALLBACK7-NEXT:    movq %rsi, 24(%rdx)
; FALLBACK7-NEXT:    movq %r14, (%rdx)
; FALLBACK7-NEXT:    movq %r10, 56(%rdx)
; FALLBACK7-NEXT:    popq %rbx
; FALLBACK7-NEXT:    popq %r14
; FALLBACK7-NEXT:    popq %r15
; FALLBACK7-NEXT:    retq
;
; FALLBACK8-LABEL: ashr_64bytes:
; FALLBACK8:       # %bb.0:
; FALLBACK8-NEXT:    pushq %rbp
; FALLBACK8-NEXT:    pushq %r15
; FALLBACK8-NEXT:    pushq %r14
; FALLBACK8-NEXT:    pushq %r13
; FALLBACK8-NEXT:    pushq %r12
; FALLBACK8-NEXT:    pushq %rbx
; FALLBACK8-NEXT:    pushq %rax
; FALLBACK8-NEXT:    vmovups (%rdi), %ymm0
; FALLBACK8-NEXT:    vmovups 32(%rdi), %xmm1
; FALLBACK8-NEXT:    movq 48(%rdi), %rax
; FALLBACK8-NEXT:    movq 56(%rdi), %rcx
; FALLBACK8-NEXT:    movl (%rsi), %edi
; FALLBACK8-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
; FALLBACK8-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
; FALLBACK8-NEXT:    vmovaps %xmm1, -{{[0-9]+}}(%rsp)
; FALLBACK8-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
; FALLBACK8-NEXT:    sarq $63, %rcx
; FALLBACK8-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
; FALLBACK8-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
; FALLBACK8-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
; FALLBACK8-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
; FALLBACK8-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
; FALLBACK8-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
; FALLBACK8-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
; FALLBACK8-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
; FALLBACK8-NEXT:    leal (,%rdi,8), %eax
; FALLBACK8-NEXT:    andl $56, %eax
; FALLBACK8-NEXT:    andl $56, %edi
; FALLBACK8-NEXT:    movq -128(%rsp,%rdi), %r10
; FALLBACK8-NEXT:    movq -120(%rsp,%rdi), %r9
; FALLBACK8-NEXT:    movl %eax, %ecx
; FALLBACK8-NEXT:    shrq %cl, %r10
; FALLBACK8-NEXT:    movl %eax, %esi
; FALLBACK8-NEXT:    notb %sil
; FALLBACK8-NEXT:    leaq (%r9,%r9), %r8
; FALLBACK8-NEXT:    movl %esi, %ecx
; FALLBACK8-NEXT:    shlq %cl, %r8
; FALLBACK8-NEXT:    orq %r10, %r8
; FALLBACK8-NEXT:    movq -104(%rsp,%rdi), %r10
; FALLBACK8-NEXT:    movq %r10, %rbx
; FALLBACK8-NEXT:    movl %eax, %ecx
; FALLBACK8-NEXT:    shrq %cl, %rbx
; FALLBACK8-NEXT:    movq -96(%rsp,%rdi), %r12
; FALLBACK8-NEXT:    leaq (%r12,%r12), %r11
; FALLBACK8-NEXT:    movl %esi, %ecx
; FALLBACK8-NEXT:    shlq %cl, %r11
; FALLBACK8-NEXT:    orq %rbx, %r11
; FALLBACK8-NEXT:    movq -112(%rsp,%rdi), %rbx
; FALLBACK8-NEXT:    movq %rbx, %r14
; FALLBACK8-NEXT:    movl %eax, %ecx
; FALLBACK8-NEXT:    shrq %cl, %r14
; FALLBACK8-NEXT:    addq %r10, %r10
; FALLBACK8-NEXT:    movl %esi, %ecx
; FALLBACK8-NEXT:    shlq %cl, %r10
; FALLBACK8-NEXT:    orq %r14, %r10
; FALLBACK8-NEXT:    movq -88(%rsp,%rdi), %r14
; FALLBACK8-NEXT:    movq %r14, %r13
; FALLBACK8-NEXT:    movl %eax, %ecx
; FALLBACK8-NEXT:    shrq %cl, %r13
; FALLBACK8-NEXT:    movq -80(%rsp,%rdi), %rbp
; FALLBACK8-NEXT:    leaq (%rbp,%rbp), %r15
; FALLBACK8-NEXT:    movl %esi, %ecx
; FALLBACK8-NEXT:    shlq %cl, %r15
; FALLBACK8-NEXT:    orq %r13, %r15
; FALLBACK8-NEXT:    movl %eax, %ecx
; FALLBACK8-NEXT:    shrq %cl, %r12
; FALLBACK8-NEXT:    addq %r14, %r14
; FALLBACK8-NEXT:    movl %esi, %ecx
; FALLBACK8-NEXT:    shlq %cl, %r14
; FALLBACK8-NEXT:    orq %r12, %r14
; FALLBACK8-NEXT:    movl %eax, %ecx
; FALLBACK8-NEXT:    shrq %cl, %rbp
; FALLBACK8-NEXT:    movq -72(%rsp,%rdi), %rdi
; FALLBACK8-NEXT:    leaq (%rdi,%rdi), %r12
; FALLBACK8-NEXT:    movl %esi, %ecx
; FALLBACK8-NEXT:    shlq %cl, %r12
; FALLBACK8-NEXT:    orq %rbp, %r12
; FALLBACK8-NEXT:    movl %eax, %ecx
; FALLBACK8-NEXT:    shrq %cl, %r9
; FALLBACK8-NEXT:    addq %rbx, %rbx
; FALLBACK8-NEXT:    movl %esi, %ecx
; FALLBACK8-NEXT:    shlq %cl, %rbx
; FALLBACK8-NEXT:    orq %r9, %rbx
; FALLBACK8-NEXT:    movl %eax, %ecx
; FALLBACK8-NEXT:    sarq %cl, %rdi
; FALLBACK8-NEXT:    movq %rdi, 56(%rdx)
; FALLBACK8-NEXT:    movq %rbx, 8(%rdx)
; FALLBACK8-NEXT:    movq %r12, 48(%rdx)
; FALLBACK8-NEXT:    movq %r14, 32(%rdx)
; FALLBACK8-NEXT:    movq %r15, 40(%rdx)
; FALLBACK8-NEXT:    movq %r10, 16(%rdx)
; FALLBACK8-NEXT:    movq %r11, 24(%rdx)
; FALLBACK8-NEXT:    movq %r8, (%rdx)
; FALLBACK8-NEXT:    addq $8, %rsp
; FALLBACK8-NEXT:    popq %rbx
; FALLBACK8-NEXT:    popq %r12
; FALLBACK8-NEXT:    popq %r13
; FALLBACK8-NEXT:    popq %r14
; FALLBACK8-NEXT:    popq %r15
; FALLBACK8-NEXT:    popq %rbp
; FALLBACK8-NEXT:    vzeroupper
; FALLBACK8-NEXT:    retq
;
; FALLBACK9-LABEL: ashr_64bytes:
; FALLBACK9:       # %bb.0:
; FALLBACK9-NEXT:    pushq %r15
; FALLBACK9-NEXT:    pushq %r14
; FALLBACK9-NEXT:    pushq %rbx
; FALLBACK9-NEXT:    vmovups (%rdi), %ymm0
; FALLBACK9-NEXT:    vmovups 32(%rdi), %xmm1
; FALLBACK9-NEXT:    movq 48(%rdi), %rcx
; FALLBACK9-NEXT:    movq 56(%rdi), %rdi
; FALLBACK9-NEXT:    movl (%rsi), %eax
; FALLBACK9-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK9-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
; FALLBACK9-NEXT:    vmovaps %xmm1, -{{[0-9]+}}(%rsp)
; FALLBACK9-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
; FALLBACK9-NEXT:    sarq $63, %rdi
; FALLBACK9-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK9-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK9-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK9-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK9-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK9-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK9-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK9-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK9-NEXT:    leal (,%rax,8), %ecx
; FALLBACK9-NEXT:    andl $56, %ecx
; FALLBACK9-NEXT:    andl $56, %eax
; FALLBACK9-NEXT:    movq -96(%rsp,%rax), %rdi
; FALLBACK9-NEXT:    movq -104(%rsp,%rax), %r9
; FALLBACK9-NEXT:    movq %r9, %rsi
; FALLBACK9-NEXT:    shrdq %cl, %rdi, %rsi
; FALLBACK9-NEXT:    movq -112(%rsp,%rax), %r10
; FALLBACK9-NEXT:    movq %r10, %r8
; FALLBACK9-NEXT:    shrdq %cl, %r9, %r8
; FALLBACK9-NEXT:    movq -80(%rsp,%rax), %r9
; FALLBACK9-NEXT:    movq -88(%rsp,%rax), %r11
; FALLBACK9-NEXT:    movq %r11, %rbx
; FALLBACK9-NEXT:    shrdq %cl, %r9, %rbx
; FALLBACK9-NEXT:    shrdq %cl, %r11, %rdi
; FALLBACK9-NEXT:    movq -72(%rsp,%rax), %r11
; FALLBACK9-NEXT:    shrdq %cl, %r11, %r9
; FALLBACK9-NEXT:    movq -128(%rsp,%rax), %r14
; FALLBACK9-NEXT:    movq -120(%rsp,%rax), %rax
; FALLBACK9-NEXT:    movq %rax, %r15
; FALLBACK9-NEXT:    shrdq %cl, %r10, %r15
; FALLBACK9-NEXT:    shrdq %cl, %rax, %r14
; FALLBACK9-NEXT:    # kill: def $cl killed $cl killed $ecx
; FALLBACK9-NEXT:    sarq %cl, %r11
; FALLBACK9-NEXT:    movq %r15, 8(%rdx)
; FALLBACK9-NEXT:    movq %r9, 48(%rdx)
; FALLBACK9-NEXT:    movq %r11, 56(%rdx)
; FALLBACK9-NEXT:    movq %rdi, 32(%rdx)
; FALLBACK9-NEXT:    movq %rbx, 40(%rdx)
; FALLBACK9-NEXT:    movq %r8, 16(%rdx)
; FALLBACK9-NEXT:    movq %rsi, 24(%rdx)
; FALLBACK9-NEXT:    movq %r14, (%rdx)
; FALLBACK9-NEXT:    popq %rbx
; FALLBACK9-NEXT:    popq %r14
; FALLBACK9-NEXT:    popq %r15
; FALLBACK9-NEXT:    vzeroupper
; FALLBACK9-NEXT:    retq
;
; FALLBACK10-LABEL: ashr_64bytes:
; FALLBACK10:       # %bb.0:
; FALLBACK10-NEXT:    pushq %rbp
; FALLBACK10-NEXT:    pushq %r15
; FALLBACK10-NEXT:    pushq %r14
; FALLBACK10-NEXT:    pushq %r13
; FALLBACK10-NEXT:    pushq %r12
; FALLBACK10-NEXT:    pushq %rbx
; FALLBACK10-NEXT:    pushq %rax
; FALLBACK10-NEXT:    vmovups (%rdi), %ymm0
; FALLBACK10-NEXT:    vmovups 32(%rdi), %xmm1
; FALLBACK10-NEXT:    movq 48(%rdi), %rcx
; FALLBACK10-NEXT:    movq 56(%rdi), %rdi
; FALLBACK10-NEXT:    movl (%rsi), %eax
; FALLBACK10-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK10-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
; FALLBACK10-NEXT:    vmovaps %xmm1, -{{[0-9]+}}(%rsp)
; FALLBACK10-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
; FALLBACK10-NEXT:    sarq $63, %rdi
; FALLBACK10-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK10-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK10-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK10-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK10-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK10-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK10-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK10-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK10-NEXT:    leal (,%rax,8), %esi
; FALLBACK10-NEXT:    andl $56, %esi
; FALLBACK10-NEXT:    andl $56, %eax
; FALLBACK10-NEXT:    shrxq %rsi, -128(%rsp,%rax), %r11
; FALLBACK10-NEXT:    movq -112(%rsp,%rax), %rcx
; FALLBACK10-NEXT:    movq -104(%rsp,%rax), %rdi
; FALLBACK10-NEXT:    shrxq %rsi, %rdi, %r12
; FALLBACK10-NEXT:    movq -96(%rsp,%rax), %r13
; FALLBACK10-NEXT:    shrxq %rsi, %rcx, %r9
; FALLBACK10-NEXT:    movq -88(%rsp,%rax), %r10
; FALLBACK10-NEXT:    shrxq %rsi, %r10, %r14
; FALLBACK10-NEXT:    shrxq %rsi, %r13, %r15
; FALLBACK10-NEXT:    movl %esi, %ebx
; FALLBACK10-NEXT:    notb %bl
; FALLBACK10-NEXT:    movq -120(%rsp,%rax), %rbp
; FALLBACK10-NEXT:    leaq (%rbp,%rbp), %r8
; FALLBACK10-NEXT:    shlxq %rbx, %r8, %r8
; FALLBACK10-NEXT:    orq %r11, %r8
; FALLBACK10-NEXT:    leaq (%r13,%r13), %r11
; FALLBACK10-NEXT:    shlxq %rbx, %r11, %r11
; FALLBACK10-NEXT:    orq %r12, %r11
; FALLBACK10-NEXT:    movq -80(%rsp,%rax), %r12
; FALLBACK10-NEXT:    shrxq %rsi, %r12, %r13
; FALLBACK10-NEXT:    shrxq %rsi, %rbp, %rbp
; FALLBACK10-NEXT:    movq -72(%rsp,%rax), %rax
; FALLBACK10-NEXT:    sarxq %rsi, %rax, %rsi
; FALLBACK10-NEXT:    addq %rdi, %rdi
; FALLBACK10-NEXT:    shlxq %rbx, %rdi, %rdi
; FALLBACK10-NEXT:    orq %r9, %rdi
; FALLBACK10-NEXT:    leaq (%r12,%r12), %r9
; FALLBACK10-NEXT:    shlxq %rbx, %r9, %r9
; FALLBACK10-NEXT:    orq %r14, %r9
; FALLBACK10-NEXT:    addq %r10, %r10
; FALLBACK10-NEXT:    shlxq %rbx, %r10, %r10
; FALLBACK10-NEXT:    orq %r15, %r10
; FALLBACK10-NEXT:    addq %rax, %rax
; FALLBACK10-NEXT:    shlxq %rbx, %rax, %rax
; FALLBACK10-NEXT:    orq %r13, %rax
; FALLBACK10-NEXT:    addq %rcx, %rcx
; FALLBACK10-NEXT:    shlxq %rbx, %rcx, %rcx
; FALLBACK10-NEXT:    orq %rbp, %rcx
; FALLBACK10-NEXT:    movq %rsi, 56(%rdx)
; FALLBACK10-NEXT:    movq %rcx, 8(%rdx)
; FALLBACK10-NEXT:    movq %rax, 48(%rdx)
; FALLBACK10-NEXT:    movq %r10, 32(%rdx)
; FALLBACK10-NEXT:    movq %r9, 40(%rdx)
; FALLBACK10-NEXT:    movq %rdi, 16(%rdx)
; FALLBACK10-NEXT:    movq %r11, 24(%rdx)
; FALLBACK10-NEXT:    movq %r8, (%rdx)
; FALLBACK10-NEXT:    addq $8, %rsp
; FALLBACK10-NEXT:    popq %rbx
; FALLBACK10-NEXT:    popq %r12
; FALLBACK10-NEXT:    popq %r13
; FALLBACK10-NEXT:    popq %r14
; FALLBACK10-NEXT:    popq %r15
; FALLBACK10-NEXT:    popq %rbp
; FALLBACK10-NEXT:    vzeroupper
; FALLBACK10-NEXT:    retq
;
; FALLBACK11-LABEL: ashr_64bytes:
; FALLBACK11:       # %bb.0:
; FALLBACK11-NEXT:    pushq %r15
; FALLBACK11-NEXT:    pushq %r14
; FALLBACK11-NEXT:    pushq %rbx
; FALLBACK11-NEXT:    vmovups (%rdi), %ymm0
; FALLBACK11-NEXT:    vmovups 32(%rdi), %xmm1
; FALLBACK11-NEXT:    movq 48(%rdi), %rcx
; FALLBACK11-NEXT:    movq 56(%rdi), %rdi
; FALLBACK11-NEXT:    movl (%rsi), %eax
; FALLBACK11-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK11-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
; FALLBACK11-NEXT:    vmovaps %xmm1, -{{[0-9]+}}(%rsp)
; FALLBACK11-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
; FALLBACK11-NEXT:    sarq $63, %rdi
; FALLBACK11-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK11-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK11-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK11-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK11-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK11-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK11-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK11-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK11-NEXT:    leal (,%rax,8), %ecx
; FALLBACK11-NEXT:    andl $56, %ecx
; FALLBACK11-NEXT:    andl $56, %eax
; FALLBACK11-NEXT:    movq -96(%rsp,%rax), %rdi
; FALLBACK11-NEXT:    movq -104(%rsp,%rax), %r9
; FALLBACK11-NEXT:    movq %r9, %rsi
; FALLBACK11-NEXT:    shrdq %cl, %rdi, %rsi
; FALLBACK11-NEXT:    movq -112(%rsp,%rax), %r10
; FALLBACK11-NEXT:    movq %r10, %r8
; FALLBACK11-NEXT:    shrdq %cl, %r9, %r8
; FALLBACK11-NEXT:    movq -80(%rsp,%rax), %r9
; FALLBACK11-NEXT:    movq -88(%rsp,%rax), %r11
; FALLBACK11-NEXT:    movq %r11, %rbx
; FALLBACK11-NEXT:    shrdq %cl, %r9, %rbx
; FALLBACK11-NEXT:    shrdq %cl, %r11, %rdi
; FALLBACK11-NEXT:    movq -72(%rsp,%rax), %r11
; FALLBACK11-NEXT:    shrdq %cl, %r11, %r9
; FALLBACK11-NEXT:    movq -128(%rsp,%rax), %r14
; FALLBACK11-NEXT:    movq -120(%rsp,%rax), %rax
; FALLBACK11-NEXT:    movq %rax, %r15
; FALLBACK11-NEXT:    shrdq %cl, %r10, %r15
; FALLBACK11-NEXT:    sarxq %rcx, %r11, %r10
; FALLBACK11-NEXT:    # kill: def $cl killed $cl killed $rcx
; FALLBACK11-NEXT:    shrdq %cl, %rax, %r14
; FALLBACK11-NEXT:    movq %r15, 8(%rdx)
; FALLBACK11-NEXT:    movq %r9, 48(%rdx)
; FALLBACK11-NEXT:    movq %rdi, 32(%rdx)
; FALLBACK11-NEXT:    movq %rbx, 40(%rdx)
; FALLBACK11-NEXT:    movq %r8, 16(%rdx)
; FALLBACK11-NEXT:    movq %rsi, 24(%rdx)
; FALLBACK11-NEXT:    movq %r14, (%rdx)
; FALLBACK11-NEXT:    movq %r10, 56(%rdx)
; FALLBACK11-NEXT:    popq %rbx
; FALLBACK11-NEXT:    popq %r14
; FALLBACK11-NEXT:    popq %r15
; FALLBACK11-NEXT:    vzeroupper
; FALLBACK11-NEXT:    retq
;
; FALLBACK12-LABEL: ashr_64bytes:
; FALLBACK12:       # %bb.0:
; FALLBACK12-NEXT:    pushq %rbp
; FALLBACK12-NEXT:    pushq %r15
; FALLBACK12-NEXT:    pushq %r14
; FALLBACK12-NEXT:    pushq %r13
; FALLBACK12-NEXT:    pushq %r12
; FALLBACK12-NEXT:    pushq %rbx
; FALLBACK12-NEXT:    pushq %rax
; FALLBACK12-NEXT:    vmovups (%rdi), %ymm0
; FALLBACK12-NEXT:    vmovups 32(%rdi), %xmm1
; FALLBACK12-NEXT:    movq 48(%rdi), %rax
; FALLBACK12-NEXT:    movq 56(%rdi), %rcx
; FALLBACK12-NEXT:    movl (%rsi), %edi
; FALLBACK12-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
; FALLBACK12-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
; FALLBACK12-NEXT:    vmovaps %xmm1, -{{[0-9]+}}(%rsp)
; FALLBACK12-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
; FALLBACK12-NEXT:    sarq $63, %rcx
; FALLBACK12-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
; FALLBACK12-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
; FALLBACK12-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
; FALLBACK12-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
; FALLBACK12-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
; FALLBACK12-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
; FALLBACK12-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
; FALLBACK12-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
; FALLBACK12-NEXT:    leal (,%rdi,8), %eax
; FALLBACK12-NEXT:    andl $56, %eax
; FALLBACK12-NEXT:    andl $56, %edi
; FALLBACK12-NEXT:    movq -128(%rsp,%rdi), %r10
; FALLBACK12-NEXT:    movq -120(%rsp,%rdi), %r9
; FALLBACK12-NEXT:    movl %eax, %ecx
; FALLBACK12-NEXT:    shrq %cl, %r10
; FALLBACK12-NEXT:    movl %eax, %esi
; FALLBACK12-NEXT:    notb %sil
; FALLBACK12-NEXT:    leaq (%r9,%r9), %r8
; FALLBACK12-NEXT:    movl %esi, %ecx
; FALLBACK12-NEXT:    shlq %cl, %r8
; FALLBACK12-NEXT:    orq %r10, %r8
; FALLBACK12-NEXT:    movq -104(%rsp,%rdi), %r10
; FALLBACK12-NEXT:    movq %r10, %rbx
; FALLBACK12-NEXT:    movl %eax, %ecx
; FALLBACK12-NEXT:    shrq %cl, %rbx
; FALLBACK12-NEXT:    movq -96(%rsp,%rdi), %r12
; FALLBACK12-NEXT:    leaq (%r12,%r12), %r11
; FALLBACK12-NEXT:    movl %esi, %ecx
; FALLBACK12-NEXT:    shlq %cl, %r11
; FALLBACK12-NEXT:    orq %rbx, %r11
; FALLBACK12-NEXT:    movq -112(%rsp,%rdi), %rbx
; FALLBACK12-NEXT:    movq %rbx, %r14
; FALLBACK12-NEXT:    movl %eax, %ecx
; FALLBACK12-NEXT:    shrq %cl, %r14
; FALLBACK12-NEXT:    addq %r10, %r10
; FALLBACK12-NEXT:    movl %esi, %ecx
; FALLBACK12-NEXT:    shlq %cl, %r10
; FALLBACK12-NEXT:    orq %r14, %r10
; FALLBACK12-NEXT:    movq -88(%rsp,%rdi), %r14
; FALLBACK12-NEXT:    movq %r14, %r13
; FALLBACK12-NEXT:    movl %eax, %ecx
; FALLBACK12-NEXT:    shrq %cl, %r13
; FALLBACK12-NEXT:    movq -80(%rsp,%rdi), %rbp
; FALLBACK12-NEXT:    leaq (%rbp,%rbp), %r15
; FALLBACK12-NEXT:    movl %esi, %ecx
; FALLBACK12-NEXT:    shlq %cl, %r15
; FALLBACK12-NEXT:    orq %r13, %r15
; FALLBACK12-NEXT:    movl %eax, %ecx
; FALLBACK12-NEXT:    shrq %cl, %r12
; FALLBACK12-NEXT:    addq %r14, %r14
; FALLBACK12-NEXT:    movl %esi, %ecx
; FALLBACK12-NEXT:    shlq %cl, %r14
; FALLBACK12-NEXT:    orq %r12, %r14
; FALLBACK12-NEXT:    movl %eax, %ecx
; FALLBACK12-NEXT:    shrq %cl, %rbp
; FALLBACK12-NEXT:    movq -72(%rsp,%rdi), %rdi
; FALLBACK12-NEXT:    leaq (%rdi,%rdi), %r12
; FALLBACK12-NEXT:    movl %esi, %ecx
; FALLBACK12-NEXT:    shlq %cl, %r12
; FALLBACK12-NEXT:    orq %rbp, %r12
; FALLBACK12-NEXT:    movl %eax, %ecx
; FALLBACK12-NEXT:    shrq %cl, %r9
; FALLBACK12-NEXT:    addq %rbx, %rbx
; FALLBACK12-NEXT:    movl %esi, %ecx
; FALLBACK12-NEXT:    shlq %cl, %rbx
; FALLBACK12-NEXT:    orq %r9, %rbx
; FALLBACK12-NEXT:    movl %eax, %ecx
; FALLBACK12-NEXT:    sarq %cl, %rdi
; FALLBACK12-NEXT:    movq %rdi, 56(%rdx)
; FALLBACK12-NEXT:    movq %rbx, 8(%rdx)
; FALLBACK12-NEXT:    movq %r12, 48(%rdx)
; FALLBACK12-NEXT:    movq %r14, 32(%rdx)
; FALLBACK12-NEXT:    movq %r15, 40(%rdx)
; FALLBACK12-NEXT:    movq %r10, 16(%rdx)
; FALLBACK12-NEXT:    movq %r11, 24(%rdx)
; FALLBACK12-NEXT:    movq %r8, (%rdx)
; FALLBACK12-NEXT:    addq $8, %rsp
; FALLBACK12-NEXT:    popq %rbx
; FALLBACK12-NEXT:    popq %r12
; FALLBACK12-NEXT:    popq %r13
; FALLBACK12-NEXT:    popq %r14
; FALLBACK12-NEXT:    popq %r15
; FALLBACK12-NEXT:    popq %rbp
; FALLBACK12-NEXT:    vzeroupper
; FALLBACK12-NEXT:    retq
;
; FALLBACK13-LABEL: ashr_64bytes:
; FALLBACK13:       # %bb.0:
; FALLBACK13-NEXT:    pushq %r15
; FALLBACK13-NEXT:    pushq %r14
; FALLBACK13-NEXT:    pushq %rbx
; FALLBACK13-NEXT:    vmovups (%rdi), %ymm0
; FALLBACK13-NEXT:    vmovups 32(%rdi), %xmm1
; FALLBACK13-NEXT:    movq 48(%rdi), %rcx
; FALLBACK13-NEXT:    movq 56(%rdi), %rdi
; FALLBACK13-NEXT:    movl (%rsi), %eax
; FALLBACK13-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK13-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
; FALLBACK13-NEXT:    vmovaps %xmm1, -{{[0-9]+}}(%rsp)
; FALLBACK13-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
; FALLBACK13-NEXT:    sarq $63, %rdi
; FALLBACK13-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK13-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK13-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK13-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK13-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK13-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK13-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK13-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK13-NEXT:    leal (,%rax,8), %ecx
; FALLBACK13-NEXT:    andl $56, %ecx
; FALLBACK13-NEXT:    andl $56, %eax
; FALLBACK13-NEXT:    movq -96(%rsp,%rax), %rdi
; FALLBACK13-NEXT:    movq -104(%rsp,%rax), %r9
; FALLBACK13-NEXT:    movq %r9, %rsi
; FALLBACK13-NEXT:    shrdq %cl, %rdi, %rsi
; FALLBACK13-NEXT:    movq -112(%rsp,%rax), %r10
; FALLBACK13-NEXT:    movq %r10, %r8
; FALLBACK13-NEXT:    shrdq %cl, %r9, %r8
; FALLBACK13-NEXT:    movq -80(%rsp,%rax), %r9
; FALLBACK13-NEXT:    movq -88(%rsp,%rax), %r11
; FALLBACK13-NEXT:    movq %r11, %rbx
; FALLBACK13-NEXT:    shrdq %cl, %r9, %rbx
; FALLBACK13-NEXT:    shrdq %cl, %r11, %rdi
; FALLBACK13-NEXT:    movq -72(%rsp,%rax), %r11
; FALLBACK13-NEXT:    shrdq %cl, %r11, %r9
; FALLBACK13-NEXT:    movq -128(%rsp,%rax), %r14
; FALLBACK13-NEXT:    movq -120(%rsp,%rax), %rax
; FALLBACK13-NEXT:    movq %rax, %r15
; FALLBACK13-NEXT:    shrdq %cl, %r10, %r15
; FALLBACK13-NEXT:    shrdq %cl, %rax, %r14
; FALLBACK13-NEXT:    # kill: def $cl killed $cl killed $ecx
; FALLBACK13-NEXT:    sarq %cl, %r11
; FALLBACK13-NEXT:    movq %r15, 8(%rdx)
; FALLBACK13-NEXT:    movq %r9, 48(%rdx)
; FALLBACK13-NEXT:    movq %r11, 56(%rdx)
; FALLBACK13-NEXT:    movq %rdi, 32(%rdx)
; FALLBACK13-NEXT:    movq %rbx, 40(%rdx)
; FALLBACK13-NEXT:    movq %r8, 16(%rdx)
; FALLBACK13-NEXT:    movq %rsi, 24(%rdx)
; FALLBACK13-NEXT:    movq %r14, (%rdx)
; FALLBACK13-NEXT:    popq %rbx
; FALLBACK13-NEXT:    popq %r14
; FALLBACK13-NEXT:    popq %r15
; FALLBACK13-NEXT:    vzeroupper
; FALLBACK13-NEXT:    retq
;
; FALLBACK14-LABEL: ashr_64bytes:
; FALLBACK14:       # %bb.0:
; FALLBACK14-NEXT:    pushq %rbp
; FALLBACK14-NEXT:    pushq %r15
; FALLBACK14-NEXT:    pushq %r14
; FALLBACK14-NEXT:    pushq %r13
; FALLBACK14-NEXT:    pushq %r12
; FALLBACK14-NEXT:    pushq %rbx
; FALLBACK14-NEXT:    pushq %rax
; FALLBACK14-NEXT:    vmovups (%rdi), %ymm0
; FALLBACK14-NEXT:    vmovups 32(%rdi), %xmm1
; FALLBACK14-NEXT:    movq 48(%rdi), %rcx
; FALLBACK14-NEXT:    movq 56(%rdi), %rdi
; FALLBACK14-NEXT:    movl (%rsi), %eax
; FALLBACK14-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK14-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
; FALLBACK14-NEXT:    vmovaps %xmm1, -{{[0-9]+}}(%rsp)
; FALLBACK14-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
; FALLBACK14-NEXT:    sarq $63, %rdi
; FALLBACK14-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK14-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK14-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK14-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK14-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK14-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK14-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK14-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK14-NEXT:    leal (,%rax,8), %esi
; FALLBACK14-NEXT:    andl $56, %esi
; FALLBACK14-NEXT:    andl $56, %eax
; FALLBACK14-NEXT:    shrxq %rsi, -128(%rsp,%rax), %r11
; FALLBACK14-NEXT:    movq -112(%rsp,%rax), %rcx
; FALLBACK14-NEXT:    movq -104(%rsp,%rax), %rdi
; FALLBACK14-NEXT:    shrxq %rsi, %rdi, %r12
; FALLBACK14-NEXT:    movq -96(%rsp,%rax), %r13
; FALLBACK14-NEXT:    shrxq %rsi, %rcx, %r9
; FALLBACK14-NEXT:    movq -88(%rsp,%rax), %r10
; FALLBACK14-NEXT:    shrxq %rsi, %r10, %r14
; FALLBACK14-NEXT:    shrxq %rsi, %r13, %r15
; FALLBACK14-NEXT:    movl %esi, %ebx
; FALLBACK14-NEXT:    notb %bl
; FALLBACK14-NEXT:    movq -120(%rsp,%rax), %rbp
; FALLBACK14-NEXT:    leaq (%rbp,%rbp), %r8
; FALLBACK14-NEXT:    shlxq %rbx, %r8, %r8
; FALLBACK14-NEXT:    orq %r11, %r8
; FALLBACK14-NEXT:    leaq (%r13,%r13), %r11
; FALLBACK14-NEXT:    shlxq %rbx, %r11, %r11
; FALLBACK14-NEXT:    orq %r12, %r11
; FALLBACK14-NEXT:    movq -80(%rsp,%rax), %r12
; FALLBACK14-NEXT:    shrxq %rsi, %r12, %r13
; FALLBACK14-NEXT:    shrxq %rsi, %rbp, %rbp
; FALLBACK14-NEXT:    movq -72(%rsp,%rax), %rax
; FALLBACK14-NEXT:    sarxq %rsi, %rax, %rsi
; FALLBACK14-NEXT:    addq %rdi, %rdi
; FALLBACK14-NEXT:    shlxq %rbx, %rdi, %rdi
; FALLBACK14-NEXT:    orq %r9, %rdi
; FALLBACK14-NEXT:    leaq (%r12,%r12), %r9
; FALLBACK14-NEXT:    shlxq %rbx, %r9, %r9
; FALLBACK14-NEXT:    orq %r14, %r9
; FALLBACK14-NEXT:    addq %r10, %r10
; FALLBACK14-NEXT:    shlxq %rbx, %r10, %r10
; FALLBACK14-NEXT:    orq %r15, %r10
; FALLBACK14-NEXT:    addq %rax, %rax
; FALLBACK14-NEXT:    shlxq %rbx, %rax, %rax
; FALLBACK14-NEXT:    orq %r13, %rax
; FALLBACK14-NEXT:    addq %rcx, %rcx
; FALLBACK14-NEXT:    shlxq %rbx, %rcx, %rcx
; FALLBACK14-NEXT:    orq %rbp, %rcx
; FALLBACK14-NEXT:    movq %rsi, 56(%rdx)
; FALLBACK14-NEXT:    movq %rcx, 8(%rdx)
; FALLBACK14-NEXT:    movq %rax, 48(%rdx)
; FALLBACK14-NEXT:    movq %r10, 32(%rdx)
; FALLBACK14-NEXT:    movq %r9, 40(%rdx)
; FALLBACK14-NEXT:    movq %rdi, 16(%rdx)
; FALLBACK14-NEXT:    movq %r11, 24(%rdx)
; FALLBACK14-NEXT:    movq %r8, (%rdx)
; FALLBACK14-NEXT:    addq $8, %rsp
; FALLBACK14-NEXT:    popq %rbx
; FALLBACK14-NEXT:    popq %r12
; FALLBACK14-NEXT:    popq %r13
; FALLBACK14-NEXT:    popq %r14
; FALLBACK14-NEXT:    popq %r15
; FALLBACK14-NEXT:    popq %rbp
; FALLBACK14-NEXT:    vzeroupper
; FALLBACK14-NEXT:    retq
;
; FALLBACK15-LABEL: ashr_64bytes:
; FALLBACK15:       # %bb.0:
; FALLBACK15-NEXT:    pushq %r15
; FALLBACK15-NEXT:    pushq %r14
; FALLBACK15-NEXT:    pushq %rbx
; FALLBACK15-NEXT:    vmovups (%rdi), %ymm0
; FALLBACK15-NEXT:    vmovups 32(%rdi), %xmm1
; FALLBACK15-NEXT:    movq 48(%rdi), %rcx
; FALLBACK15-NEXT:    movq 56(%rdi), %rdi
; FALLBACK15-NEXT:    movl (%rsi), %eax
; FALLBACK15-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK15-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
; FALLBACK15-NEXT:    vmovaps %xmm1, -{{[0-9]+}}(%rsp)
; FALLBACK15-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
; FALLBACK15-NEXT:    sarq $63, %rdi
; FALLBACK15-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK15-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK15-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK15-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK15-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK15-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK15-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK15-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; FALLBACK15-NEXT:    leal (,%rax,8), %ecx
; FALLBACK15-NEXT:    andl $56, %ecx
; FALLBACK15-NEXT:    andl $56, %eax
; FALLBACK15-NEXT:    movq -96(%rsp,%rax), %rdi
; FALLBACK15-NEXT:    movq -104(%rsp,%rax), %r9
; FALLBACK15-NEXT:    movq %r9, %rsi
; FALLBACK15-NEXT:    shrdq %cl, %rdi, %rsi
; FALLBACK15-NEXT:    movq -112(%rsp,%rax), %r10
; FALLBACK15-NEXT:    movq %r10, %r8
; FALLBACK15-NEXT:    shrdq %cl, %r9, %r8
; FALLBACK15-NEXT:    movq -80(%rsp,%rax), %r9
; FALLBACK15-NEXT:    movq -88(%rsp,%rax), %r11
; FALLBACK15-NEXT:    movq %r11, %rbx
; FALLBACK15-NEXT:    shrdq %cl, %r9, %rbx
; FALLBACK15-NEXT:    shrdq %cl, %r11, %rdi
; FALLBACK15-NEXT:    movq -72(%rsp,%rax), %r11
; FALLBACK15-NEXT:    shrdq %cl, %r11, %r9
; FALLBACK15-NEXT:    movq -128(%rsp,%rax), %r14
; FALLBACK15-NEXT:    movq -120(%rsp,%rax), %rax
; FALLBACK15-NEXT:    movq %rax, %r15
; FALLBACK15-NEXT:    shrdq %cl, %r10, %r15
; FALLBACK15-NEXT:    sarxq %rcx, %r11, %r10
; FALLBACK15-NEXT:    # kill: def $cl killed $cl killed $rcx
; FALLBACK15-NEXT:    shrdq %cl, %rax, %r14
; FALLBACK15-NEXT:    movq %r15, 8(%rdx)
; FALLBACK15-NEXT:    movq %r9, 48(%rdx)
; FALLBACK15-NEXT:    movq %rdi, 32(%rdx)
; FALLBACK15-NEXT:    movq %rbx, 40(%rdx)
; FALLBACK15-NEXT:    movq %r8, 16(%rdx)
; FALLBACK15-NEXT:    movq %rsi, 24(%rdx)
; FALLBACK15-NEXT:    movq %r14, (%rdx)
; FALLBACK15-NEXT:    movq %r10, 56(%rdx)
; FALLBACK15-NEXT:    popq %rbx
; FALLBACK15-NEXT:    popq %r14
; FALLBACK15-NEXT:    popq %r15
; FALLBACK15-NEXT:    vzeroupper
; FALLBACK15-NEXT:    retq
;
; FALLBACK16-LABEL: ashr_64bytes:
; FALLBACK16:       # %bb.0:
; FALLBACK16-NEXT:    pushl %ebp
; FALLBACK16-NEXT:    pushl %ebx
; FALLBACK16-NEXT:    pushl %edi
; FALLBACK16-NEXT:    pushl %esi
; FALLBACK16-NEXT:    subl $204, %esp
; FALLBACK16-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; FALLBACK16-NEXT:    movl (%ecx), %eax
; FALLBACK16-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK16-NEXT:    movl 4(%ecx), %eax
; FALLBACK16-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK16-NEXT:    movl 8(%ecx), %eax
; FALLBACK16-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK16-NEXT:    movl 12(%ecx), %eax
; FALLBACK16-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK16-NEXT:    movl 16(%ecx), %eax
; FALLBACK16-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK16-NEXT:    movl 20(%ecx), %eax
; FALLBACK16-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK16-NEXT:    movl 24(%ecx), %eax
; FALLBACK16-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK16-NEXT:    movl 28(%ecx), %eax
; FALLBACK16-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK16-NEXT:    movl 32(%ecx), %eax
; FALLBACK16-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK16-NEXT:    movl 36(%ecx), %eax
; FALLBACK16-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK16-NEXT:    movl 40(%ecx), %ebx
; FALLBACK16-NEXT:    movl 44(%ecx), %edi
; FALLBACK16-NEXT:    movl 48(%ecx), %esi
; FALLBACK16-NEXT:    movl 52(%ecx), %edx
; FALLBACK16-NEXT:    movl 56(%ecx), %eax
; FALLBACK16-NEXT:    movl 60(%ecx), %ecx
; FALLBACK16-NEXT:    movl {{[0-9]+}}(%esp), %ebp
; FALLBACK16-NEXT:    movl (%ebp), %ebp
; FALLBACK16-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK16-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK16-NEXT:    movl %esi, {{[0-9]+}}(%esp)
; FALLBACK16-NEXT:    movl %edi, {{[0-9]+}}(%esp)
; FALLBACK16-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK16-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK16-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK16-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK16-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK16-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK16-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK16-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK16-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK16-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK16-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK16-NEXT:    sarl $31, %ecx
; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK16-NEXT:    movl %ebp, %ecx
; FALLBACK16-NEXT:    movl %ebp, %esi
; FALLBACK16-NEXT:    andl $60, %esi
; FALLBACK16-NEXT:    movl 68(%esp,%esi), %edx
; FALLBACK16-NEXT:    shll $3, %ecx
; FALLBACK16-NEXT:    andl $24, %ecx
; FALLBACK16-NEXT:    movl %edx, %eax
; FALLBACK16-NEXT:    shrl %cl, %eax
; FALLBACK16-NEXT:    movl 72(%esp,%esi), %edi
; FALLBACK16-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK16-NEXT:    addl %edi, %edi
; FALLBACK16-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
; FALLBACK16-NEXT:    movl %ecx, %ebx
; FALLBACK16-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
; FALLBACK16-NEXT:    notb %ch
; FALLBACK16-NEXT:    movb %ch, %cl
; FALLBACK16-NEXT:    movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
; FALLBACK16-NEXT:    shll %cl, %edi
; FALLBACK16-NEXT:    orl %eax, %edi
; FALLBACK16-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK16-NEXT:    movl 64(%esp,%esi), %eax
; FALLBACK16-NEXT:    movb %bl, %cl
; FALLBACK16-NEXT:    shrl %cl, %eax
; FALLBACK16-NEXT:    addl %edx, %edx
; FALLBACK16-NEXT:    movb %ch, %cl
; FALLBACK16-NEXT:    shll %cl, %edx
; FALLBACK16-NEXT:    orl %eax, %edx
; FALLBACK16-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK16-NEXT:    movl 76(%esp,%esi), %ebp
; FALLBACK16-NEXT:    movl %ebp, %edx
; FALLBACK16-NEXT:    movb %bl, %cl
; FALLBACK16-NEXT:    shrl %cl, %edx
; FALLBACK16-NEXT:    movl 80(%esp,%esi), %edi
; FALLBACK16-NEXT:    leal (%edi,%edi), %eax
; FALLBACK16-NEXT:    movb %ch, %cl
; FALLBACK16-NEXT:    shll %cl, %eax
; FALLBACK16-NEXT:    orl %edx, %eax
; FALLBACK16-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK16-NEXT:    movb %bl, %cl
; FALLBACK16-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK16-NEXT:    shrl %cl, %eax
; FALLBACK16-NEXT:    addl %ebp, %ebp
; FALLBACK16-NEXT:    movb %ch, %cl
; FALLBACK16-NEXT:    shll %cl, %ebp
; FALLBACK16-NEXT:    orl %eax, %ebp
; FALLBACK16-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK16-NEXT:    movl %esi, %edx
; FALLBACK16-NEXT:    movl 84(%esp,%esi), %eax
; FALLBACK16-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK16-NEXT:    movb %bl, %cl
; FALLBACK16-NEXT:    shrl %cl, %eax
; FALLBACK16-NEXT:    movl 88(%esp,%esi), %esi
; FALLBACK16-NEXT:    leal (%esi,%esi), %ebp
; FALLBACK16-NEXT:    movb %ch, %cl
; FALLBACK16-NEXT:    shll %cl, %ebp
; FALLBACK16-NEXT:    orl %eax, %ebp
; FALLBACK16-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK16-NEXT:    movb %bl, %cl
; FALLBACK16-NEXT:    shrl %cl, %edi
; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
; FALLBACK16-NEXT:    addl %ebx, %ebx
; FALLBACK16-NEXT:    movb %ch, %cl
; FALLBACK16-NEXT:    shll %cl, %ebx
; FALLBACK16-NEXT:    orl %edi, %ebx
; FALLBACK16-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK16-NEXT:    movl %edx, %eax
; FALLBACK16-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK16-NEXT:    movl 92(%esp,%edx), %ebp
; FALLBACK16-NEXT:    movl %ebp, %edx
; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
; FALLBACK16-NEXT:    movb %bl, %cl
; FALLBACK16-NEXT:    shrl %cl, %edx
; FALLBACK16-NEXT:    movl 96(%esp,%eax), %edi
; FALLBACK16-NEXT:    leal (%edi,%edi), %eax
; FALLBACK16-NEXT:    movb %ch, %cl
; FALLBACK16-NEXT:    shll %cl, %eax
; FALLBACK16-NEXT:    orl %edx, %eax
; FALLBACK16-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK16-NEXT:    movb %bl, %cl
; FALLBACK16-NEXT:    shrl %cl, %esi
; FALLBACK16-NEXT:    addl %ebp, %ebp
; FALLBACK16-NEXT:    movb %ch, %cl
; FALLBACK16-NEXT:    shll %cl, %ebp
; FALLBACK16-NEXT:    orl %esi, %ebp
; FALLBACK16-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; FALLBACK16-NEXT:    movl 100(%esp,%edx), %eax
; FALLBACK16-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK16-NEXT:    movb %bl, %cl
; FALLBACK16-NEXT:    shrl %cl, %eax
; FALLBACK16-NEXT:    movl 104(%esp,%edx), %esi
; FALLBACK16-NEXT:    leal (%esi,%esi), %ebp
; FALLBACK16-NEXT:    movb %ch, %cl
; FALLBACK16-NEXT:    shll %cl, %ebp
; FALLBACK16-NEXT:    orl %eax, %ebp
; FALLBACK16-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK16-NEXT:    movl %ebx, %edx
; FALLBACK16-NEXT:    movb %dl, %cl
; FALLBACK16-NEXT:    shrl %cl, %edi
; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
; FALLBACK16-NEXT:    addl %ebx, %ebx
; FALLBACK16-NEXT:    movb %ch, %cl
; FALLBACK16-NEXT:    shll %cl, %ebx
; FALLBACK16-NEXT:    orl %edi, %ebx
; FALLBACK16-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
; FALLBACK16-NEXT:    movl 108(%esp,%ebp), %edi
; FALLBACK16-NEXT:    movl %edi, %eax
; FALLBACK16-NEXT:    movl %edx, %ebx
; FALLBACK16-NEXT:    movl %ebx, %ecx
; FALLBACK16-NEXT:    shrl %cl, %eax
; FALLBACK16-NEXT:    movl 112(%esp,%ebp), %ecx
; FALLBACK16-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK16-NEXT:    movl %ebp, %edx
; FALLBACK16-NEXT:    leal (%ecx,%ecx), %ebp
; FALLBACK16-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
; FALLBACK16-NEXT:    movb %ch, %cl
; FALLBACK16-NEXT:    shll %cl, %ebp
; FALLBACK16-NEXT:    orl %eax, %ebp
; FALLBACK16-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK16-NEXT:    movb %bl, %cl
; FALLBACK16-NEXT:    shrl %cl, %esi
; FALLBACK16-NEXT:    addl %edi, %edi
; FALLBACK16-NEXT:    movb %ch, %cl
; FALLBACK16-NEXT:    shll %cl, %edi
; FALLBACK16-NEXT:    orl %esi, %edi
; FALLBACK16-NEXT:    movl 116(%esp,%edx), %esi
; FALLBACK16-NEXT:    movl %esi, %eax
; FALLBACK16-NEXT:    movl %ebx, %ecx
; FALLBACK16-NEXT:    shrl %cl, %eax
; FALLBACK16-NEXT:    movl 120(%esp,%edx), %edx
; FALLBACK16-NEXT:    leal (%edx,%edx), %ebp
; FALLBACK16-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
; FALLBACK16-NEXT:    movb %ch, %cl
; FALLBACK16-NEXT:    shll %cl, %ebp
; FALLBACK16-NEXT:    orl %eax, %ebp
; FALLBACK16-NEXT:    movb %bl, %cl
; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK16-NEXT:    shrl %cl, %eax
; FALLBACK16-NEXT:    addl %esi, %esi
; FALLBACK16-NEXT:    movb %ch, %cl
; FALLBACK16-NEXT:    shll %cl, %esi
; FALLBACK16-NEXT:    orl %eax, %esi
; FALLBACK16-NEXT:    movb %bl, %cl
; FALLBACK16-NEXT:    movl %edx, %eax
; FALLBACK16-NEXT:    shrl %cl, %eax
; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; FALLBACK16-NEXT:    movl 124(%esp,%edx), %ebx
; FALLBACK16-NEXT:    leal (%ebx,%ebx), %edx
; FALLBACK16-NEXT:    movb %ch, %cl
; FALLBACK16-NEXT:    shll %cl, %edx
; FALLBACK16-NEXT:    orl %eax, %edx
; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK16-NEXT:    # kill: def $cl killed $cl killed $ecx
; FALLBACK16-NEXT:    sarl %cl, %ebx
; FALLBACK16-NEXT:    movl {{[0-9]+}}(%esp), %eax
; FALLBACK16-NEXT:    movl %ebx, 60(%eax)
; FALLBACK16-NEXT:    movl %edx, 56(%eax)
; FALLBACK16-NEXT:    movl %esi, 48(%eax)
; FALLBACK16-NEXT:    movl %ebp, 52(%eax)
; FALLBACK16-NEXT:    movl %edi, 40(%eax)
; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK16-NEXT:    movl %ecx, 44(%eax)
; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK16-NEXT:    movl %ecx, 32(%eax)
; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK16-NEXT:    movl %ecx, 36(%eax)
; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK16-NEXT:    movl %ecx, 24(%eax)
; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK16-NEXT:    movl %ecx, 28(%eax)
; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK16-NEXT:    movl %ecx, 16(%eax)
; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK16-NEXT:    movl %ecx, 20(%eax)
; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK16-NEXT:    movl %ecx, 8(%eax)
; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK16-NEXT:    movl %ecx, 12(%eax)
; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK16-NEXT:    movl %ecx, (%eax)
; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK16-NEXT:    movl %ecx, 4(%eax)
; FALLBACK16-NEXT:    addl $204, %esp
; FALLBACK16-NEXT:    popl %esi
; FALLBACK16-NEXT:    popl %edi
; FALLBACK16-NEXT:    popl %ebx
; FALLBACK16-NEXT:    popl %ebp
; FALLBACK16-NEXT:    retl
;
; FALLBACK17-LABEL: ashr_64bytes:
; FALLBACK17:       # %bb.0:
; FALLBACK17-NEXT:    pushl %ebp
; FALLBACK17-NEXT:    pushl %ebx
; FALLBACK17-NEXT:    pushl %edi
; FALLBACK17-NEXT:    pushl %esi
; FALLBACK17-NEXT:    subl $188, %esp
; FALLBACK17-NEXT:    movl {{[0-9]+}}(%esp), %eax
; FALLBACK17-NEXT:    movl (%eax), %ecx
; FALLBACK17-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK17-NEXT:    movl 4(%eax), %ecx
; FALLBACK17-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK17-NEXT:    movl 8(%eax), %ecx
; FALLBACK17-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK17-NEXT:    movl 12(%eax), %ecx
; FALLBACK17-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK17-NEXT:    movl 16(%eax), %ecx
; FALLBACK17-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK17-NEXT:    movl 20(%eax), %ecx
; FALLBACK17-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK17-NEXT:    movl 24(%eax), %ecx
; FALLBACK17-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK17-NEXT:    movl 28(%eax), %ecx
; FALLBACK17-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK17-NEXT:    movl 32(%eax), %ecx
; FALLBACK17-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK17-NEXT:    movl 36(%eax), %ecx
; FALLBACK17-NEXT:    movl %ecx, (%esp) # 4-byte Spill
; FALLBACK17-NEXT:    movl 40(%eax), %ebp
; FALLBACK17-NEXT:    movl 44(%eax), %ebx
; FALLBACK17-NEXT:    movl 48(%eax), %edi
; FALLBACK17-NEXT:    movl 52(%eax), %esi
; FALLBACK17-NEXT:    movl 56(%eax), %edx
; FALLBACK17-NEXT:    movl 60(%eax), %eax
; FALLBACK17-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; FALLBACK17-NEXT:    movl (%ecx), %ecx
; FALLBACK17-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK17-NEXT:    movl %esi, {{[0-9]+}}(%esp)
; FALLBACK17-NEXT:    movl %edi, {{[0-9]+}}(%esp)
; FALLBACK17-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
; FALLBACK17-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
; FALLBACK17-NEXT:    movl (%esp), %edx # 4-byte Reload
; FALLBACK17-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; FALLBACK17-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; FALLBACK17-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; FALLBACK17-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; FALLBACK17-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK17-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; FALLBACK17-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; FALLBACK17-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; FALLBACK17-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; FALLBACK17-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; FALLBACK17-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK17-NEXT:    sarl $31, %eax
; FALLBACK17-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK17-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK17-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK17-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK17-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK17-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK17-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK17-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK17-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK17-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK17-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK17-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK17-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK17-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK17-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK17-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK17-NEXT:    movl %ecx, %ebp
; FALLBACK17-NEXT:    andl $60, %ebp
; FALLBACK17-NEXT:    movl 56(%esp,%ebp), %edx
; FALLBACK17-NEXT:    movl 52(%esp,%ebp), %eax
; FALLBACK17-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK17-NEXT:    shll $3, %ecx
; FALLBACK17-NEXT:    andl $24, %ecx
; FALLBACK17-NEXT:    shrdl %cl, %edx, %eax
; FALLBACK17-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK17-NEXT:    movl 64(%esp,%ebp), %edi
; FALLBACK17-NEXT:    movl 60(%esp,%ebp), %eax
; FALLBACK17-NEXT:    movl %eax, %esi
; FALLBACK17-NEXT:    shrdl %cl, %edi, %esi
; FALLBACK17-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK17-NEXT:    shrdl %cl, %eax, %edx
; FALLBACK17-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK17-NEXT:    movl 72(%esp,%ebp), %esi
; FALLBACK17-NEXT:    movl 68(%esp,%ebp), %eax
; FALLBACK17-NEXT:    movl %eax, %edx
; FALLBACK17-NEXT:    shrdl %cl, %esi, %edx
; FALLBACK17-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK17-NEXT:    shrdl %cl, %eax, %edi
; FALLBACK17-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK17-NEXT:    movl 80(%esp,%ebp), %edi
; FALLBACK17-NEXT:    movl 76(%esp,%ebp), %eax
; FALLBACK17-NEXT:    movl %eax, %edx
; FALLBACK17-NEXT:    shrdl %cl, %edi, %edx
; FALLBACK17-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK17-NEXT:    shrdl %cl, %eax, %esi
; FALLBACK17-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK17-NEXT:    movl 88(%esp,%ebp), %esi
; FALLBACK17-NEXT:    movl 84(%esp,%ebp), %eax
; FALLBACK17-NEXT:    movl %eax, %edx
; FALLBACK17-NEXT:    shrdl %cl, %esi, %edx
; FALLBACK17-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK17-NEXT:    movl %esi, %edx
; FALLBACK17-NEXT:    shrdl %cl, %eax, %edi
; FALLBACK17-NEXT:    movl %edi, (%esp) # 4-byte Spill
; FALLBACK17-NEXT:    movl 96(%esp,%ebp), %esi
; FALLBACK17-NEXT:    movl 92(%esp,%ebp), %eax
; FALLBACK17-NEXT:    movl %eax, %edi
; FALLBACK17-NEXT:    shrdl %cl, %esi, %edi
; FALLBACK17-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK17-NEXT:    shrdl %cl, %eax, %edx
; FALLBACK17-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK17-NEXT:    movl 104(%esp,%ebp), %edx
; FALLBACK17-NEXT:    movl 100(%esp,%ebp), %eax
; FALLBACK17-NEXT:    movl %eax, %edi
; FALLBACK17-NEXT:    shrdl %cl, %edx, %edi
; FALLBACK17-NEXT:    shrdl %cl, %eax, %esi
; FALLBACK17-NEXT:    movl 48(%esp,%ebp), %ebx
; FALLBACK17-NEXT:    movl 108(%esp,%ebp), %eax
; FALLBACK17-NEXT:    shrdl %cl, %eax, %edx
; FALLBACK17-NEXT:    movl {{[0-9]+}}(%esp), %ebp
; FALLBACK17-NEXT:    movl %edx, 56(%ebp)
; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; FALLBACK17-NEXT:    shrdl %cl, %edx, %ebx
; FALLBACK17-NEXT:    # kill: def $cl killed $cl killed $ecx
; FALLBACK17-NEXT:    sarl %cl, %eax
; FALLBACK17-NEXT:    movl %eax, 60(%ebp)
; FALLBACK17-NEXT:    movl %esi, 48(%ebp)
; FALLBACK17-NEXT:    movl %edi, 52(%ebp)
; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK17-NEXT:    movl %eax, 40(%ebp)
; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK17-NEXT:    movl %eax, 44(%ebp)
; FALLBACK17-NEXT:    movl (%esp), %eax # 4-byte Reload
; FALLBACK17-NEXT:    movl %eax, 32(%ebp)
; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK17-NEXT:    movl %eax, 36(%ebp)
; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK17-NEXT:    movl %eax, 24(%ebp)
; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK17-NEXT:    movl %eax, 28(%ebp)
; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK17-NEXT:    movl %eax, 16(%ebp)
; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK17-NEXT:    movl %eax, 20(%ebp)
; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK17-NEXT:    movl %eax, 8(%ebp)
; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK17-NEXT:    movl %eax, 12(%ebp)
; FALLBACK17-NEXT:    movl %ebx, (%ebp)
; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK17-NEXT:    movl %eax, 4(%ebp)
; FALLBACK17-NEXT:    addl $188, %esp
; FALLBACK17-NEXT:    popl %esi
; FALLBACK17-NEXT:    popl %edi
; FALLBACK17-NEXT:    popl %ebx
; FALLBACK17-NEXT:    popl %ebp
; FALLBACK17-NEXT:    retl
;
; FALLBACK18-LABEL: ashr_64bytes:
; FALLBACK18:       # %bb.0:
; FALLBACK18-NEXT:    pushl %ebp
; FALLBACK18-NEXT:    pushl %ebx
; FALLBACK18-NEXT:    pushl %edi
; FALLBACK18-NEXT:    pushl %esi
; FALLBACK18-NEXT:    subl $204, %esp
; FALLBACK18-NEXT:    movl {{[0-9]+}}(%esp), %eax
; FALLBACK18-NEXT:    movl (%eax), %ecx
; FALLBACK18-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK18-NEXT:    movl 4(%eax), %ecx
; FALLBACK18-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK18-NEXT:    movl 8(%eax), %ecx
; FALLBACK18-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK18-NEXT:    movl 12(%eax), %ecx
; FALLBACK18-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK18-NEXT:    movl 16(%eax), %ecx
; FALLBACK18-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK18-NEXT:    movl 20(%eax), %ecx
; FALLBACK18-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK18-NEXT:    movl 24(%eax), %ecx
; FALLBACK18-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK18-NEXT:    movl 28(%eax), %ecx
; FALLBACK18-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK18-NEXT:    movl 32(%eax), %ecx
; FALLBACK18-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK18-NEXT:    movl 36(%eax), %ecx
; FALLBACK18-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK18-NEXT:    movl 40(%eax), %ebp
; FALLBACK18-NEXT:    movl 44(%eax), %ebx
; FALLBACK18-NEXT:    movl 48(%eax), %edi
; FALLBACK18-NEXT:    movl 52(%eax), %esi
; FALLBACK18-NEXT:    movl 56(%eax), %edx
; FALLBACK18-NEXT:    movl 60(%eax), %ecx
; FALLBACK18-NEXT:    movl {{[0-9]+}}(%esp), %eax
; FALLBACK18-NEXT:    movl (%eax), %eax
; FALLBACK18-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK18-NEXT:    movl %esi, {{[0-9]+}}(%esp)
; FALLBACK18-NEXT:    movl %edi, {{[0-9]+}}(%esp)
; FALLBACK18-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
; FALLBACK18-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; FALLBACK18-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; FALLBACK18-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; FALLBACK18-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; FALLBACK18-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; FALLBACK18-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK18-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; FALLBACK18-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; FALLBACK18-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; FALLBACK18-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; FALLBACK18-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; FALLBACK18-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK18-NEXT:    sarl $31, %ecx
; FALLBACK18-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK18-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK18-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK18-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK18-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK18-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK18-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK18-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK18-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK18-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK18-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK18-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK18-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK18-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK18-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK18-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK18-NEXT:    movl %eax, %ecx
; FALLBACK18-NEXT:    leal (,%eax,8), %edx
; FALLBACK18-NEXT:    andl $24, %edx
; FALLBACK18-NEXT:    andl $60, %ecx
; FALLBACK18-NEXT:    movl 68(%esp,%ecx), %esi
; FALLBACK18-NEXT:    movl 72(%esp,%ecx), %edi
; FALLBACK18-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK18-NEXT:    shrxl %edx, %esi, %eax
; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK18-NEXT:    movl %edx, %ebx
; FALLBACK18-NEXT:    notb %bl
; FALLBACK18-NEXT:    leal (%edi,%edi), %ebp
; FALLBACK18-NEXT:    shlxl %ebx, %ebp, %eax
; FALLBACK18-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK18-NEXT:    shrxl %edx, 64(%esp,%ecx), %edi
; FALLBACK18-NEXT:    addl %esi, %esi
; FALLBACK18-NEXT:    shlxl %ebx, %esi, %eax
; FALLBACK18-NEXT:    orl %edi, %eax
; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK18-NEXT:    movl 80(%esp,%ecx), %esi
; FALLBACK18-NEXT:    leal (%esi,%esi), %edi
; FALLBACK18-NEXT:    shlxl %ebx, %edi, %eax
; FALLBACK18-NEXT:    movl 76(%esp,%ecx), %edi
; FALLBACK18-NEXT:    shrxl %edx, %edi, %ebp
; FALLBACK18-NEXT:    orl %ebp, %eax
; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK18-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; FALLBACK18-NEXT:    addl %edi, %edi
; FALLBACK18-NEXT:    shlxl %ebx, %edi, %edi
; FALLBACK18-NEXT:    orl %eax, %edi
; FALLBACK18-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK18-NEXT:    movl 88(%esp,%ecx), %eax
; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK18-NEXT:    leal (%eax,%eax), %edi
; FALLBACK18-NEXT:    shlxl %ebx, %edi, %eax
; FALLBACK18-NEXT:    movl 84(%esp,%ecx), %edi
; FALLBACK18-NEXT:    shrxl %edx, %edi, %ebp
; FALLBACK18-NEXT:    orl %ebp, %eax
; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK18-NEXT:    shrxl %edx, %esi, %esi
; FALLBACK18-NEXT:    addl %edi, %edi
; FALLBACK18-NEXT:    shlxl %ebx, %edi, %eax
; FALLBACK18-NEXT:    orl %esi, %eax
; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK18-NEXT:    movl 96(%esp,%ecx), %esi
; FALLBACK18-NEXT:    leal (%esi,%esi), %edi
; FALLBACK18-NEXT:    shlxl %ebx, %edi, %eax
; FALLBACK18-NEXT:    movl 92(%esp,%ecx), %edi
; FALLBACK18-NEXT:    shrxl %edx, %edi, %ebp
; FALLBACK18-NEXT:    orl %ebp, %eax
; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK18-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; FALLBACK18-NEXT:    addl %edi, %edi
; FALLBACK18-NEXT:    shlxl %ebx, %edi, %edi
; FALLBACK18-NEXT:    orl %eax, %edi
; FALLBACK18-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK18-NEXT:    movl 104(%esp,%ecx), %eax
; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK18-NEXT:    leal (%eax,%eax), %edi
; FALLBACK18-NEXT:    shlxl %ebx, %edi, %eax
; FALLBACK18-NEXT:    movl 100(%esp,%ecx), %edi
; FALLBACK18-NEXT:    shrxl %edx, %edi, %ebp
; FALLBACK18-NEXT:    orl %ebp, %eax
; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK18-NEXT:    shrxl %edx, %esi, %esi
; FALLBACK18-NEXT:    addl %edi, %edi
; FALLBACK18-NEXT:    shlxl %ebx, %edi, %eax
; FALLBACK18-NEXT:    orl %esi, %eax
; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK18-NEXT:    movl 112(%esp,%ecx), %eax
; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK18-NEXT:    leal (%eax,%eax), %esi
; FALLBACK18-NEXT:    shlxl %ebx, %esi, %eax
; FALLBACK18-NEXT:    movl 108(%esp,%ecx), %esi
; FALLBACK18-NEXT:    movl %ecx, %edi
; FALLBACK18-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK18-NEXT:    shrxl %edx, %esi, %ebp
; FALLBACK18-NEXT:    orl %ebp, %eax
; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK18-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
; FALLBACK18-NEXT:    addl %esi, %esi
; FALLBACK18-NEXT:    shlxl %ebx, %esi, %esi
; FALLBACK18-NEXT:    orl %ecx, %esi
; FALLBACK18-NEXT:    movl 120(%esp,%edi), %ebp
; FALLBACK18-NEXT:    leal (%ebp,%ebp), %ecx
; FALLBACK18-NEXT:    shlxl %ebx, %ecx, %ecx
; FALLBACK18-NEXT:    movl 116(%esp,%edi), %eax
; FALLBACK18-NEXT:    shrxl %edx, %eax, %edi
; FALLBACK18-NEXT:    orl %edi, %ecx
; FALLBACK18-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
; FALLBACK18-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK18-NEXT:    addl %eax, %eax
; FALLBACK18-NEXT:    shlxl %ebx, %eax, %edi
; FALLBACK18-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
; FALLBACK18-NEXT:    shrxl %edx, %ebp, %eax
; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
; FALLBACK18-NEXT:    movl 124(%esp,%ebp), %ebp
; FALLBACK18-NEXT:    sarxl %edx, %ebp, %edx
; FALLBACK18-NEXT:    addl %ebp, %ebp
; FALLBACK18-NEXT:    shlxl %ebx, %ebp, %ebx
; FALLBACK18-NEXT:    orl %eax, %ebx
; FALLBACK18-NEXT:    movl {{[0-9]+}}(%esp), %eax
; FALLBACK18-NEXT:    movl %edx, 60(%eax)
; FALLBACK18-NEXT:    movl %ebx, 56(%eax)
; FALLBACK18-NEXT:    movl %edi, 48(%eax)
; FALLBACK18-NEXT:    movl %ecx, 52(%eax)
; FALLBACK18-NEXT:    movl %esi, 40(%eax)
; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK18-NEXT:    movl %ecx, 44(%eax)
; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK18-NEXT:    movl %ecx, 32(%eax)
; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK18-NEXT:    movl %ecx, 36(%eax)
; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK18-NEXT:    movl %ecx, 24(%eax)
; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK18-NEXT:    movl %ecx, 28(%eax)
; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK18-NEXT:    movl %ecx, 16(%eax)
; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK18-NEXT:    movl %ecx, 20(%eax)
; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK18-NEXT:    movl %ecx, 8(%eax)
; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK18-NEXT:    movl %ecx, 12(%eax)
; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK18-NEXT:    movl %ecx, (%eax)
; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK18-NEXT:    movl %ecx, 4(%eax)
; FALLBACK18-NEXT:    addl $204, %esp
; FALLBACK18-NEXT:    popl %esi
; FALLBACK18-NEXT:    popl %edi
; FALLBACK18-NEXT:    popl %ebx
; FALLBACK18-NEXT:    popl %ebp
; FALLBACK18-NEXT:    retl
;
; FALLBACK19-LABEL: ashr_64bytes:
; FALLBACK19:       # %bb.0:
; FALLBACK19-NEXT:    pushl %ebp
; FALLBACK19-NEXT:    pushl %ebx
; FALLBACK19-NEXT:    pushl %edi
; FALLBACK19-NEXT:    pushl %esi
; FALLBACK19-NEXT:    subl $188, %esp
; FALLBACK19-NEXT:    movl {{[0-9]+}}(%esp), %eax
; FALLBACK19-NEXT:    movl (%eax), %ecx
; FALLBACK19-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK19-NEXT:    movl 4(%eax), %ecx
; FALLBACK19-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK19-NEXT:    movl 8(%eax), %ecx
; FALLBACK19-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK19-NEXT:    movl 12(%eax), %ecx
; FALLBACK19-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK19-NEXT:    movl 16(%eax), %ecx
; FALLBACK19-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK19-NEXT:    movl 20(%eax), %ecx
; FALLBACK19-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK19-NEXT:    movl 24(%eax), %ecx
; FALLBACK19-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK19-NEXT:    movl 28(%eax), %ecx
; FALLBACK19-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK19-NEXT:    movl 32(%eax), %ecx
; FALLBACK19-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK19-NEXT:    movl 36(%eax), %ecx
; FALLBACK19-NEXT:    movl %ecx, (%esp) # 4-byte Spill
; FALLBACK19-NEXT:    movl 40(%eax), %ebp
; FALLBACK19-NEXT:    movl 44(%eax), %ebx
; FALLBACK19-NEXT:    movl 48(%eax), %edi
; FALLBACK19-NEXT:    movl 52(%eax), %esi
; FALLBACK19-NEXT:    movl 56(%eax), %edx
; FALLBACK19-NEXT:    movl 60(%eax), %eax
; FALLBACK19-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; FALLBACK19-NEXT:    movl (%ecx), %ecx
; FALLBACK19-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK19-NEXT:    movl %esi, {{[0-9]+}}(%esp)
; FALLBACK19-NEXT:    movl %edi, {{[0-9]+}}(%esp)
; FALLBACK19-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
; FALLBACK19-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
; FALLBACK19-NEXT:    movl (%esp), %edx # 4-byte Reload
; FALLBACK19-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; FALLBACK19-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; FALLBACK19-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; FALLBACK19-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; FALLBACK19-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK19-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; FALLBACK19-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; FALLBACK19-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; FALLBACK19-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; FALLBACK19-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; FALLBACK19-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK19-NEXT:    sarl $31, %eax
; FALLBACK19-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK19-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK19-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK19-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK19-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK19-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK19-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK19-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK19-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK19-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK19-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK19-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK19-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK19-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK19-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK19-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK19-NEXT:    movl %ecx, %ebp
; FALLBACK19-NEXT:    andl $60, %ebp
; FALLBACK19-NEXT:    movl 56(%esp,%ebp), %edx
; FALLBACK19-NEXT:    movl 52(%esp,%ebp), %eax
; FALLBACK19-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK19-NEXT:    shll $3, %ecx
; FALLBACK19-NEXT:    andl $24, %ecx
; FALLBACK19-NEXT:    shrdl %cl, %edx, %eax
; FALLBACK19-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK19-NEXT:    movl 64(%esp,%ebp), %edi
; FALLBACK19-NEXT:    movl 60(%esp,%ebp), %eax
; FALLBACK19-NEXT:    movl %eax, %esi
; FALLBACK19-NEXT:    shrdl %cl, %edi, %esi
; FALLBACK19-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK19-NEXT:    shrdl %cl, %eax, %edx
; FALLBACK19-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK19-NEXT:    movl 72(%esp,%ebp), %esi
; FALLBACK19-NEXT:    movl 68(%esp,%ebp), %eax
; FALLBACK19-NEXT:    movl %eax, %edx
; FALLBACK19-NEXT:    shrdl %cl, %esi, %edx
; FALLBACK19-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK19-NEXT:    shrdl %cl, %eax, %edi
; FALLBACK19-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK19-NEXT:    movl 80(%esp,%ebp), %edi
; FALLBACK19-NEXT:    movl 76(%esp,%ebp), %eax
; FALLBACK19-NEXT:    movl %eax, %edx
; FALLBACK19-NEXT:    shrdl %cl, %edi, %edx
; FALLBACK19-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK19-NEXT:    shrdl %cl, %eax, %esi
; FALLBACK19-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK19-NEXT:    movl 88(%esp,%ebp), %ebx
; FALLBACK19-NEXT:    movl 84(%esp,%ebp), %eax
; FALLBACK19-NEXT:    movl %eax, %edx
; FALLBACK19-NEXT:    shrdl %cl, %ebx, %edx
; FALLBACK19-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK19-NEXT:    shrdl %cl, %eax, %edi
; FALLBACK19-NEXT:    movl %edi, (%esp) # 4-byte Spill
; FALLBACK19-NEXT:    movl 96(%esp,%ebp), %esi
; FALLBACK19-NEXT:    movl 92(%esp,%ebp), %eax
; FALLBACK19-NEXT:    movl %eax, %edx
; FALLBACK19-NEXT:    shrdl %cl, %esi, %edx
; FALLBACK19-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK19-NEXT:    shrdl %cl, %eax, %ebx
; FALLBACK19-NEXT:    movl 104(%esp,%ebp), %eax
; FALLBACK19-NEXT:    movl 100(%esp,%ebp), %edi
; FALLBACK19-NEXT:    movl %edi, %edx
; FALLBACK19-NEXT:    shrdl %cl, %eax, %edx
; FALLBACK19-NEXT:    shrdl %cl, %edi, %esi
; FALLBACK19-NEXT:    movl 48(%esp,%ebp), %edi
; FALLBACK19-NEXT:    movl 108(%esp,%ebp), %ebp
; FALLBACK19-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK19-NEXT:    shrdl %cl, %ebp, %eax
; FALLBACK19-NEXT:    movl {{[0-9]+}}(%esp), %ebp
; FALLBACK19-NEXT:    movl %eax, 56(%ebp)
; FALLBACK19-NEXT:    movl %esi, 48(%ebp)
; FALLBACK19-NEXT:    movl %edx, 52(%ebp)
; FALLBACK19-NEXT:    movl %ebx, 40(%ebp)
; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK19-NEXT:    movl %eax, 44(%ebp)
; FALLBACK19-NEXT:    movl (%esp), %eax # 4-byte Reload
; FALLBACK19-NEXT:    movl %eax, 32(%ebp)
; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK19-NEXT:    movl %eax, 36(%ebp)
; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK19-NEXT:    movl %eax, 24(%ebp)
; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK19-NEXT:    movl %eax, 28(%ebp)
; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK19-NEXT:    movl %eax, 16(%ebp)
; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK19-NEXT:    movl %eax, 20(%ebp)
; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK19-NEXT:    movl %eax, 8(%ebp)
; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK19-NEXT:    movl %eax, 12(%ebp)
; FALLBACK19-NEXT:    sarxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; FALLBACK19-NEXT:    # kill: def $cl killed $cl killed $ecx
; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; FALLBACK19-NEXT:    shrdl %cl, %edx, %edi
; FALLBACK19-NEXT:    movl %edi, (%ebp)
; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK19-NEXT:    movl %ecx, 4(%ebp)
; FALLBACK19-NEXT:    movl %eax, 60(%ebp)
; FALLBACK19-NEXT:    addl $188, %esp
; FALLBACK19-NEXT:    popl %esi
; FALLBACK19-NEXT:    popl %edi
; FALLBACK19-NEXT:    popl %ebx
; FALLBACK19-NEXT:    popl %ebp
; FALLBACK19-NEXT:    retl
;
; FALLBACK20-LABEL: ashr_64bytes:
; FALLBACK20:       # %bb.0:
; FALLBACK20-NEXT:    pushl %ebp
; FALLBACK20-NEXT:    pushl %ebx
; FALLBACK20-NEXT:    pushl %edi
; FALLBACK20-NEXT:    pushl %esi
; FALLBACK20-NEXT:    subl $204, %esp
; FALLBACK20-NEXT:    movl {{[0-9]+}}(%esp), %eax
; FALLBACK20-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; FALLBACK20-NEXT:    movups (%ecx), %xmm0
; FALLBACK20-NEXT:    movups 16(%ecx), %xmm1
; FALLBACK20-NEXT:    movups 32(%ecx), %xmm2
; FALLBACK20-NEXT:    movl 48(%ecx), %edx
; FALLBACK20-NEXT:    movl 52(%ecx), %esi
; FALLBACK20-NEXT:    movl 56(%ecx), %edi
; FALLBACK20-NEXT:    movl 60(%ecx), %ecx
; FALLBACK20-NEXT:    movl (%eax), %eax
; FALLBACK20-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK20-NEXT:    movl %edi, {{[0-9]+}}(%esp)
; FALLBACK20-NEXT:    movl %esi, {{[0-9]+}}(%esp)
; FALLBACK20-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK20-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
; FALLBACK20-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
; FALLBACK20-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
; FALLBACK20-NEXT:    sarl $31, %ecx
; FALLBACK20-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK20-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK20-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK20-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK20-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK20-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK20-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK20-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK20-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK20-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK20-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK20-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK20-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK20-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK20-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK20-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK20-NEXT:    movl %eax, %esi
; FALLBACK20-NEXT:    andl $60, %esi
; FALLBACK20-NEXT:    movl 68(%esp,%esi), %edx
; FALLBACK20-NEXT:    shll $3, %eax
; FALLBACK20-NEXT:    andl $24, %eax
; FALLBACK20-NEXT:    movl %edx, %edi
; FALLBACK20-NEXT:    movl %eax, %ecx
; FALLBACK20-NEXT:    shrl %cl, %edi
; FALLBACK20-NEXT:    movl 72(%esp,%esi), %ecx
; FALLBACK20-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK20-NEXT:    leal (%ecx,%ecx), %ebx
; FALLBACK20-NEXT:    movb %al, %ch
; FALLBACK20-NEXT:    notb %ch
; FALLBACK20-NEXT:    movb %ch, %cl
; FALLBACK20-NEXT:    shll %cl, %ebx
; FALLBACK20-NEXT:    orl %edi, %ebx
; FALLBACK20-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK20-NEXT:    movl 64(%esp,%esi), %edi
; FALLBACK20-NEXT:    movb %al, %cl
; FALLBACK20-NEXT:    shrl %cl, %edi
; FALLBACK20-NEXT:    addl %edx, %edx
; FALLBACK20-NEXT:    movb %ch, %cl
; FALLBACK20-NEXT:    shll %cl, %edx
; FALLBACK20-NEXT:    orl %edi, %edx
; FALLBACK20-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK20-NEXT:    movl 76(%esp,%esi), %edx
; FALLBACK20-NEXT:    movl %edx, %ebp
; FALLBACK20-NEXT:    movb %al, %cl
; FALLBACK20-NEXT:    shrl %cl, %ebp
; FALLBACK20-NEXT:    movl 80(%esp,%esi), %edi
; FALLBACK20-NEXT:    leal (%edi,%edi), %ebx
; FALLBACK20-NEXT:    movb %ch, %cl
; FALLBACK20-NEXT:    shll %cl, %ebx
; FALLBACK20-NEXT:    orl %ebp, %ebx
; FALLBACK20-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK20-NEXT:    movb %al, %cl
; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
; FALLBACK20-NEXT:    shrl %cl, %ebx
; FALLBACK20-NEXT:    addl %edx, %edx
; FALLBACK20-NEXT:    movb %ch, %cl
; FALLBACK20-NEXT:    shll %cl, %edx
; FALLBACK20-NEXT:    orl %ebx, %edx
; FALLBACK20-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK20-NEXT:    movl 84(%esp,%esi), %ebx
; FALLBACK20-NEXT:    movl %ebx, %ebp
; FALLBACK20-NEXT:    movl %eax, %edx
; FALLBACK20-NEXT:    movb %dl, %cl
; FALLBACK20-NEXT:    shrl %cl, %ebp
; FALLBACK20-NEXT:    movl 88(%esp,%esi), %eax
; FALLBACK20-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK20-NEXT:    addl %eax, %eax
; FALLBACK20-NEXT:    movb %ch, %cl
; FALLBACK20-NEXT:    shll %cl, %eax
; FALLBACK20-NEXT:    orl %ebp, %eax
; FALLBACK20-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK20-NEXT:    movb %dl, %cl
; FALLBACK20-NEXT:    shrl %cl, %edi
; FALLBACK20-NEXT:    addl %ebx, %ebx
; FALLBACK20-NEXT:    movb %ch, %cl
; FALLBACK20-NEXT:    movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
; FALLBACK20-NEXT:    shll %cl, %ebx
; FALLBACK20-NEXT:    orl %edi, %ebx
; FALLBACK20-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK20-NEXT:    movl 92(%esp,%esi), %ebx
; FALLBACK20-NEXT:    movl %ebx, %ebp
; FALLBACK20-NEXT:    movb %dl, %cl
; FALLBACK20-NEXT:    shrl %cl, %ebp
; FALLBACK20-NEXT:    movl 96(%esp,%esi), %edi
; FALLBACK20-NEXT:    leal (%edi,%edi), %eax
; FALLBACK20-NEXT:    movb %ch, %cl
; FALLBACK20-NEXT:    shll %cl, %eax
; FALLBACK20-NEXT:    orl %ebp, %eax
; FALLBACK20-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK20-NEXT:    movb %dl, %cl
; FALLBACK20-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK20-NEXT:    shrl %cl, %eax
; FALLBACK20-NEXT:    addl %ebx, %ebx
; FALLBACK20-NEXT:    movb %ch, %cl
; FALLBACK20-NEXT:    shll %cl, %ebx
; FALLBACK20-NEXT:    orl %eax, %ebx
; FALLBACK20-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK20-NEXT:    movl 100(%esp,%esi), %ebx
; FALLBACK20-NEXT:    movl %ebx, %ebp
; FALLBACK20-NEXT:    movb %dl, %cl
; FALLBACK20-NEXT:    shrl %cl, %ebp
; FALLBACK20-NEXT:    movl 104(%esp,%esi), %edx
; FALLBACK20-NEXT:    leal (%edx,%edx), %eax
; FALLBACK20-NEXT:    movb %ch, %cl
; FALLBACK20-NEXT:    shll %cl, %eax
; FALLBACK20-NEXT:    orl %ebp, %eax
; FALLBACK20-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK20-NEXT:    movb %al, %cl
; FALLBACK20-NEXT:    shrl %cl, %edi
; FALLBACK20-NEXT:    addl %ebx, %ebx
; FALLBACK20-NEXT:    movb %ch, %cl
; FALLBACK20-NEXT:    shll %cl, %ebx
; FALLBACK20-NEXT:    orl %edi, %ebx
; FALLBACK20-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK20-NEXT:    movl 108(%esp,%esi), %edi
; FALLBACK20-NEXT:    movl %edi, %ebp
; FALLBACK20-NEXT:    movl %eax, %ecx
; FALLBACK20-NEXT:    shrl %cl, %ebp
; FALLBACK20-NEXT:    movl 112(%esp,%esi), %ecx
; FALLBACK20-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK20-NEXT:    leal (%ecx,%ecx), %ebx
; FALLBACK20-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
; FALLBACK20-NEXT:    movb %ch, %cl
; FALLBACK20-NEXT:    shll %cl, %ebx
; FALLBACK20-NEXT:    orl %ebp, %ebx
; FALLBACK20-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK20-NEXT:    movb %al, %cl
; FALLBACK20-NEXT:    shrl %cl, %edx
; FALLBACK20-NEXT:    addl %edi, %edi
; FALLBACK20-NEXT:    movb %ch, %cl
; FALLBACK20-NEXT:    shll %cl, %edi
; FALLBACK20-NEXT:    orl %edx, %edi
; FALLBACK20-NEXT:    movl %esi, %edx
; FALLBACK20-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK20-NEXT:    movl 116(%esp,%esi), %esi
; FALLBACK20-NEXT:    movl %esi, %ebx
; FALLBACK20-NEXT:    movb %al, %cl
; FALLBACK20-NEXT:    shrl %cl, %ebx
; FALLBACK20-NEXT:    movl 120(%esp,%edx), %eax
; FALLBACK20-NEXT:    leal (%eax,%eax), %ebp
; FALLBACK20-NEXT:    movb %ch, %cl
; FALLBACK20-NEXT:    shll %cl, %ebp
; FALLBACK20-NEXT:    orl %ebx, %ebp
; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; FALLBACK20-NEXT:    movb %dl, %cl
; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
; FALLBACK20-NEXT:    shrl %cl, %ebx
; FALLBACK20-NEXT:    addl %esi, %esi
; FALLBACK20-NEXT:    movb %ch, %cl
; FALLBACK20-NEXT:    shll %cl, %esi
; FALLBACK20-NEXT:    orl %ebx, %esi
; FALLBACK20-NEXT:    movb %dl, %cl
; FALLBACK20-NEXT:    shrl %cl, %eax
; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; FALLBACK20-NEXT:    movl 124(%esp,%edx), %ebx
; FALLBACK20-NEXT:    leal (%ebx,%ebx), %edx
; FALLBACK20-NEXT:    movb %ch, %cl
; FALLBACK20-NEXT:    shll %cl, %edx
; FALLBACK20-NEXT:    orl %eax, %edx
; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK20-NEXT:    # kill: def $cl killed $cl killed $ecx
; FALLBACK20-NEXT:    sarl %cl, %ebx
; FALLBACK20-NEXT:    movl {{[0-9]+}}(%esp), %eax
; FALLBACK20-NEXT:    movl %ebx, 60(%eax)
; FALLBACK20-NEXT:    movl %edx, 56(%eax)
; FALLBACK20-NEXT:    movl %esi, 48(%eax)
; FALLBACK20-NEXT:    movl %ebp, 52(%eax)
; FALLBACK20-NEXT:    movl %edi, 40(%eax)
; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK20-NEXT:    movl %ecx, 44(%eax)
; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK20-NEXT:    movl %ecx, 32(%eax)
; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK20-NEXT:    movl %ecx, 36(%eax)
; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK20-NEXT:    movl %ecx, 24(%eax)
; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK20-NEXT:    movl %ecx, 28(%eax)
; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK20-NEXT:    movl %ecx, 16(%eax)
; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK20-NEXT:    movl %ecx, 20(%eax)
; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK20-NEXT:    movl %ecx, 8(%eax)
; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK20-NEXT:    movl %ecx, 12(%eax)
; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK20-NEXT:    movl %ecx, (%eax)
; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK20-NEXT:    movl %ecx, 4(%eax)
; FALLBACK20-NEXT:    addl $204, %esp
; FALLBACK20-NEXT:    popl %esi
; FALLBACK20-NEXT:    popl %edi
; FALLBACK20-NEXT:    popl %ebx
; FALLBACK20-NEXT:    popl %ebp
; FALLBACK20-NEXT:    retl
;
; FALLBACK21-LABEL: ashr_64bytes:
; FALLBACK21:       # %bb.0:
; FALLBACK21-NEXT:    pushl %ebp
; FALLBACK21-NEXT:    pushl %ebx
; FALLBACK21-NEXT:    pushl %edi
; FALLBACK21-NEXT:    pushl %esi
; FALLBACK21-NEXT:    subl $188, %esp
; FALLBACK21-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; FALLBACK21-NEXT:    movl {{[0-9]+}}(%esp), %eax
; FALLBACK21-NEXT:    movups (%eax), %xmm0
; FALLBACK21-NEXT:    movups 16(%eax), %xmm1
; FALLBACK21-NEXT:    movups 32(%eax), %xmm2
; FALLBACK21-NEXT:    movl 48(%eax), %edx
; FALLBACK21-NEXT:    movl 52(%eax), %esi
; FALLBACK21-NEXT:    movl 56(%eax), %edi
; FALLBACK21-NEXT:    movl 60(%eax), %eax
; FALLBACK21-NEXT:    movl (%ecx), %ecx
; FALLBACK21-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK21-NEXT:    movl %edi, {{[0-9]+}}(%esp)
; FALLBACK21-NEXT:    movl %esi, {{[0-9]+}}(%esp)
; FALLBACK21-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK21-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
; FALLBACK21-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
; FALLBACK21-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
; FALLBACK21-NEXT:    sarl $31, %eax
; FALLBACK21-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK21-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK21-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK21-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK21-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK21-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK21-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK21-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK21-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK21-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK21-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK21-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK21-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK21-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK21-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK21-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK21-NEXT:    movl %ecx, %ebp
; FALLBACK21-NEXT:    andl $60, %ebp
; FALLBACK21-NEXT:    movl 56(%esp,%ebp), %edx
; FALLBACK21-NEXT:    movl 52(%esp,%ebp), %eax
; FALLBACK21-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK21-NEXT:    shll $3, %ecx
; FALLBACK21-NEXT:    andl $24, %ecx
; FALLBACK21-NEXT:    shrdl %cl, %edx, %eax
; FALLBACK21-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK21-NEXT:    movl 64(%esp,%ebp), %edi
; FALLBACK21-NEXT:    movl 60(%esp,%ebp), %eax
; FALLBACK21-NEXT:    movl %eax, %esi
; FALLBACK21-NEXT:    shrdl %cl, %edi, %esi
; FALLBACK21-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK21-NEXT:    shrdl %cl, %eax, %edx
; FALLBACK21-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK21-NEXT:    movl 72(%esp,%ebp), %esi
; FALLBACK21-NEXT:    movl 68(%esp,%ebp), %eax
; FALLBACK21-NEXT:    movl %eax, %edx
; FALLBACK21-NEXT:    shrdl %cl, %esi, %edx
; FALLBACK21-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK21-NEXT:    shrdl %cl, %eax, %edi
; FALLBACK21-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK21-NEXT:    movl 80(%esp,%ebp), %edi
; FALLBACK21-NEXT:    movl 76(%esp,%ebp), %eax
; FALLBACK21-NEXT:    movl %eax, %edx
; FALLBACK21-NEXT:    shrdl %cl, %edi, %edx
; FALLBACK21-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK21-NEXT:    shrdl %cl, %eax, %esi
; FALLBACK21-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK21-NEXT:    movl 88(%esp,%ebp), %esi
; FALLBACK21-NEXT:    movl 84(%esp,%ebp), %eax
; FALLBACK21-NEXT:    movl %eax, %edx
; FALLBACK21-NEXT:    shrdl %cl, %esi, %edx
; FALLBACK21-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK21-NEXT:    movl %esi, %edx
; FALLBACK21-NEXT:    shrdl %cl, %eax, %edi
; FALLBACK21-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK21-NEXT:    movl 96(%esp,%ebp), %esi
; FALLBACK21-NEXT:    movl 92(%esp,%ebp), %eax
; FALLBACK21-NEXT:    movl %eax, %edi
; FALLBACK21-NEXT:    shrdl %cl, %esi, %edi
; FALLBACK21-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK21-NEXT:    shrdl %cl, %eax, %edx
; FALLBACK21-NEXT:    movl %edx, (%esp) # 4-byte Spill
; FALLBACK21-NEXT:    movl 104(%esp,%ebp), %edx
; FALLBACK21-NEXT:    movl 100(%esp,%ebp), %eax
; FALLBACK21-NEXT:    movl %eax, %edi
; FALLBACK21-NEXT:    shrdl %cl, %edx, %edi
; FALLBACK21-NEXT:    shrdl %cl, %eax, %esi
; FALLBACK21-NEXT:    movl 48(%esp,%ebp), %ebx
; FALLBACK21-NEXT:    movl 108(%esp,%ebp), %eax
; FALLBACK21-NEXT:    shrdl %cl, %eax, %edx
; FALLBACK21-NEXT:    movl {{[0-9]+}}(%esp), %ebp
; FALLBACK21-NEXT:    movl %edx, 56(%ebp)
; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; FALLBACK21-NEXT:    shrdl %cl, %edx, %ebx
; FALLBACK21-NEXT:    # kill: def $cl killed $cl killed $ecx
; FALLBACK21-NEXT:    sarl %cl, %eax
; FALLBACK21-NEXT:    movl %eax, 60(%ebp)
; FALLBACK21-NEXT:    movl %esi, 48(%ebp)
; FALLBACK21-NEXT:    movl %edi, 52(%ebp)
; FALLBACK21-NEXT:    movl (%esp), %eax # 4-byte Reload
; FALLBACK21-NEXT:    movl %eax, 40(%ebp)
; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK21-NEXT:    movl %eax, 44(%ebp)
; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK21-NEXT:    movl %eax, 32(%ebp)
; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK21-NEXT:    movl %eax, 36(%ebp)
; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK21-NEXT:    movl %eax, 24(%ebp)
; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK21-NEXT:    movl %eax, 28(%ebp)
; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK21-NEXT:    movl %eax, 16(%ebp)
; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK21-NEXT:    movl %eax, 20(%ebp)
; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK21-NEXT:    movl %eax, 8(%ebp)
; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK21-NEXT:    movl %eax, 12(%ebp)
; FALLBACK21-NEXT:    movl %ebx, (%ebp)
; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK21-NEXT:    movl %eax, 4(%ebp)
; FALLBACK21-NEXT:    addl $188, %esp
; FALLBACK21-NEXT:    popl %esi
; FALLBACK21-NEXT:    popl %edi
; FALLBACK21-NEXT:    popl %ebx
; FALLBACK21-NEXT:    popl %ebp
; FALLBACK21-NEXT:    retl
;
; Expected llc output for the function labelled ashr_64bytes under the
; FALLBACK22 check prefix. The listing is 32-bit x86 code that copies the
; value with SSE movups/movaps, shifts with the BMI2 shrxl/shlxl/sarxl
; instructions and contains no shrdl. Presumably this is the 32-bit
; SSE + BMI2 + slow-SHLD configuration - TODO confirm against the RUN lines.
; These assertions are autogenerated (see the NOTE at the top of the file),
; so regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand.
; FALLBACK22-LABEL: ashr_64bytes:
; FALLBACK22:       # %bb.0:
; FALLBACK22-NEXT:    pushl %ebp
; FALLBACK22-NEXT:    pushl %ebx
; FALLBACK22-NEXT:    pushl %edi
; FALLBACK22-NEXT:    pushl %esi
; FALLBACK22-NEXT:    subl $204, %esp
; Fetch two pointer arguments, read the 64 source bytes through %ecx and
; the first dword behind %eax (used below as the shift amount).
; FALLBACK22-NEXT:    movl {{[0-9]+}}(%esp), %eax
; FALLBACK22-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; FALLBACK22-NEXT:    movups (%ecx), %xmm0
; FALLBACK22-NEXT:    movups 16(%ecx), %xmm1
; FALLBACK22-NEXT:    movups 32(%ecx), %xmm2
; FALLBACK22-NEXT:    movl 48(%ecx), %edx
; FALLBACK22-NEXT:    movl 52(%ecx), %esi
; FALLBACK22-NEXT:    movl 56(%ecx), %edi
; FALLBACK22-NEXT:    movl 60(%ecx), %ecx
; FALLBACK22-NEXT:    movl (%eax), %eax
; Copy the loaded value into stack slots.
; FALLBACK22-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK22-NEXT:    movl %edi, {{[0-9]+}}(%esp)
; FALLBACK22-NEXT:    movl %esi, {{[0-9]+}}(%esp)
; FALLBACK22-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK22-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
; FALLBACK22-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
; FALLBACK22-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
; Turn the top source dword into its sign mask (sarl $31) and store that
; mask to 16 stack slots.
; FALLBACK22-NEXT:    sarl $31, %ecx
; FALLBACK22-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK22-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK22-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK22-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK22-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK22-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK22-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK22-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK22-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK22-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK22-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK22-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK22-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK22-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK22-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK22-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; %edx = (amount * 8) & 24 is the count for the shrxl/sarxl ops,
; %ecx = amount & 60 indexes the stack copy, and %bl = ~%dl is the count
; for the shlxl ops. Each result dword below is formed as
; (dword >> %edx) | ((next dword doubled) << %ebx); only the top dword
; uses the arithmetic sarxl.
; FALLBACK22-NEXT:    movl %eax, %ecx
; FALLBACK22-NEXT:    leal (,%eax,8), %edx
; FALLBACK22-NEXT:    andl $24, %edx
; FALLBACK22-NEXT:    andl $60, %ecx
; FALLBACK22-NEXT:    movl 68(%esp,%ecx), %esi
; FALLBACK22-NEXT:    movl 72(%esp,%ecx), %edi
; FALLBACK22-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK22-NEXT:    shrxl %edx, %esi, %eax
; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK22-NEXT:    movl %edx, %ebx
; FALLBACK22-NEXT:    notb %bl
; FALLBACK22-NEXT:    leal (%edi,%edi), %ebp
; FALLBACK22-NEXT:    shlxl %ebx, %ebp, %eax
; FALLBACK22-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK22-NEXT:    shrxl %edx, 64(%esp,%ecx), %edi
; FALLBACK22-NEXT:    addl %esi, %esi
; FALLBACK22-NEXT:    shlxl %ebx, %esi, %eax
; FALLBACK22-NEXT:    orl %edi, %eax
; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK22-NEXT:    movl 80(%esp,%ecx), %esi
; FALLBACK22-NEXT:    leal (%esi,%esi), %edi
; FALLBACK22-NEXT:    shlxl %ebx, %edi, %eax
; FALLBACK22-NEXT:    movl 76(%esp,%ecx), %edi
; FALLBACK22-NEXT:    shrxl %edx, %edi, %ebp
; FALLBACK22-NEXT:    orl %ebp, %eax
; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK22-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; FALLBACK22-NEXT:    addl %edi, %edi
; FALLBACK22-NEXT:    shlxl %ebx, %edi, %edi
; FALLBACK22-NEXT:    orl %eax, %edi
; FALLBACK22-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK22-NEXT:    movl 88(%esp,%ecx), %eax
; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK22-NEXT:    leal (%eax,%eax), %edi
; FALLBACK22-NEXT:    shlxl %ebx, %edi, %eax
; FALLBACK22-NEXT:    movl 84(%esp,%ecx), %edi
; FALLBACK22-NEXT:    shrxl %edx, %edi, %ebp
; FALLBACK22-NEXT:    orl %ebp, %eax
; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK22-NEXT:    shrxl %edx, %esi, %esi
; FALLBACK22-NEXT:    addl %edi, %edi
; FALLBACK22-NEXT:    shlxl %ebx, %edi, %eax
; FALLBACK22-NEXT:    orl %esi, %eax
; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK22-NEXT:    movl 96(%esp,%ecx), %esi
; FALLBACK22-NEXT:    leal (%esi,%esi), %edi
; FALLBACK22-NEXT:    shlxl %ebx, %edi, %eax
; FALLBACK22-NEXT:    movl 92(%esp,%ecx), %edi
; FALLBACK22-NEXT:    shrxl %edx, %edi, %ebp
; FALLBACK22-NEXT:    orl %ebp, %eax
; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK22-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; FALLBACK22-NEXT:    addl %edi, %edi
; FALLBACK22-NEXT:    shlxl %ebx, %edi, %edi
; FALLBACK22-NEXT:    orl %eax, %edi
; FALLBACK22-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK22-NEXT:    movl 104(%esp,%ecx), %eax
; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK22-NEXT:    leal (%eax,%eax), %edi
; FALLBACK22-NEXT:    shlxl %ebx, %edi, %eax
; FALLBACK22-NEXT:    movl 100(%esp,%ecx), %edi
; FALLBACK22-NEXT:    shrxl %edx, %edi, %ebp
; FALLBACK22-NEXT:    orl %ebp, %eax
; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK22-NEXT:    shrxl %edx, %esi, %esi
; FALLBACK22-NEXT:    addl %edi, %edi
; FALLBACK22-NEXT:    shlxl %ebx, %edi, %eax
; FALLBACK22-NEXT:    orl %esi, %eax
; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK22-NEXT:    movl 112(%esp,%ecx), %eax
; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK22-NEXT:    leal (%eax,%eax), %esi
; FALLBACK22-NEXT:    shlxl %ebx, %esi, %eax
; FALLBACK22-NEXT:    movl 108(%esp,%ecx), %esi
; FALLBACK22-NEXT:    movl %ecx, %edi
; FALLBACK22-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK22-NEXT:    shrxl %edx, %esi, %ebp
; FALLBACK22-NEXT:    orl %ebp, %eax
; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK22-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
; FALLBACK22-NEXT:    addl %esi, %esi
; FALLBACK22-NEXT:    shlxl %ebx, %esi, %esi
; FALLBACK22-NEXT:    orl %ecx, %esi
; FALLBACK22-NEXT:    movl 120(%esp,%edi), %ebp
; FALLBACK22-NEXT:    leal (%ebp,%ebp), %ecx
; FALLBACK22-NEXT:    shlxl %ebx, %ecx, %ecx
; FALLBACK22-NEXT:    movl 116(%esp,%edi), %eax
; FALLBACK22-NEXT:    shrxl %edx, %eax, %edi
; FALLBACK22-NEXT:    orl %edi, %ecx
; FALLBACK22-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
; FALLBACK22-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK22-NEXT:    addl %eax, %eax
; FALLBACK22-NEXT:    shlxl %ebx, %eax, %edi
; FALLBACK22-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
; FALLBACK22-NEXT:    shrxl %edx, %ebp, %eax
; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
; FALLBACK22-NEXT:    movl 124(%esp,%ebp), %ebp
; FALLBACK22-NEXT:    sarxl %edx, %ebp, %edx
; FALLBACK22-NEXT:    addl %ebp, %ebp
; FALLBACK22-NEXT:    shlxl %ebx, %ebp, %ebx
; FALLBACK22-NEXT:    orl %eax, %ebx
; Load the destination pointer and store the 16 result dwords
; (offsets 0 through 60).
; FALLBACK22-NEXT:    movl {{[0-9]+}}(%esp), %eax
; FALLBACK22-NEXT:    movl %edx, 60(%eax)
; FALLBACK22-NEXT:    movl %ebx, 56(%eax)
; FALLBACK22-NEXT:    movl %edi, 48(%eax)
; FALLBACK22-NEXT:    movl %ecx, 52(%eax)
; FALLBACK22-NEXT:    movl %esi, 40(%eax)
; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK22-NEXT:    movl %ecx, 44(%eax)
; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK22-NEXT:    movl %ecx, 32(%eax)
; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK22-NEXT:    movl %ecx, 36(%eax)
; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK22-NEXT:    movl %ecx, 24(%eax)
; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK22-NEXT:    movl %ecx, 28(%eax)
; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK22-NEXT:    movl %ecx, 16(%eax)
; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK22-NEXT:    movl %ecx, 20(%eax)
; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK22-NEXT:    movl %ecx, 8(%eax)
; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK22-NEXT:    movl %ecx, 12(%eax)
; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK22-NEXT:    movl %ecx, (%eax)
; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK22-NEXT:    movl %ecx, 4(%eax)
; Release the 204-byte frame and restore the four pushed registers.
; FALLBACK22-NEXT:    addl $204, %esp
; FALLBACK22-NEXT:    popl %esi
; FALLBACK22-NEXT:    popl %edi
; FALLBACK22-NEXT:    popl %ebx
; FALLBACK22-NEXT:    popl %ebp
; FALLBACK22-NEXT:    retl
;
; Expected llc output for the function labelled ashr_64bytes under the
; FALLBACK23 check prefix. The listing is 32-bit x86 code that copies the
; value with SSE movups/movaps, combines adjacent dwords with shrdl and
; shifts the top dword with the BMI2 sarxl instruction. Presumably this is
; the 32-bit SSE + BMI2 + fast-SHLD configuration - TODO confirm against
; the RUN lines.
; These assertions are autogenerated (see the NOTE at the top of the file),
; so regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand.
; FALLBACK23-LABEL: ashr_64bytes:
; FALLBACK23:       # %bb.0:
; FALLBACK23-NEXT:    pushl %ebp
; FALLBACK23-NEXT:    pushl %ebx
; FALLBACK23-NEXT:    pushl %edi
; FALLBACK23-NEXT:    pushl %esi
; FALLBACK23-NEXT:    subl $188, %esp
; Fetch two pointer arguments, read the 64 source bytes through %eax and
; the first dword behind %ecx (used below as the shift amount).
; FALLBACK23-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; FALLBACK23-NEXT:    movl {{[0-9]+}}(%esp), %eax
; FALLBACK23-NEXT:    movups (%eax), %xmm0
; FALLBACK23-NEXT:    movups 16(%eax), %xmm1
; FALLBACK23-NEXT:    movups 32(%eax), %xmm2
; FALLBACK23-NEXT:    movl 48(%eax), %edx
; FALLBACK23-NEXT:    movl 52(%eax), %esi
; FALLBACK23-NEXT:    movl 56(%eax), %edi
; FALLBACK23-NEXT:    movl 60(%eax), %eax
; FALLBACK23-NEXT:    movl (%ecx), %ecx
; Copy the loaded value into stack slots.
; FALLBACK23-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK23-NEXT:    movl %edi, {{[0-9]+}}(%esp)
; FALLBACK23-NEXT:    movl %esi, {{[0-9]+}}(%esp)
; FALLBACK23-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK23-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
; FALLBACK23-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
; FALLBACK23-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
; Turn the top source dword into its sign mask (sarl $31) and store that
; mask to 16 stack slots.
; FALLBACK23-NEXT:    sarl $31, %eax
; FALLBACK23-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK23-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK23-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK23-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK23-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK23-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK23-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK23-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK23-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK23-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK23-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK23-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK23-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK23-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK23-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK23-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; %ebp = amount & 60 indexes the stack copy and %ecx = (amount << 3) & 24
; is the shift count. Each shrdl below shifts one dword right by %cl
; while filling its vacated high bits from the next dword.
; FALLBACK23-NEXT:    movl %ecx, %ebp
; FALLBACK23-NEXT:    andl $60, %ebp
; FALLBACK23-NEXT:    movl 56(%esp,%ebp), %edx
; FALLBACK23-NEXT:    movl 52(%esp,%ebp), %eax
; FALLBACK23-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK23-NEXT:    shll $3, %ecx
; FALLBACK23-NEXT:    andl $24, %ecx
; FALLBACK23-NEXT:    shrdl %cl, %edx, %eax
; FALLBACK23-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK23-NEXT:    movl 64(%esp,%ebp), %edi
; FALLBACK23-NEXT:    movl 60(%esp,%ebp), %eax
; FALLBACK23-NEXT:    movl %eax, %esi
; FALLBACK23-NEXT:    shrdl %cl, %edi, %esi
; FALLBACK23-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK23-NEXT:    shrdl %cl, %eax, %edx
; FALLBACK23-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK23-NEXT:    movl 72(%esp,%ebp), %esi
; FALLBACK23-NEXT:    movl 68(%esp,%ebp), %eax
; FALLBACK23-NEXT:    movl %eax, %edx
; FALLBACK23-NEXT:    shrdl %cl, %esi, %edx
; FALLBACK23-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK23-NEXT:    shrdl %cl, %eax, %edi
; FALLBACK23-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK23-NEXT:    movl 80(%esp,%ebp), %edi
; FALLBACK23-NEXT:    movl 76(%esp,%ebp), %eax
; FALLBACK23-NEXT:    movl %eax, %edx
; FALLBACK23-NEXT:    shrdl %cl, %edi, %edx
; FALLBACK23-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK23-NEXT:    shrdl %cl, %eax, %esi
; FALLBACK23-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK23-NEXT:    movl 88(%esp,%ebp), %ebx
; FALLBACK23-NEXT:    movl 84(%esp,%ebp), %eax
; FALLBACK23-NEXT:    movl %eax, %edx
; FALLBACK23-NEXT:    shrdl %cl, %ebx, %edx
; FALLBACK23-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK23-NEXT:    shrdl %cl, %eax, %edi
; FALLBACK23-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK23-NEXT:    movl 96(%esp,%ebp), %esi
; FALLBACK23-NEXT:    movl 92(%esp,%ebp), %eax
; FALLBACK23-NEXT:    movl %eax, %edx
; FALLBACK23-NEXT:    shrdl %cl, %esi, %edx
; FALLBACK23-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK23-NEXT:    shrdl %cl, %eax, %ebx
; FALLBACK23-NEXT:    movl 104(%esp,%ebp), %eax
; FALLBACK23-NEXT:    movl 100(%esp,%ebp), %edi
; FALLBACK23-NEXT:    movl %edi, %edx
; FALLBACK23-NEXT:    shrdl %cl, %eax, %edx
; FALLBACK23-NEXT:    shrdl %cl, %edi, %esi
; FALLBACK23-NEXT:    movl 48(%esp,%ebp), %edi
; FALLBACK23-NEXT:    movl 108(%esp,%ebp), %ebp
; FALLBACK23-NEXT:    movl %ebp, (%esp) # 4-byte Spill
; FALLBACK23-NEXT:    shrdl %cl, %ebp, %eax
; Load the destination pointer into %ebp (the index is no longer needed)
; and store the result dwords; the top dword is produced by sarxl from the
; value spilled at (%esp) and stored last at offset 60.
; FALLBACK23-NEXT:    movl {{[0-9]+}}(%esp), %ebp
; FALLBACK23-NEXT:    movl %eax, 56(%ebp)
; FALLBACK23-NEXT:    movl %esi, 48(%ebp)
; FALLBACK23-NEXT:    movl %edx, 52(%ebp)
; FALLBACK23-NEXT:    movl %ebx, 40(%ebp)
; FALLBACK23-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK23-NEXT:    movl %eax, 44(%ebp)
; FALLBACK23-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK23-NEXT:    movl %eax, 32(%ebp)
; FALLBACK23-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK23-NEXT:    movl %eax, 36(%ebp)
; FALLBACK23-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK23-NEXT:    movl %eax, 24(%ebp)
; FALLBACK23-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK23-NEXT:    movl %eax, 28(%ebp)
; FALLBACK23-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK23-NEXT:    movl %eax, 16(%ebp)
; FALLBACK23-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK23-NEXT:    movl %eax, 20(%ebp)
; FALLBACK23-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK23-NEXT:    movl %eax, 8(%ebp)
; FALLBACK23-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK23-NEXT:    movl %eax, 12(%ebp)
; FALLBACK23-NEXT:    sarxl %ecx, (%esp), %eax # 4-byte Folded Reload
; FALLBACK23-NEXT:    # kill: def $cl killed $cl killed $ecx
; FALLBACK23-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; FALLBACK23-NEXT:    shrdl %cl, %edx, %edi
; FALLBACK23-NEXT:    movl %edi, (%ebp)
; FALLBACK23-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK23-NEXT:    movl %ecx, 4(%ebp)
; FALLBACK23-NEXT:    movl %eax, 60(%ebp)
; Release the 188-byte frame and restore the four pushed registers.
; FALLBACK23-NEXT:    addl $188, %esp
; FALLBACK23-NEXT:    popl %esi
; FALLBACK23-NEXT:    popl %edi
; FALLBACK23-NEXT:    popl %ebx
; FALLBACK23-NEXT:    popl %ebp
; FALLBACK23-NEXT:    retl
;
; Expected llc output for the function labelled ashr_64bytes under the
; FALLBACK24 check prefix. The listing is 32-bit x86 code that copies the
; value with AVX vmovups/vmovaps (one ymm plus one xmm register), shifts
; with the legacy shrl/shll/sarl %cl forms, contains no shrdl and no BMI2
; shifts, and ends with vzeroupper. Presumably this is a 32-bit
; AVX + no-BMI2 + slow-SHLD configuration - TODO confirm against the RUN
; lines.
; These assertions are autogenerated (see the NOTE at the top of the file),
; so regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand.
; FALLBACK24-LABEL: ashr_64bytes:
; FALLBACK24:       # %bb.0:
; FALLBACK24-NEXT:    pushl %ebp
; FALLBACK24-NEXT:    pushl %ebx
; FALLBACK24-NEXT:    pushl %edi
; FALLBACK24-NEXT:    pushl %esi
; FALLBACK24-NEXT:    subl $204, %esp
; Fetch two pointer arguments, read the 64 source bytes through %ecx and
; the first dword behind %eax (used below as the shift amount).
; FALLBACK24-NEXT:    movl {{[0-9]+}}(%esp), %eax
; FALLBACK24-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; FALLBACK24-NEXT:    vmovups (%ecx), %ymm0
; FALLBACK24-NEXT:    vmovups 32(%ecx), %xmm1
; FALLBACK24-NEXT:    movl 48(%ecx), %edx
; FALLBACK24-NEXT:    movl 52(%ecx), %esi
; FALLBACK24-NEXT:    movl 56(%ecx), %edi
; FALLBACK24-NEXT:    movl 60(%ecx), %ecx
; FALLBACK24-NEXT:    movl (%eax), %eax
; Copy the loaded value into stack slots.
; FALLBACK24-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK24-NEXT:    movl %edi, {{[0-9]+}}(%esp)
; FALLBACK24-NEXT:    movl %esi, {{[0-9]+}}(%esp)
; FALLBACK24-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK24-NEXT:    vmovaps %xmm1, {{[0-9]+}}(%esp)
; FALLBACK24-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
; Turn the top source dword into its sign mask (sarl $31) and store that
; mask to 16 stack slots.
; FALLBACK24-NEXT:    sarl $31, %ecx
; FALLBACK24-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK24-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK24-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK24-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK24-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK24-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK24-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK24-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK24-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK24-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK24-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK24-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK24-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK24-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK24-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK24-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; %esi = amount & 60 indexes the stack copy and %eax = (amount << 3) & 24
; is the right-shift count; %ch holds its bitwise complement, used as the
; left-shift count. Because shrl/shll take their count in %cl, the code
; keeps copying either count into %cl before each shift. Each result dword
; is (dword >> count) | ((next dword doubled) << ~count).
; FALLBACK24-NEXT:    movl %eax, %esi
; FALLBACK24-NEXT:    andl $60, %esi
; FALLBACK24-NEXT:    movl 68(%esp,%esi), %edx
; FALLBACK24-NEXT:    shll $3, %eax
; FALLBACK24-NEXT:    andl $24, %eax
; FALLBACK24-NEXT:    movl %edx, %edi
; FALLBACK24-NEXT:    movl %eax, %ecx
; FALLBACK24-NEXT:    shrl %cl, %edi
; FALLBACK24-NEXT:    movl 72(%esp,%esi), %ecx
; FALLBACK24-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK24-NEXT:    leal (%ecx,%ecx), %ebx
; FALLBACK24-NEXT:    movb %al, %ch
; FALLBACK24-NEXT:    notb %ch
; FALLBACK24-NEXT:    movb %ch, %cl
; FALLBACK24-NEXT:    shll %cl, %ebx
; FALLBACK24-NEXT:    orl %edi, %ebx
; FALLBACK24-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK24-NEXT:    movl 64(%esp,%esi), %edi
; FALLBACK24-NEXT:    movb %al, %cl
; FALLBACK24-NEXT:    shrl %cl, %edi
; FALLBACK24-NEXT:    addl %edx, %edx
; FALLBACK24-NEXT:    movb %ch, %cl
; FALLBACK24-NEXT:    shll %cl, %edx
; FALLBACK24-NEXT:    orl %edi, %edx
; FALLBACK24-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK24-NEXT:    movl 76(%esp,%esi), %edx
; FALLBACK24-NEXT:    movl %edx, %ebp
; FALLBACK24-NEXT:    movb %al, %cl
; FALLBACK24-NEXT:    shrl %cl, %ebp
; FALLBACK24-NEXT:    movl 80(%esp,%esi), %edi
; FALLBACK24-NEXT:    leal (%edi,%edi), %ebx
; FALLBACK24-NEXT:    movb %ch, %cl
; FALLBACK24-NEXT:    shll %cl, %ebx
; FALLBACK24-NEXT:    orl %ebp, %ebx
; FALLBACK24-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK24-NEXT:    movb %al, %cl
; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
; FALLBACK24-NEXT:    shrl %cl, %ebx
; FALLBACK24-NEXT:    addl %edx, %edx
; FALLBACK24-NEXT:    movb %ch, %cl
; FALLBACK24-NEXT:    shll %cl, %edx
; FALLBACK24-NEXT:    orl %ebx, %edx
; FALLBACK24-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK24-NEXT:    movl 84(%esp,%esi), %ebx
; FALLBACK24-NEXT:    movl %ebx, %ebp
; FALLBACK24-NEXT:    movl %eax, %edx
; FALLBACK24-NEXT:    movb %dl, %cl
; FALLBACK24-NEXT:    shrl %cl, %ebp
; FALLBACK24-NEXT:    movl 88(%esp,%esi), %eax
; FALLBACK24-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK24-NEXT:    addl %eax, %eax
; FALLBACK24-NEXT:    movb %ch, %cl
; FALLBACK24-NEXT:    shll %cl, %eax
; FALLBACK24-NEXT:    orl %ebp, %eax
; FALLBACK24-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK24-NEXT:    movb %dl, %cl
; FALLBACK24-NEXT:    shrl %cl, %edi
; FALLBACK24-NEXT:    addl %ebx, %ebx
; FALLBACK24-NEXT:    movb %ch, %cl
; FALLBACK24-NEXT:    movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
; FALLBACK24-NEXT:    shll %cl, %ebx
; FALLBACK24-NEXT:    orl %edi, %ebx
; FALLBACK24-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK24-NEXT:    movl 92(%esp,%esi), %ebx
; FALLBACK24-NEXT:    movl %ebx, %ebp
; FALLBACK24-NEXT:    movb %dl, %cl
; FALLBACK24-NEXT:    shrl %cl, %ebp
; FALLBACK24-NEXT:    movl 96(%esp,%esi), %edi
; FALLBACK24-NEXT:    leal (%edi,%edi), %eax
; FALLBACK24-NEXT:    movb %ch, %cl
; FALLBACK24-NEXT:    shll %cl, %eax
; FALLBACK24-NEXT:    orl %ebp, %eax
; FALLBACK24-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK24-NEXT:    movb %dl, %cl
; FALLBACK24-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK24-NEXT:    shrl %cl, %eax
; FALLBACK24-NEXT:    addl %ebx, %ebx
; FALLBACK24-NEXT:    movb %ch, %cl
; FALLBACK24-NEXT:    shll %cl, %ebx
; FALLBACK24-NEXT:    orl %eax, %ebx
; FALLBACK24-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK24-NEXT:    movl 100(%esp,%esi), %ebx
; FALLBACK24-NEXT:    movl %ebx, %ebp
; FALLBACK24-NEXT:    movb %dl, %cl
; FALLBACK24-NEXT:    shrl %cl, %ebp
; FALLBACK24-NEXT:    movl 104(%esp,%esi), %edx
; FALLBACK24-NEXT:    leal (%edx,%edx), %eax
; FALLBACK24-NEXT:    movb %ch, %cl
; FALLBACK24-NEXT:    shll %cl, %eax
; FALLBACK24-NEXT:    orl %ebp, %eax
; FALLBACK24-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK24-NEXT:    movb %al, %cl
; FALLBACK24-NEXT:    shrl %cl, %edi
; FALLBACK24-NEXT:    addl %ebx, %ebx
; FALLBACK24-NEXT:    movb %ch, %cl
; FALLBACK24-NEXT:    shll %cl, %ebx
; FALLBACK24-NEXT:    orl %edi, %ebx
; FALLBACK24-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK24-NEXT:    movl 108(%esp,%esi), %edi
; FALLBACK24-NEXT:    movl %edi, %ebp
; FALLBACK24-NEXT:    movl %eax, %ecx
; FALLBACK24-NEXT:    shrl %cl, %ebp
; FALLBACK24-NEXT:    movl 112(%esp,%esi), %ecx
; FALLBACK24-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK24-NEXT:    leal (%ecx,%ecx), %ebx
; FALLBACK24-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
; FALLBACK24-NEXT:    movb %ch, %cl
; FALLBACK24-NEXT:    shll %cl, %ebx
; FALLBACK24-NEXT:    orl %ebp, %ebx
; FALLBACK24-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK24-NEXT:    movb %al, %cl
; FALLBACK24-NEXT:    shrl %cl, %edx
; FALLBACK24-NEXT:    addl %edi, %edi
; FALLBACK24-NEXT:    movb %ch, %cl
; FALLBACK24-NEXT:    shll %cl, %edi
; FALLBACK24-NEXT:    orl %edx, %edi
; FALLBACK24-NEXT:    movl %esi, %edx
; FALLBACK24-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK24-NEXT:    movl 116(%esp,%esi), %esi
; FALLBACK24-NEXT:    movl %esi, %ebx
; FALLBACK24-NEXT:    movb %al, %cl
; FALLBACK24-NEXT:    shrl %cl, %ebx
; FALLBACK24-NEXT:    movl 120(%esp,%edx), %eax
; FALLBACK24-NEXT:    leal (%eax,%eax), %ebp
; FALLBACK24-NEXT:    movb %ch, %cl
; FALLBACK24-NEXT:    shll %cl, %ebp
; FALLBACK24-NEXT:    orl %ebx, %ebp
; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; FALLBACK24-NEXT:    movb %dl, %cl
; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
; FALLBACK24-NEXT:    shrl %cl, %ebx
; FALLBACK24-NEXT:    addl %esi, %esi
; FALLBACK24-NEXT:    movb %ch, %cl
; FALLBACK24-NEXT:    shll %cl, %esi
; FALLBACK24-NEXT:    orl %ebx, %esi
; FALLBACK24-NEXT:    movb %dl, %cl
; FALLBACK24-NEXT:    shrl %cl, %eax
; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; FALLBACK24-NEXT:    movl 124(%esp,%edx), %ebx
; FALLBACK24-NEXT:    leal (%ebx,%ebx), %edx
; FALLBACK24-NEXT:    movb %ch, %cl
; FALLBACK24-NEXT:    shll %cl, %edx
; FALLBACK24-NEXT:    orl %eax, %edx
; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK24-NEXT:    # kill: def $cl killed $cl killed $ecx
; The top dword is the only one shifted arithmetically (sarl).
; FALLBACK24-NEXT:    sarl %cl, %ebx
; Load the destination pointer and store the 16 result dwords
; (offsets 0 through 60).
; FALLBACK24-NEXT:    movl {{[0-9]+}}(%esp), %eax
; FALLBACK24-NEXT:    movl %ebx, 60(%eax)
; FALLBACK24-NEXT:    movl %edx, 56(%eax)
; FALLBACK24-NEXT:    movl %esi, 48(%eax)
; FALLBACK24-NEXT:    movl %ebp, 52(%eax)
; FALLBACK24-NEXT:    movl %edi, 40(%eax)
; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK24-NEXT:    movl %ecx, 44(%eax)
; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK24-NEXT:    movl %ecx, 32(%eax)
; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK24-NEXT:    movl %ecx, 36(%eax)
; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK24-NEXT:    movl %ecx, 24(%eax)
; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK24-NEXT:    movl %ecx, 28(%eax)
; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK24-NEXT:    movl %ecx, 16(%eax)
; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK24-NEXT:    movl %ecx, 20(%eax)
; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK24-NEXT:    movl %ecx, 8(%eax)
; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK24-NEXT:    movl %ecx, 12(%eax)
; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK24-NEXT:    movl %ecx, (%eax)
; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK24-NEXT:    movl %ecx, 4(%eax)
; Release the 204-byte frame, restore the four pushed registers and issue
; vzeroupper before returning (a ymm register was used above).
; FALLBACK24-NEXT:    addl $204, %esp
; FALLBACK24-NEXT:    popl %esi
; FALLBACK24-NEXT:    popl %edi
; FALLBACK24-NEXT:    popl %ebx
; FALLBACK24-NEXT:    popl %ebp
; FALLBACK24-NEXT:    vzeroupper
; FALLBACK24-NEXT:    retl
;
; Expected llc output for the function labelled ashr_64bytes under the
; FALLBACK25 check prefix. The listing is 32-bit x86 code that copies the
; value with AVX vmovups/vmovaps (one ymm plus one xmm register), combines
; adjacent dwords with shrdl, shifts the top dword with sarl %cl, uses no
; BMI2 shifts, and ends with vzeroupper. Presumably this is a 32-bit
; AVX + no-BMI2 + fast-SHLD configuration - TODO confirm against the RUN
; lines.
; These assertions are autogenerated (see the NOTE at the top of the file),
; so regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand.
; FALLBACK25-LABEL: ashr_64bytes:
; FALLBACK25:       # %bb.0:
; FALLBACK25-NEXT:    pushl %ebp
; FALLBACK25-NEXT:    pushl %ebx
; FALLBACK25-NEXT:    pushl %edi
; FALLBACK25-NEXT:    pushl %esi
; FALLBACK25-NEXT:    subl $188, %esp
; Fetch two pointer arguments, read the 64 source bytes through %eax and
; the first dword behind %ecx (used below as the shift amount).
; FALLBACK25-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; FALLBACK25-NEXT:    movl {{[0-9]+}}(%esp), %eax
; FALLBACK25-NEXT:    vmovups (%eax), %ymm0
; FALLBACK25-NEXT:    vmovups 32(%eax), %xmm1
; FALLBACK25-NEXT:    movl 48(%eax), %edx
; FALLBACK25-NEXT:    movl 52(%eax), %esi
; FALLBACK25-NEXT:    movl 56(%eax), %edi
; FALLBACK25-NEXT:    movl 60(%eax), %eax
; FALLBACK25-NEXT:    movl (%ecx), %ecx
; Copy the loaded value into stack slots.
; FALLBACK25-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK25-NEXT:    movl %edi, {{[0-9]+}}(%esp)
; FALLBACK25-NEXT:    movl %esi, {{[0-9]+}}(%esp)
; FALLBACK25-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK25-NEXT:    vmovaps %xmm1, {{[0-9]+}}(%esp)
; FALLBACK25-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
; Turn the top source dword into its sign mask (sarl $31) and store that
; mask to 16 stack slots.
; FALLBACK25-NEXT:    sarl $31, %eax
; FALLBACK25-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK25-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK25-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK25-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK25-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK25-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK25-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK25-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK25-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK25-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK25-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK25-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK25-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK25-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK25-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK25-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; %ebp = amount & 60 indexes the stack copy and %ecx = (amount << 3) & 24
; is the shift count. Each shrdl below shifts one dword right by %cl
; while filling its vacated high bits from the next dword.
; FALLBACK25-NEXT:    movl %ecx, %ebp
; FALLBACK25-NEXT:    andl $60, %ebp
; FALLBACK25-NEXT:    movl 56(%esp,%ebp), %edx
; FALLBACK25-NEXT:    movl 52(%esp,%ebp), %eax
; FALLBACK25-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK25-NEXT:    shll $3, %ecx
; FALLBACK25-NEXT:    andl $24, %ecx
; FALLBACK25-NEXT:    shrdl %cl, %edx, %eax
; FALLBACK25-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK25-NEXT:    movl 64(%esp,%ebp), %edi
; FALLBACK25-NEXT:    movl 60(%esp,%ebp), %eax
; FALLBACK25-NEXT:    movl %eax, %esi
; FALLBACK25-NEXT:    shrdl %cl, %edi, %esi
; FALLBACK25-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK25-NEXT:    shrdl %cl, %eax, %edx
; FALLBACK25-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK25-NEXT:    movl 72(%esp,%ebp), %esi
; FALLBACK25-NEXT:    movl 68(%esp,%ebp), %eax
; FALLBACK25-NEXT:    movl %eax, %edx
; FALLBACK25-NEXT:    shrdl %cl, %esi, %edx
; FALLBACK25-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK25-NEXT:    shrdl %cl, %eax, %edi
; FALLBACK25-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK25-NEXT:    movl 80(%esp,%ebp), %edi
; FALLBACK25-NEXT:    movl 76(%esp,%ebp), %eax
; FALLBACK25-NEXT:    movl %eax, %edx
; FALLBACK25-NEXT:    shrdl %cl, %edi, %edx
; FALLBACK25-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK25-NEXT:    shrdl %cl, %eax, %esi
; FALLBACK25-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK25-NEXT:    movl 88(%esp,%ebp), %esi
; FALLBACK25-NEXT:    movl 84(%esp,%ebp), %eax
; FALLBACK25-NEXT:    movl %eax, %edx
; FALLBACK25-NEXT:    shrdl %cl, %esi, %edx
; FALLBACK25-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK25-NEXT:    movl %esi, %edx
; FALLBACK25-NEXT:    shrdl %cl, %eax, %edi
; FALLBACK25-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK25-NEXT:    movl 96(%esp,%ebp), %esi
; FALLBACK25-NEXT:    movl 92(%esp,%ebp), %eax
; FALLBACK25-NEXT:    movl %eax, %edi
; FALLBACK25-NEXT:    shrdl %cl, %esi, %edi
; FALLBACK25-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK25-NEXT:    shrdl %cl, %eax, %edx
; FALLBACK25-NEXT:    movl %edx, (%esp) # 4-byte Spill
; FALLBACK25-NEXT:    movl 104(%esp,%ebp), %edx
; FALLBACK25-NEXT:    movl 100(%esp,%ebp), %eax
; FALLBACK25-NEXT:    movl %eax, %edi
; FALLBACK25-NEXT:    shrdl %cl, %edx, %edi
; FALLBACK25-NEXT:    shrdl %cl, %eax, %esi
; FALLBACK25-NEXT:    movl 48(%esp,%ebp), %ebx
; FALLBACK25-NEXT:    movl 108(%esp,%ebp), %eax
; FALLBACK25-NEXT:    shrdl %cl, %eax, %edx
; Load the destination pointer into %ebp (the index is no longer needed)
; and store the result dwords; the top dword is the only one shifted
; arithmetically (sarl).
; FALLBACK25-NEXT:    movl {{[0-9]+}}(%esp), %ebp
; FALLBACK25-NEXT:    movl %edx, 56(%ebp)
; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; FALLBACK25-NEXT:    shrdl %cl, %edx, %ebx
; FALLBACK25-NEXT:    # kill: def $cl killed $cl killed $ecx
; FALLBACK25-NEXT:    sarl %cl, %eax
; FALLBACK25-NEXT:    movl %eax, 60(%ebp)
; FALLBACK25-NEXT:    movl %esi, 48(%ebp)
; FALLBACK25-NEXT:    movl %edi, 52(%ebp)
; FALLBACK25-NEXT:    movl (%esp), %eax # 4-byte Reload
; FALLBACK25-NEXT:    movl %eax, 40(%ebp)
; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK25-NEXT:    movl %eax, 44(%ebp)
; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK25-NEXT:    movl %eax, 32(%ebp)
; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK25-NEXT:    movl %eax, 36(%ebp)
; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK25-NEXT:    movl %eax, 24(%ebp)
; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK25-NEXT:    movl %eax, 28(%ebp)
; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK25-NEXT:    movl %eax, 16(%ebp)
; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK25-NEXT:    movl %eax, 20(%ebp)
; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK25-NEXT:    movl %eax, 8(%ebp)
; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK25-NEXT:    movl %eax, 12(%ebp)
; FALLBACK25-NEXT:    movl %ebx, (%ebp)
; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK25-NEXT:    movl %eax, 4(%ebp)
; Release the 188-byte frame, restore the four pushed registers and issue
; vzeroupper before returning (a ymm register was used above).
; FALLBACK25-NEXT:    addl $188, %esp
; FALLBACK25-NEXT:    popl %esi
; FALLBACK25-NEXT:    popl %edi
; FALLBACK25-NEXT:    popl %ebx
; FALLBACK25-NEXT:    popl %ebp
; FALLBACK25-NEXT:    vzeroupper
; FALLBACK25-NEXT:    retl
;
; FALLBACK26-LABEL: ashr_64bytes:
; FALLBACK26:       # %bb.0:
; FALLBACK26-NEXT:    pushl %ebp
; FALLBACK26-NEXT:    pushl %ebx
; FALLBACK26-NEXT:    pushl %edi
; FALLBACK26-NEXT:    pushl %esi
; FALLBACK26-NEXT:    subl $204, %esp
; FALLBACK26-NEXT:    movl {{[0-9]+}}(%esp), %eax
; FALLBACK26-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; FALLBACK26-NEXT:    vmovups (%ecx), %ymm0
; FALLBACK26-NEXT:    vmovups 32(%ecx), %xmm1
; FALLBACK26-NEXT:    movl 48(%ecx), %edx
; FALLBACK26-NEXT:    movl 52(%ecx), %esi
; FALLBACK26-NEXT:    movl 56(%ecx), %edi
; FALLBACK26-NEXT:    movl 60(%ecx), %ecx
; FALLBACK26-NEXT:    movl (%eax), %eax
; FALLBACK26-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK26-NEXT:    movl %edi, {{[0-9]+}}(%esp)
; FALLBACK26-NEXT:    movl %esi, {{[0-9]+}}(%esp)
; FALLBACK26-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK26-NEXT:    vmovaps %xmm1, {{[0-9]+}}(%esp)
; FALLBACK26-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
; FALLBACK26-NEXT:    sarl $31, %ecx
; FALLBACK26-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK26-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK26-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK26-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK26-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK26-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK26-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK26-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK26-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK26-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK26-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK26-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK26-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK26-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK26-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK26-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK26-NEXT:    movl %eax, %ecx
; FALLBACK26-NEXT:    leal (,%eax,8), %edx
; FALLBACK26-NEXT:    andl $24, %edx
; FALLBACK26-NEXT:    andl $60, %ecx
; FALLBACK26-NEXT:    movl 68(%esp,%ecx), %esi
; FALLBACK26-NEXT:    movl 72(%esp,%ecx), %edi
; FALLBACK26-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK26-NEXT:    shrxl %edx, %esi, %eax
; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK26-NEXT:    movl %edx, %ebx
; FALLBACK26-NEXT:    notb %bl
; FALLBACK26-NEXT:    leal (%edi,%edi), %ebp
; FALLBACK26-NEXT:    shlxl %ebx, %ebp, %eax
; FALLBACK26-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK26-NEXT:    shrxl %edx, 64(%esp,%ecx), %edi
; FALLBACK26-NEXT:    addl %esi, %esi
; FALLBACK26-NEXT:    shlxl %ebx, %esi, %eax
; FALLBACK26-NEXT:    orl %edi, %eax
; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK26-NEXT:    movl 80(%esp,%ecx), %esi
; FALLBACK26-NEXT:    leal (%esi,%esi), %edi
; FALLBACK26-NEXT:    shlxl %ebx, %edi, %eax
; FALLBACK26-NEXT:    movl 76(%esp,%ecx), %edi
; FALLBACK26-NEXT:    shrxl %edx, %edi, %ebp
; FALLBACK26-NEXT:    orl %ebp, %eax
; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK26-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; FALLBACK26-NEXT:    addl %edi, %edi
; FALLBACK26-NEXT:    shlxl %ebx, %edi, %edi
; FALLBACK26-NEXT:    orl %eax, %edi
; FALLBACK26-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK26-NEXT:    movl 88(%esp,%ecx), %eax
; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK26-NEXT:    leal (%eax,%eax), %edi
; FALLBACK26-NEXT:    shlxl %ebx, %edi, %eax
; FALLBACK26-NEXT:    movl 84(%esp,%ecx), %edi
; FALLBACK26-NEXT:    shrxl %edx, %edi, %ebp
; FALLBACK26-NEXT:    orl %ebp, %eax
; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK26-NEXT:    shrxl %edx, %esi, %esi
; FALLBACK26-NEXT:    addl %edi, %edi
; FALLBACK26-NEXT:    shlxl %ebx, %edi, %eax
; FALLBACK26-NEXT:    orl %esi, %eax
; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK26-NEXT:    movl 96(%esp,%ecx), %esi
; FALLBACK26-NEXT:    leal (%esi,%esi), %edi
; FALLBACK26-NEXT:    shlxl %ebx, %edi, %eax
; FALLBACK26-NEXT:    movl 92(%esp,%ecx), %edi
; FALLBACK26-NEXT:    shrxl %edx, %edi, %ebp
; FALLBACK26-NEXT:    orl %ebp, %eax
; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK26-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; FALLBACK26-NEXT:    addl %edi, %edi
; FALLBACK26-NEXT:    shlxl %ebx, %edi, %edi
; FALLBACK26-NEXT:    orl %eax, %edi
; FALLBACK26-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK26-NEXT:    movl 104(%esp,%ecx), %eax
; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK26-NEXT:    leal (%eax,%eax), %edi
; FALLBACK26-NEXT:    shlxl %ebx, %edi, %eax
; FALLBACK26-NEXT:    movl 100(%esp,%ecx), %edi
; FALLBACK26-NEXT:    shrxl %edx, %edi, %ebp
; FALLBACK26-NEXT:    orl %ebp, %eax
; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK26-NEXT:    shrxl %edx, %esi, %esi
; FALLBACK26-NEXT:    addl %edi, %edi
; FALLBACK26-NEXT:    shlxl %ebx, %edi, %eax
; FALLBACK26-NEXT:    orl %esi, %eax
; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK26-NEXT:    movl 112(%esp,%ecx), %eax
; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK26-NEXT:    leal (%eax,%eax), %esi
; FALLBACK26-NEXT:    shlxl %ebx, %esi, %eax
; FALLBACK26-NEXT:    movl 108(%esp,%ecx), %esi
; FALLBACK26-NEXT:    movl %ecx, %edi
; FALLBACK26-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK26-NEXT:    shrxl %edx, %esi, %ebp
; FALLBACK26-NEXT:    orl %ebp, %eax
; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK26-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
; FALLBACK26-NEXT:    addl %esi, %esi
; FALLBACK26-NEXT:    shlxl %ebx, %esi, %esi
; FALLBACK26-NEXT:    orl %ecx, %esi
; FALLBACK26-NEXT:    movl 120(%esp,%edi), %ebp
; FALLBACK26-NEXT:    leal (%ebp,%ebp), %ecx
; FALLBACK26-NEXT:    shlxl %ebx, %ecx, %ecx
; FALLBACK26-NEXT:    movl 116(%esp,%edi), %eax
; FALLBACK26-NEXT:    shrxl %edx, %eax, %edi
; FALLBACK26-NEXT:    orl %edi, %ecx
; FALLBACK26-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
; FALLBACK26-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK26-NEXT:    addl %eax, %eax
; FALLBACK26-NEXT:    shlxl %ebx, %eax, %edi
; FALLBACK26-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
; FALLBACK26-NEXT:    shrxl %edx, %ebp, %eax
; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
; FALLBACK26-NEXT:    movl 124(%esp,%ebp), %ebp
; FALLBACK26-NEXT:    sarxl %edx, %ebp, %edx
; FALLBACK26-NEXT:    addl %ebp, %ebp
; FALLBACK26-NEXT:    shlxl %ebx, %ebp, %ebx
; FALLBACK26-NEXT:    orl %eax, %ebx
; FALLBACK26-NEXT:    movl {{[0-9]+}}(%esp), %eax
; FALLBACK26-NEXT:    movl %edx, 60(%eax)
; FALLBACK26-NEXT:    movl %ebx, 56(%eax)
; FALLBACK26-NEXT:    movl %edi, 48(%eax)
; FALLBACK26-NEXT:    movl %ecx, 52(%eax)
; FALLBACK26-NEXT:    movl %esi, 40(%eax)
; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK26-NEXT:    movl %ecx, 44(%eax)
; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK26-NEXT:    movl %ecx, 32(%eax)
; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK26-NEXT:    movl %ecx, 36(%eax)
; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK26-NEXT:    movl %ecx, 24(%eax)
; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK26-NEXT:    movl %ecx, 28(%eax)
; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK26-NEXT:    movl %ecx, 16(%eax)
; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK26-NEXT:    movl %ecx, 20(%eax)
; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK26-NEXT:    movl %ecx, 8(%eax)
; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK26-NEXT:    movl %ecx, 12(%eax)
; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK26-NEXT:    movl %ecx, (%eax)
; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK26-NEXT:    movl %ecx, 4(%eax)
; FALLBACK26-NEXT:    addl $204, %esp
; FALLBACK26-NEXT:    popl %esi
; FALLBACK26-NEXT:    popl %edi
; FALLBACK26-NEXT:    popl %ebx
; FALLBACK26-NEXT:    popl %ebp
; FALLBACK26-NEXT:    vzeroupper
; FALLBACK26-NEXT:    retl
;
; FALLBACK27-LABEL: ashr_64bytes:
; FALLBACK27:       # %bb.0:
; FALLBACK27-NEXT:    pushl %ebp
; FALLBACK27-NEXT:    pushl %ebx
; FALLBACK27-NEXT:    pushl %edi
; FALLBACK27-NEXT:    pushl %esi
; FALLBACK27-NEXT:    subl $188, %esp
; FALLBACK27-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; FALLBACK27-NEXT:    movl {{[0-9]+}}(%esp), %eax
; FALLBACK27-NEXT:    vmovups (%eax), %ymm0
; FALLBACK27-NEXT:    vmovups 32(%eax), %xmm1
; FALLBACK27-NEXT:    movl 48(%eax), %edx
; FALLBACK27-NEXT:    movl 52(%eax), %esi
; FALLBACK27-NEXT:    movl 56(%eax), %edi
; FALLBACK27-NEXT:    movl 60(%eax), %eax
; FALLBACK27-NEXT:    movl (%ecx), %ecx
; FALLBACK27-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK27-NEXT:    movl %edi, {{[0-9]+}}(%esp)
; FALLBACK27-NEXT:    movl %esi, {{[0-9]+}}(%esp)
; FALLBACK27-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK27-NEXT:    vmovaps %xmm1, {{[0-9]+}}(%esp)
; FALLBACK27-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
; FALLBACK27-NEXT:    sarl $31, %eax
; FALLBACK27-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK27-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK27-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK27-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK27-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK27-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK27-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK27-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK27-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK27-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK27-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK27-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK27-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK27-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK27-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK27-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK27-NEXT:    movl %ecx, %ebp
; FALLBACK27-NEXT:    andl $60, %ebp
; FALLBACK27-NEXT:    movl 56(%esp,%ebp), %edx
; FALLBACK27-NEXT:    movl 52(%esp,%ebp), %eax
; FALLBACK27-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK27-NEXT:    shll $3, %ecx
; FALLBACK27-NEXT:    andl $24, %ecx
; FALLBACK27-NEXT:    shrdl %cl, %edx, %eax
; FALLBACK27-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK27-NEXT:    movl 64(%esp,%ebp), %edi
; FALLBACK27-NEXT:    movl 60(%esp,%ebp), %eax
; FALLBACK27-NEXT:    movl %eax, %esi
; FALLBACK27-NEXT:    shrdl %cl, %edi, %esi
; FALLBACK27-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK27-NEXT:    shrdl %cl, %eax, %edx
; FALLBACK27-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK27-NEXT:    movl 72(%esp,%ebp), %esi
; FALLBACK27-NEXT:    movl 68(%esp,%ebp), %eax
; FALLBACK27-NEXT:    movl %eax, %edx
; FALLBACK27-NEXT:    shrdl %cl, %esi, %edx
; FALLBACK27-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK27-NEXT:    shrdl %cl, %eax, %edi
; FALLBACK27-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK27-NEXT:    movl 80(%esp,%ebp), %edi
; FALLBACK27-NEXT:    movl 76(%esp,%ebp), %eax
; FALLBACK27-NEXT:    movl %eax, %edx
; FALLBACK27-NEXT:    shrdl %cl, %edi, %edx
; FALLBACK27-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK27-NEXT:    shrdl %cl, %eax, %esi
; FALLBACK27-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK27-NEXT:    movl 88(%esp,%ebp), %ebx
; FALLBACK27-NEXT:    movl 84(%esp,%ebp), %eax
; FALLBACK27-NEXT:    movl %eax, %edx
; FALLBACK27-NEXT:    shrdl %cl, %ebx, %edx
; FALLBACK27-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK27-NEXT:    shrdl %cl, %eax, %edi
; FALLBACK27-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK27-NEXT:    movl 96(%esp,%ebp), %esi
; FALLBACK27-NEXT:    movl 92(%esp,%ebp), %eax
; FALLBACK27-NEXT:    movl %eax, %edx
; FALLBACK27-NEXT:    shrdl %cl, %esi, %edx
; FALLBACK27-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK27-NEXT:    shrdl %cl, %eax, %ebx
; FALLBACK27-NEXT:    movl 104(%esp,%ebp), %eax
; FALLBACK27-NEXT:    movl 100(%esp,%ebp), %edi
; FALLBACK27-NEXT:    movl %edi, %edx
; FALLBACK27-NEXT:    shrdl %cl, %eax, %edx
; FALLBACK27-NEXT:    shrdl %cl, %edi, %esi
; FALLBACK27-NEXT:    movl 48(%esp,%ebp), %edi
; FALLBACK27-NEXT:    movl 108(%esp,%ebp), %ebp
; FALLBACK27-NEXT:    movl %ebp, (%esp) # 4-byte Spill
; FALLBACK27-NEXT:    shrdl %cl, %ebp, %eax
; FALLBACK27-NEXT:    movl {{[0-9]+}}(%esp), %ebp
; FALLBACK27-NEXT:    movl %eax, 56(%ebp)
; FALLBACK27-NEXT:    movl %esi, 48(%ebp)
; FALLBACK27-NEXT:    movl %edx, 52(%ebp)
; FALLBACK27-NEXT:    movl %ebx, 40(%ebp)
; FALLBACK27-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK27-NEXT:    movl %eax, 44(%ebp)
; FALLBACK27-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK27-NEXT:    movl %eax, 32(%ebp)
; FALLBACK27-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK27-NEXT:    movl %eax, 36(%ebp)
; FALLBACK27-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK27-NEXT:    movl %eax, 24(%ebp)
; FALLBACK27-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK27-NEXT:    movl %eax, 28(%ebp)
; FALLBACK27-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK27-NEXT:    movl %eax, 16(%ebp)
; FALLBACK27-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK27-NEXT:    movl %eax, 20(%ebp)
; FALLBACK27-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK27-NEXT:    movl %eax, 8(%ebp)
; FALLBACK27-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK27-NEXT:    movl %eax, 12(%ebp)
; FALLBACK27-NEXT:    sarxl %ecx, (%esp), %eax # 4-byte Folded Reload
; FALLBACK27-NEXT:    # kill: def $cl killed $cl killed $ecx
; FALLBACK27-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; FALLBACK27-NEXT:    shrdl %cl, %edx, %edi
; FALLBACK27-NEXT:    movl %edi, (%ebp)
; FALLBACK27-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK27-NEXT:    movl %ecx, 4(%ebp)
; FALLBACK27-NEXT:    movl %eax, 60(%ebp)
; FALLBACK27-NEXT:    addl $188, %esp
; FALLBACK27-NEXT:    popl %esi
; FALLBACK27-NEXT:    popl %edi
; FALLBACK27-NEXT:    popl %ebx
; FALLBACK27-NEXT:    popl %ebp
; FALLBACK27-NEXT:    vzeroupper
; FALLBACK27-NEXT:    retl
;
; FALLBACK28-LABEL: ashr_64bytes:
; FALLBACK28:       # %bb.0:
; FALLBACK28-NEXT:    pushl %ebp
; FALLBACK28-NEXT:    pushl %ebx
; FALLBACK28-NEXT:    pushl %edi
; FALLBACK28-NEXT:    pushl %esi
; FALLBACK28-NEXT:    subl $204, %esp
; FALLBACK28-NEXT:    movl {{[0-9]+}}(%esp), %eax
; FALLBACK28-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; FALLBACK28-NEXT:    vmovups (%ecx), %ymm0
; FALLBACK28-NEXT:    vmovups 32(%ecx), %xmm1
; FALLBACK28-NEXT:    movl 48(%ecx), %edx
; FALLBACK28-NEXT:    movl 52(%ecx), %esi
; FALLBACK28-NEXT:    movl 56(%ecx), %edi
; FALLBACK28-NEXT:    movl 60(%ecx), %ecx
; FALLBACK28-NEXT:    movl (%eax), %eax
; FALLBACK28-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK28-NEXT:    movl %edi, {{[0-9]+}}(%esp)
; FALLBACK28-NEXT:    movl %esi, {{[0-9]+}}(%esp)
; FALLBACK28-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK28-NEXT:    vmovaps %xmm1, {{[0-9]+}}(%esp)
; FALLBACK28-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
; FALLBACK28-NEXT:    sarl $31, %ecx
; FALLBACK28-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK28-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK28-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK28-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK28-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK28-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK28-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK28-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK28-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK28-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK28-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK28-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK28-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK28-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK28-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK28-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK28-NEXT:    movl %eax, %esi
; FALLBACK28-NEXT:    andl $60, %esi
; FALLBACK28-NEXT:    movl 68(%esp,%esi), %edx
; FALLBACK28-NEXT:    shll $3, %eax
; FALLBACK28-NEXT:    andl $24, %eax
; FALLBACK28-NEXT:    movl %edx, %edi
; FALLBACK28-NEXT:    movl %eax, %ecx
; FALLBACK28-NEXT:    shrl %cl, %edi
; FALLBACK28-NEXT:    movl 72(%esp,%esi), %ecx
; FALLBACK28-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK28-NEXT:    leal (%ecx,%ecx), %ebx
; FALLBACK28-NEXT:    movb %al, %ch
; FALLBACK28-NEXT:    notb %ch
; FALLBACK28-NEXT:    movb %ch, %cl
; FALLBACK28-NEXT:    shll %cl, %ebx
; FALLBACK28-NEXT:    orl %edi, %ebx
; FALLBACK28-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK28-NEXT:    movl 64(%esp,%esi), %edi
; FALLBACK28-NEXT:    movb %al, %cl
; FALLBACK28-NEXT:    shrl %cl, %edi
; FALLBACK28-NEXT:    addl %edx, %edx
; FALLBACK28-NEXT:    movb %ch, %cl
; FALLBACK28-NEXT:    shll %cl, %edx
; FALLBACK28-NEXT:    orl %edi, %edx
; FALLBACK28-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK28-NEXT:    movl 76(%esp,%esi), %edx
; FALLBACK28-NEXT:    movl %edx, %ebp
; FALLBACK28-NEXT:    movb %al, %cl
; FALLBACK28-NEXT:    shrl %cl, %ebp
; FALLBACK28-NEXT:    movl 80(%esp,%esi), %edi
; FALLBACK28-NEXT:    leal (%edi,%edi), %ebx
; FALLBACK28-NEXT:    movb %ch, %cl
; FALLBACK28-NEXT:    shll %cl, %ebx
; FALLBACK28-NEXT:    orl %ebp, %ebx
; FALLBACK28-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK28-NEXT:    movb %al, %cl
; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
; FALLBACK28-NEXT:    shrl %cl, %ebx
; FALLBACK28-NEXT:    addl %edx, %edx
; FALLBACK28-NEXT:    movb %ch, %cl
; FALLBACK28-NEXT:    shll %cl, %edx
; FALLBACK28-NEXT:    orl %ebx, %edx
; FALLBACK28-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK28-NEXT:    movl 84(%esp,%esi), %ebx
; FALLBACK28-NEXT:    movl %ebx, %ebp
; FALLBACK28-NEXT:    movl %eax, %edx
; FALLBACK28-NEXT:    movb %dl, %cl
; FALLBACK28-NEXT:    shrl %cl, %ebp
; FALLBACK28-NEXT:    movl 88(%esp,%esi), %eax
; FALLBACK28-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK28-NEXT:    addl %eax, %eax
; FALLBACK28-NEXT:    movb %ch, %cl
; FALLBACK28-NEXT:    shll %cl, %eax
; FALLBACK28-NEXT:    orl %ebp, %eax
; FALLBACK28-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK28-NEXT:    movb %dl, %cl
; FALLBACK28-NEXT:    shrl %cl, %edi
; FALLBACK28-NEXT:    addl %ebx, %ebx
; FALLBACK28-NEXT:    movb %ch, %cl
; FALLBACK28-NEXT:    movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
; FALLBACK28-NEXT:    shll %cl, %ebx
; FALLBACK28-NEXT:    orl %edi, %ebx
; FALLBACK28-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK28-NEXT:    movl 92(%esp,%esi), %ebx
; FALLBACK28-NEXT:    movl %ebx, %ebp
; FALLBACK28-NEXT:    movb %dl, %cl
; FALLBACK28-NEXT:    shrl %cl, %ebp
; FALLBACK28-NEXT:    movl 96(%esp,%esi), %edi
; FALLBACK28-NEXT:    leal (%edi,%edi), %eax
; FALLBACK28-NEXT:    movb %ch, %cl
; FALLBACK28-NEXT:    shll %cl, %eax
; FALLBACK28-NEXT:    orl %ebp, %eax
; FALLBACK28-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK28-NEXT:    movb %dl, %cl
; FALLBACK28-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK28-NEXT:    shrl %cl, %eax
; FALLBACK28-NEXT:    addl %ebx, %ebx
; FALLBACK28-NEXT:    movb %ch, %cl
; FALLBACK28-NEXT:    shll %cl, %ebx
; FALLBACK28-NEXT:    orl %eax, %ebx
; FALLBACK28-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK28-NEXT:    movl 100(%esp,%esi), %ebx
; FALLBACK28-NEXT:    movl %ebx, %ebp
; FALLBACK28-NEXT:    movb %dl, %cl
; FALLBACK28-NEXT:    shrl %cl, %ebp
; FALLBACK28-NEXT:    movl 104(%esp,%esi), %edx
; FALLBACK28-NEXT:    leal (%edx,%edx), %eax
; FALLBACK28-NEXT:    movb %ch, %cl
; FALLBACK28-NEXT:    shll %cl, %eax
; FALLBACK28-NEXT:    orl %ebp, %eax
; FALLBACK28-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK28-NEXT:    movb %al, %cl
; FALLBACK28-NEXT:    shrl %cl, %edi
; FALLBACK28-NEXT:    addl %ebx, %ebx
; FALLBACK28-NEXT:    movb %ch, %cl
; FALLBACK28-NEXT:    shll %cl, %ebx
; FALLBACK28-NEXT:    orl %edi, %ebx
; FALLBACK28-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK28-NEXT:    movl 108(%esp,%esi), %edi
; FALLBACK28-NEXT:    movl %edi, %ebp
; FALLBACK28-NEXT:    movl %eax, %ecx
; FALLBACK28-NEXT:    shrl %cl, %ebp
; FALLBACK28-NEXT:    movl 112(%esp,%esi), %ecx
; FALLBACK28-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK28-NEXT:    leal (%ecx,%ecx), %ebx
; FALLBACK28-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
; FALLBACK28-NEXT:    movb %ch, %cl
; FALLBACK28-NEXT:    shll %cl, %ebx
; FALLBACK28-NEXT:    orl %ebp, %ebx
; FALLBACK28-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK28-NEXT:    movb %al, %cl
; FALLBACK28-NEXT:    shrl %cl, %edx
; FALLBACK28-NEXT:    addl %edi, %edi
; FALLBACK28-NEXT:    movb %ch, %cl
; FALLBACK28-NEXT:    shll %cl, %edi
; FALLBACK28-NEXT:    orl %edx, %edi
; FALLBACK28-NEXT:    movl %esi, %edx
; FALLBACK28-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK28-NEXT:    movl 116(%esp,%esi), %esi
; FALLBACK28-NEXT:    movl %esi, %ebx
; FALLBACK28-NEXT:    movb %al, %cl
; FALLBACK28-NEXT:    shrl %cl, %ebx
; FALLBACK28-NEXT:    movl 120(%esp,%edx), %eax
; FALLBACK28-NEXT:    leal (%eax,%eax), %ebp
; FALLBACK28-NEXT:    movb %ch, %cl
; FALLBACK28-NEXT:    shll %cl, %ebp
; FALLBACK28-NEXT:    orl %ebx, %ebp
; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; FALLBACK28-NEXT:    movb %dl, %cl
; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
; FALLBACK28-NEXT:    shrl %cl, %ebx
; FALLBACK28-NEXT:    addl %esi, %esi
; FALLBACK28-NEXT:    movb %ch, %cl
; FALLBACK28-NEXT:    shll %cl, %esi
; FALLBACK28-NEXT:    orl %ebx, %esi
; FALLBACK28-NEXT:    movb %dl, %cl
; FALLBACK28-NEXT:    shrl %cl, %eax
; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; FALLBACK28-NEXT:    movl 124(%esp,%edx), %ebx
; FALLBACK28-NEXT:    leal (%ebx,%ebx), %edx
; FALLBACK28-NEXT:    movb %ch, %cl
; FALLBACK28-NEXT:    shll %cl, %edx
; FALLBACK28-NEXT:    orl %eax, %edx
; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK28-NEXT:    # kill: def $cl killed $cl killed $ecx
; FALLBACK28-NEXT:    sarl %cl, %ebx
; FALLBACK28-NEXT:    movl {{[0-9]+}}(%esp), %eax
; FALLBACK28-NEXT:    movl %ebx, 60(%eax)
; FALLBACK28-NEXT:    movl %edx, 56(%eax)
; FALLBACK28-NEXT:    movl %esi, 48(%eax)
; FALLBACK28-NEXT:    movl %ebp, 52(%eax)
; FALLBACK28-NEXT:    movl %edi, 40(%eax)
; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK28-NEXT:    movl %ecx, 44(%eax)
; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK28-NEXT:    movl %ecx, 32(%eax)
; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK28-NEXT:    movl %ecx, 36(%eax)
; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK28-NEXT:    movl %ecx, 24(%eax)
; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK28-NEXT:    movl %ecx, 28(%eax)
; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK28-NEXT:    movl %ecx, 16(%eax)
; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK28-NEXT:    movl %ecx, 20(%eax)
; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK28-NEXT:    movl %ecx, 8(%eax)
; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK28-NEXT:    movl %ecx, 12(%eax)
; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK28-NEXT:    movl %ecx, (%eax)
; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK28-NEXT:    movl %ecx, 4(%eax)
; FALLBACK28-NEXT:    addl $204, %esp
; FALLBACK28-NEXT:    popl %esi
; FALLBACK28-NEXT:    popl %edi
; FALLBACK28-NEXT:    popl %ebx
; FALLBACK28-NEXT:    popl %ebp
; FALLBACK28-NEXT:    vzeroupper
; FALLBACK28-NEXT:    retl
;
; FALLBACK29-LABEL: ashr_64bytes:
; FALLBACK29:       # %bb.0:
; FALLBACK29-NEXT:    pushl %ebp
; FALLBACK29-NEXT:    pushl %ebx
; FALLBACK29-NEXT:    pushl %edi
; FALLBACK29-NEXT:    pushl %esi
; FALLBACK29-NEXT:    subl $188, %esp
; FALLBACK29-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; FALLBACK29-NEXT:    movl {{[0-9]+}}(%esp), %eax
; FALLBACK29-NEXT:    vmovups (%eax), %ymm0
; FALLBACK29-NEXT:    vmovups 32(%eax), %xmm1
; FALLBACK29-NEXT:    movl 48(%eax), %edx
; FALLBACK29-NEXT:    movl 52(%eax), %esi
; FALLBACK29-NEXT:    movl 56(%eax), %edi
; FALLBACK29-NEXT:    movl 60(%eax), %eax
; FALLBACK29-NEXT:    movl (%ecx), %ecx
; FALLBACK29-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK29-NEXT:    movl %edi, {{[0-9]+}}(%esp)
; FALLBACK29-NEXT:    movl %esi, {{[0-9]+}}(%esp)
; FALLBACK29-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK29-NEXT:    vmovaps %xmm1, {{[0-9]+}}(%esp)
; FALLBACK29-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
; FALLBACK29-NEXT:    sarl $31, %eax
; FALLBACK29-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK29-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK29-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK29-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK29-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK29-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK29-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK29-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK29-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK29-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK29-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK29-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK29-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK29-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK29-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK29-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK29-NEXT:    movl %ecx, %ebp
; FALLBACK29-NEXT:    andl $60, %ebp
; FALLBACK29-NEXT:    movl 56(%esp,%ebp), %edx
; FALLBACK29-NEXT:    movl 52(%esp,%ebp), %eax
; FALLBACK29-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK29-NEXT:    shll $3, %ecx
; FALLBACK29-NEXT:    andl $24, %ecx
; FALLBACK29-NEXT:    shrdl %cl, %edx, %eax
; FALLBACK29-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK29-NEXT:    movl 64(%esp,%ebp), %edi
; FALLBACK29-NEXT:    movl 60(%esp,%ebp), %eax
; FALLBACK29-NEXT:    movl %eax, %esi
; FALLBACK29-NEXT:    shrdl %cl, %edi, %esi
; FALLBACK29-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK29-NEXT:    shrdl %cl, %eax, %edx
; FALLBACK29-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK29-NEXT:    movl 72(%esp,%ebp), %esi
; FALLBACK29-NEXT:    movl 68(%esp,%ebp), %eax
; FALLBACK29-NEXT:    movl %eax, %edx
; FALLBACK29-NEXT:    shrdl %cl, %esi, %edx
; FALLBACK29-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK29-NEXT:    shrdl %cl, %eax, %edi
; FALLBACK29-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK29-NEXT:    movl 80(%esp,%ebp), %edi
; FALLBACK29-NEXT:    movl 76(%esp,%ebp), %eax
; FALLBACK29-NEXT:    movl %eax, %edx
; FALLBACK29-NEXT:    shrdl %cl, %edi, %edx
; FALLBACK29-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK29-NEXT:    shrdl %cl, %eax, %esi
; FALLBACK29-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK29-NEXT:    movl 88(%esp,%ebp), %esi
; FALLBACK29-NEXT:    movl 84(%esp,%ebp), %eax
; FALLBACK29-NEXT:    movl %eax, %edx
; FALLBACK29-NEXT:    shrdl %cl, %esi, %edx
; FALLBACK29-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK29-NEXT:    movl %esi, %edx
; FALLBACK29-NEXT:    shrdl %cl, %eax, %edi
; FALLBACK29-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK29-NEXT:    movl 96(%esp,%ebp), %esi
; FALLBACK29-NEXT:    movl 92(%esp,%ebp), %eax
; FALLBACK29-NEXT:    movl %eax, %edi
; FALLBACK29-NEXT:    shrdl %cl, %esi, %edi
; FALLBACK29-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK29-NEXT:    shrdl %cl, %eax, %edx
; FALLBACK29-NEXT:    movl %edx, (%esp) # 4-byte Spill
; FALLBACK29-NEXT:    movl 104(%esp,%ebp), %edx
; FALLBACK29-NEXT:    movl 100(%esp,%ebp), %eax
; FALLBACK29-NEXT:    movl %eax, %edi
; FALLBACK29-NEXT:    shrdl %cl, %edx, %edi
; FALLBACK29-NEXT:    shrdl %cl, %eax, %esi
; FALLBACK29-NEXT:    movl 48(%esp,%ebp), %ebx
; FALLBACK29-NEXT:    movl 108(%esp,%ebp), %eax
; FALLBACK29-NEXT:    shrdl %cl, %eax, %edx
; FALLBACK29-NEXT:    movl {{[0-9]+}}(%esp), %ebp
; FALLBACK29-NEXT:    movl %edx, 56(%ebp)
; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; FALLBACK29-NEXT:    shrdl %cl, %edx, %ebx
; FALLBACK29-NEXT:    # kill: def $cl killed $cl killed $ecx
; FALLBACK29-NEXT:    sarl %cl, %eax
; FALLBACK29-NEXT:    movl %eax, 60(%ebp)
; FALLBACK29-NEXT:    movl %esi, 48(%ebp)
; FALLBACK29-NEXT:    movl %edi, 52(%ebp)
; FALLBACK29-NEXT:    movl (%esp), %eax # 4-byte Reload
; FALLBACK29-NEXT:    movl %eax, 40(%ebp)
; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK29-NEXT:    movl %eax, 44(%ebp)
; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK29-NEXT:    movl %eax, 32(%ebp)
; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK29-NEXT:    movl %eax, 36(%ebp)
; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK29-NEXT:    movl %eax, 24(%ebp)
; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK29-NEXT:    movl %eax, 28(%ebp)
; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK29-NEXT:    movl %eax, 16(%ebp)
; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK29-NEXT:    movl %eax, 20(%ebp)
; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK29-NEXT:    movl %eax, 8(%ebp)
; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK29-NEXT:    movl %eax, 12(%ebp)
; FALLBACK29-NEXT:    movl %ebx, (%ebp)
; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK29-NEXT:    movl %eax, 4(%ebp)
; FALLBACK29-NEXT:    addl $188, %esp
; FALLBACK29-NEXT:    popl %esi
; FALLBACK29-NEXT:    popl %edi
; FALLBACK29-NEXT:    popl %ebx
; FALLBACK29-NEXT:    popl %ebp
; FALLBACK29-NEXT:    vzeroupper
; FALLBACK29-NEXT:    retl
;
; FALLBACK30-LABEL: ashr_64bytes:
; FALLBACK30:       # %bb.0:
; FALLBACK30-NEXT:    pushl %ebp
; FALLBACK30-NEXT:    pushl %ebx
; FALLBACK30-NEXT:    pushl %edi
; FALLBACK30-NEXT:    pushl %esi
; FALLBACK30-NEXT:    subl $204, %esp
; FALLBACK30-NEXT:    movl {{[0-9]+}}(%esp), %eax
; FALLBACK30-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; FALLBACK30-NEXT:    vmovups (%ecx), %ymm0
; FALLBACK30-NEXT:    vmovups 32(%ecx), %xmm1
; FALLBACK30-NEXT:    movl 48(%ecx), %edx
; FALLBACK30-NEXT:    movl 52(%ecx), %esi
; FALLBACK30-NEXT:    movl 56(%ecx), %edi
; FALLBACK30-NEXT:    movl 60(%ecx), %ecx
; FALLBACK30-NEXT:    movl (%eax), %eax
; FALLBACK30-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK30-NEXT:    movl %edi, {{[0-9]+}}(%esp)
; FALLBACK30-NEXT:    movl %esi, {{[0-9]+}}(%esp)
; FALLBACK30-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK30-NEXT:    vmovaps %xmm1, {{[0-9]+}}(%esp)
; FALLBACK30-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
; FALLBACK30-NEXT:    sarl $31, %ecx
; FALLBACK30-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK30-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK30-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK30-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK30-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK30-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK30-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK30-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK30-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK30-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK30-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK30-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK30-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK30-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK30-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK30-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; FALLBACK30-NEXT:    movl %eax, %ecx
; FALLBACK30-NEXT:    leal (,%eax,8), %edx
; FALLBACK30-NEXT:    andl $24, %edx
; FALLBACK30-NEXT:    andl $60, %ecx
; FALLBACK30-NEXT:    movl 68(%esp,%ecx), %esi
; FALLBACK30-NEXT:    movl 72(%esp,%ecx), %edi
; FALLBACK30-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK30-NEXT:    shrxl %edx, %esi, %eax
; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK30-NEXT:    movl %edx, %ebx
; FALLBACK30-NEXT:    notb %bl
; FALLBACK30-NEXT:    leal (%edi,%edi), %ebp
; FALLBACK30-NEXT:    shlxl %ebx, %ebp, %eax
; FALLBACK30-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK30-NEXT:    shrxl %edx, 64(%esp,%ecx), %edi
; FALLBACK30-NEXT:    addl %esi, %esi
; FALLBACK30-NEXT:    shlxl %ebx, %esi, %eax
; FALLBACK30-NEXT:    orl %edi, %eax
; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK30-NEXT:    movl 80(%esp,%ecx), %esi
; FALLBACK30-NEXT:    leal (%esi,%esi), %edi
; FALLBACK30-NEXT:    shlxl %ebx, %edi, %eax
; FALLBACK30-NEXT:    movl 76(%esp,%ecx), %edi
; FALLBACK30-NEXT:    shrxl %edx, %edi, %ebp
; FALLBACK30-NEXT:    orl %ebp, %eax
; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK30-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; FALLBACK30-NEXT:    addl %edi, %edi
; FALLBACK30-NEXT:    shlxl %ebx, %edi, %edi
; FALLBACK30-NEXT:    orl %eax, %edi
; FALLBACK30-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK30-NEXT:    movl 88(%esp,%ecx), %eax
; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK30-NEXT:    leal (%eax,%eax), %edi
; FALLBACK30-NEXT:    shlxl %ebx, %edi, %eax
; FALLBACK30-NEXT:    movl 84(%esp,%ecx), %edi
; FALLBACK30-NEXT:    shrxl %edx, %edi, %ebp
; FALLBACK30-NEXT:    orl %ebp, %eax
; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK30-NEXT:    shrxl %edx, %esi, %esi
; FALLBACK30-NEXT:    addl %edi, %edi
; FALLBACK30-NEXT:    shlxl %ebx, %edi, %eax
; FALLBACK30-NEXT:    orl %esi, %eax
; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK30-NEXT:    movl 96(%esp,%ecx), %esi
; FALLBACK30-NEXT:    leal (%esi,%esi), %edi
; FALLBACK30-NEXT:    shlxl %ebx, %edi, %eax
; FALLBACK30-NEXT:    movl 92(%esp,%ecx), %edi
; FALLBACK30-NEXT:    shrxl %edx, %edi, %ebp
; FALLBACK30-NEXT:    orl %ebp, %eax
; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK30-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; FALLBACK30-NEXT:    addl %edi, %edi
; FALLBACK30-NEXT:    shlxl %ebx, %edi, %edi
; FALLBACK30-NEXT:    orl %eax, %edi
; FALLBACK30-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK30-NEXT:    movl 104(%esp,%ecx), %eax
; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK30-NEXT:    leal (%eax,%eax), %edi
; FALLBACK30-NEXT:    shlxl %ebx, %edi, %eax
; FALLBACK30-NEXT:    movl 100(%esp,%ecx), %edi
; FALLBACK30-NEXT:    shrxl %edx, %edi, %ebp
; FALLBACK30-NEXT:    orl %ebp, %eax
; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK30-NEXT:    shrxl %edx, %esi, %esi
; FALLBACK30-NEXT:    addl %edi, %edi
; FALLBACK30-NEXT:    shlxl %ebx, %edi, %eax
; FALLBACK30-NEXT:    orl %esi, %eax
; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK30-NEXT:    movl 112(%esp,%ecx), %eax
; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK30-NEXT:    leal (%eax,%eax), %esi
; FALLBACK30-NEXT:    shlxl %ebx, %esi, %eax
; FALLBACK30-NEXT:    movl 108(%esp,%ecx), %esi
; FALLBACK30-NEXT:    movl %ecx, %edi
; FALLBACK30-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK30-NEXT:    shrxl %edx, %esi, %ebp
; FALLBACK30-NEXT:    orl %ebp, %eax
; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK30-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
; FALLBACK30-NEXT:    addl %esi, %esi
; FALLBACK30-NEXT:    shlxl %ebx, %esi, %esi
; FALLBACK30-NEXT:    orl %ecx, %esi
; FALLBACK30-NEXT:    movl 120(%esp,%edi), %ebp
; FALLBACK30-NEXT:    leal (%ebp,%ebp), %ecx
; FALLBACK30-NEXT:    shlxl %ebx, %ecx, %ecx
; FALLBACK30-NEXT:    movl 116(%esp,%edi), %eax
; FALLBACK30-NEXT:    shrxl %edx, %eax, %edi
; FALLBACK30-NEXT:    orl %edi, %ecx
; FALLBACK30-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
; FALLBACK30-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK30-NEXT:    addl %eax, %eax
; FALLBACK30-NEXT:    shlxl %ebx, %eax, %edi
; FALLBACK30-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
; FALLBACK30-NEXT:    shrxl %edx, %ebp, %eax
; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
; FALLBACK30-NEXT:    movl 124(%esp,%ebp), %ebp
; FALLBACK30-NEXT:    sarxl %edx, %ebp, %edx
; FALLBACK30-NEXT:    addl %ebp, %ebp
; FALLBACK30-NEXT:    shlxl %ebx, %ebp, %ebx
; FALLBACK30-NEXT:    orl %eax, %ebx
; FALLBACK30-NEXT:    movl {{[0-9]+}}(%esp), %eax
; FALLBACK30-NEXT:    movl %edx, 60(%eax)
; FALLBACK30-NEXT:    movl %ebx, 56(%eax)
; FALLBACK30-NEXT:    movl %edi, 48(%eax)
; FALLBACK30-NEXT:    movl %ecx, 52(%eax)
; FALLBACK30-NEXT:    movl %esi, 40(%eax)
; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK30-NEXT:    movl %ecx, 44(%eax)
; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK30-NEXT:    movl %ecx, 32(%eax)
; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK30-NEXT:    movl %ecx, 36(%eax)
; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK30-NEXT:    movl %ecx, 24(%eax)
; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK30-NEXT:    movl %ecx, 28(%eax)
; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK30-NEXT:    movl %ecx, 16(%eax)
; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK30-NEXT:    movl %ecx, 20(%eax)
; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK30-NEXT:    movl %ecx, 8(%eax)
; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK30-NEXT:    movl %ecx, 12(%eax)
; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK30-NEXT:    movl %ecx, (%eax)
; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK30-NEXT:    movl %ecx, 4(%eax)
; FALLBACK30-NEXT:    addl $204, %esp
; FALLBACK30-NEXT:    popl %esi
; FALLBACK30-NEXT:    popl %edi
; FALLBACK30-NEXT:    popl %ebx
; FALLBACK30-NEXT:    popl %ebp
; FALLBACK30-NEXT:    vzeroupper
; FALLBACK30-NEXT:    retl
;
; FALLBACK31-LABEL: ashr_64bytes:
; FALLBACK31:       # %bb.0:
; FALLBACK31-NEXT:    pushl %ebp
; FALLBACK31-NEXT:    pushl %ebx
; FALLBACK31-NEXT:    pushl %edi
; FALLBACK31-NEXT:    pushl %esi
; FALLBACK31-NEXT:    subl $188, %esp
; FALLBACK31-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; FALLBACK31-NEXT:    movl {{[0-9]+}}(%esp), %eax
; FALLBACK31-NEXT:    vmovups (%eax), %ymm0
; FALLBACK31-NEXT:    vmovups 32(%eax), %xmm1
; FALLBACK31-NEXT:    movl 48(%eax), %edx
; FALLBACK31-NEXT:    movl 52(%eax), %esi
; FALLBACK31-NEXT:    movl 56(%eax), %edi
; FALLBACK31-NEXT:    movl 60(%eax), %eax
; FALLBACK31-NEXT:    movl (%ecx), %ecx
; FALLBACK31-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK31-NEXT:    movl %edi, {{[0-9]+}}(%esp)
; FALLBACK31-NEXT:    movl %esi, {{[0-9]+}}(%esp)
; FALLBACK31-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; FALLBACK31-NEXT:    vmovaps %xmm1, {{[0-9]+}}(%esp)
; FALLBACK31-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
; FALLBACK31-NEXT:    sarl $31, %eax
; FALLBACK31-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK31-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK31-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK31-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK31-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK31-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK31-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK31-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK31-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK31-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK31-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK31-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK31-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK31-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK31-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK31-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; FALLBACK31-NEXT:    movl %ecx, %ebp
; FALLBACK31-NEXT:    andl $60, %ebp
; FALLBACK31-NEXT:    movl 56(%esp,%ebp), %edx
; FALLBACK31-NEXT:    movl 52(%esp,%ebp), %eax
; FALLBACK31-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK31-NEXT:    shll $3, %ecx
; FALLBACK31-NEXT:    andl $24, %ecx
; FALLBACK31-NEXT:    shrdl %cl, %edx, %eax
; FALLBACK31-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK31-NEXT:    movl 64(%esp,%ebp), %edi
; FALLBACK31-NEXT:    movl 60(%esp,%ebp), %eax
; FALLBACK31-NEXT:    movl %eax, %esi
; FALLBACK31-NEXT:    shrdl %cl, %edi, %esi
; FALLBACK31-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK31-NEXT:    shrdl %cl, %eax, %edx
; FALLBACK31-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK31-NEXT:    movl 72(%esp,%ebp), %esi
; FALLBACK31-NEXT:    movl 68(%esp,%ebp), %eax
; FALLBACK31-NEXT:    movl %eax, %edx
; FALLBACK31-NEXT:    shrdl %cl, %esi, %edx
; FALLBACK31-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK31-NEXT:    shrdl %cl, %eax, %edi
; FALLBACK31-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK31-NEXT:    movl 80(%esp,%ebp), %edi
; FALLBACK31-NEXT:    movl 76(%esp,%ebp), %eax
; FALLBACK31-NEXT:    movl %eax, %edx
; FALLBACK31-NEXT:    shrdl %cl, %edi, %edx
; FALLBACK31-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK31-NEXT:    shrdl %cl, %eax, %esi
; FALLBACK31-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK31-NEXT:    movl 88(%esp,%ebp), %ebx
; FALLBACK31-NEXT:    movl 84(%esp,%ebp), %eax
; FALLBACK31-NEXT:    movl %eax, %edx
; FALLBACK31-NEXT:    shrdl %cl, %ebx, %edx
; FALLBACK31-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK31-NEXT:    shrdl %cl, %eax, %edi
; FALLBACK31-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK31-NEXT:    movl 96(%esp,%ebp), %esi
; FALLBACK31-NEXT:    movl 92(%esp,%ebp), %eax
; FALLBACK31-NEXT:    movl %eax, %edx
; FALLBACK31-NEXT:    shrdl %cl, %esi, %edx
; FALLBACK31-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK31-NEXT:    shrdl %cl, %eax, %ebx
; FALLBACK31-NEXT:    movl 104(%esp,%ebp), %eax
; FALLBACK31-NEXT:    movl 100(%esp,%ebp), %edi
; FALLBACK31-NEXT:    movl %edi, %edx
; FALLBACK31-NEXT:    shrdl %cl, %eax, %edx
; FALLBACK31-NEXT:    shrdl %cl, %edi, %esi
; FALLBACK31-NEXT:    movl 48(%esp,%ebp), %edi
; FALLBACK31-NEXT:    movl 108(%esp,%ebp), %ebp
; FALLBACK31-NEXT:    movl %ebp, (%esp) # 4-byte Spill
; FALLBACK31-NEXT:    shrdl %cl, %ebp, %eax
; FALLBACK31-NEXT:    movl {{[0-9]+}}(%esp), %ebp
; FALLBACK31-NEXT:    movl %eax, 56(%ebp)
; FALLBACK31-NEXT:    movl %esi, 48(%ebp)
; FALLBACK31-NEXT:    movl %edx, 52(%ebp)
; FALLBACK31-NEXT:    movl %ebx, 40(%ebp)
; FALLBACK31-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK31-NEXT:    movl %eax, 44(%ebp)
; FALLBACK31-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK31-NEXT:    movl %eax, 32(%ebp)
; FALLBACK31-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK31-NEXT:    movl %eax, 36(%ebp)
; FALLBACK31-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK31-NEXT:    movl %eax, 24(%ebp)
; FALLBACK31-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK31-NEXT:    movl %eax, 28(%ebp)
; FALLBACK31-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK31-NEXT:    movl %eax, 16(%ebp)
; FALLBACK31-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK31-NEXT:    movl %eax, 20(%ebp)
; FALLBACK31-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK31-NEXT:    movl %eax, 8(%ebp)
; FALLBACK31-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK31-NEXT:    movl %eax, 12(%ebp)
; FALLBACK31-NEXT:    sarxl %ecx, (%esp), %eax # 4-byte Folded Reload
; FALLBACK31-NEXT:    # kill: def $cl killed $cl killed $ecx
; FALLBACK31-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; FALLBACK31-NEXT:    shrdl %cl, %edx, %edi
; FALLBACK31-NEXT:    movl %edi, (%ebp)
; FALLBACK31-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; FALLBACK31-NEXT:    movl %ecx, 4(%ebp)
; FALLBACK31-NEXT:    movl %eax, 60(%ebp)
; FALLBACK31-NEXT:    addl $188, %esp
; FALLBACK31-NEXT:    popl %esi
; FALLBACK31-NEXT:    popl %edi
; FALLBACK31-NEXT:    popl %ebx
; FALLBACK31-NEXT:    popl %ebp
; FALLBACK31-NEXT:    vzeroupper
; FALLBACK31-NEXT:    retl
  %src = load i512, ptr %src.ptr, align 1
  %byteOff = load i512, ptr %byteOff.ptr, align 1
  %bitOff = shl i512 %byteOff, 3
  %res = ashr i512 %src, %bitOff
  store i512 %res, ptr %dst, align 1
  ret void
}

; Arithmetic right shift of a 512-bit (64-byte) value by a run-time amount
; expressed in 8-byte (qword) units: the i512 loaded from %qwordOff.ptr is
; scaled by 64 (shl by 6) to form the bit count for the ashr, and the result
; is stored to %dst. Both loads and the store use align 1.
; The expected code for every configuration below writes the source to the
; stack, stores 64 further bytes of the sign word (sar by 63 or 31), masks the
; offset with 7 and reloads the result through an index scaled by 8; no
; per-word shift/funnel-shift instructions are expected.
define void @ashr_64bytes_qwordOff(ptr %src.ptr, ptr %qwordOff.ptr, ptr %dst) nounwind {
; X64-SSE2-LABEL: ashr_64bytes_qwordOff:
; X64-SSE2:       # %bb.0:
; X64-SSE2-NEXT:    pushq %rbx
; X64-SSE2-NEXT:    movq (%rdi), %rax
; X64-SSE2-NEXT:    movq 8(%rdi), %rcx
; X64-SSE2-NEXT:    movq 16(%rdi), %r8
; X64-SSE2-NEXT:    movq 24(%rdi), %r9
; X64-SSE2-NEXT:    movq 32(%rdi), %r10
; X64-SSE2-NEXT:    movq 40(%rdi), %r11
; X64-SSE2-NEXT:    movq 48(%rdi), %rbx
; X64-SSE2-NEXT:    movq 56(%rdi), %rdi
; X64-SSE2-NEXT:    movl (%rsi), %esi
; X64-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; X64-SSE2-NEXT:    movq %rbx, -{{[0-9]+}}(%rsp)
; X64-SSE2-NEXT:    movq %r11, -{{[0-9]+}}(%rsp)
; X64-SSE2-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
; X64-SSE2-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
; X64-SSE2-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
; X64-SSE2-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
; X64-SSE2-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
; X64-SSE2-NEXT:    sarq $63, %rdi
; X64-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; X64-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; X64-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; X64-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; X64-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; X64-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; X64-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; X64-SSE2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
; X64-SSE2-NEXT:    andl $7, %esi
; X64-SSE2-NEXT:    movq -128(%rsp,%rsi,8), %rax
; X64-SSE2-NEXT:    movq -120(%rsp,%rsi,8), %rcx
; X64-SSE2-NEXT:    movq -104(%rsp,%rsi,8), %rdi
; X64-SSE2-NEXT:    movq -112(%rsp,%rsi,8), %r8
; X64-SSE2-NEXT:    movq -88(%rsp,%rsi,8), %r9
; X64-SSE2-NEXT:    movq -96(%rsp,%rsi,8), %r10
; X64-SSE2-NEXT:    movq -72(%rsp,%rsi,8), %r11
; X64-SSE2-NEXT:    movq -80(%rsp,%rsi,8), %rsi
; X64-SSE2-NEXT:    movq %rsi, 48(%rdx)
; X64-SSE2-NEXT:    movq %r11, 56(%rdx)
; X64-SSE2-NEXT:    movq %r10, 32(%rdx)
; X64-SSE2-NEXT:    movq %r9, 40(%rdx)
; X64-SSE2-NEXT:    movq %r8, 16(%rdx)
; X64-SSE2-NEXT:    movq %rdi, 24(%rdx)
; X64-SSE2-NEXT:    movq %rax, (%rdx)
; X64-SSE2-NEXT:    movq %rcx, 8(%rdx)
; X64-SSE2-NEXT:    popq %rbx
; X64-SSE2-NEXT:    retq
;
; X64-SSE42-LABEL: ashr_64bytes_qwordOff:
; X64-SSE42:       # %bb.0:
; X64-SSE42-NEXT:    pushq %rax
; X64-SSE42-NEXT:    movups (%rdi), %xmm0
; X64-SSE42-NEXT:    movups 16(%rdi), %xmm1
; X64-SSE42-NEXT:    movups 32(%rdi), %xmm2
; X64-SSE42-NEXT:    movq 48(%rdi), %rax
; X64-SSE42-NEXT:    movq 56(%rdi), %rcx
; X64-SSE42-NEXT:    movl (%rsi), %esi
; X64-SSE42-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
; X64-SSE42-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
; X64-SSE42-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
; X64-SSE42-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
; X64-SSE42-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-SSE42-NEXT:    sarq $63, %rcx
; X64-SSE42-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
; X64-SSE42-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
; X64-SSE42-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
; X64-SSE42-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
; X64-SSE42-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
; X64-SSE42-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
; X64-SSE42-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
; X64-SSE42-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
; X64-SSE42-NEXT:    andl $7, %esi
; X64-SSE42-NEXT:    movups -128(%rsp,%rsi,8), %xmm0
; X64-SSE42-NEXT:    movups -112(%rsp,%rsi,8), %xmm1
; X64-SSE42-NEXT:    movups -96(%rsp,%rsi,8), %xmm2
; X64-SSE42-NEXT:    movups -80(%rsp,%rsi,8), %xmm3
; X64-SSE42-NEXT:    movups %xmm3, 48(%rdx)
; X64-SSE42-NEXT:    movups %xmm1, 16(%rdx)
; X64-SSE42-NEXT:    movups %xmm2, 32(%rdx)
; X64-SSE42-NEXT:    movups %xmm0, (%rdx)
; X64-SSE42-NEXT:    popq %rax
; X64-SSE42-NEXT:    retq
;
; X64-AVX-LABEL: ashr_64bytes_qwordOff:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    pushq %rax
; X64-AVX-NEXT:    vmovups (%rdi), %ymm0
; X64-AVX-NEXT:    vmovups 32(%rdi), %xmm1
; X64-AVX-NEXT:    movq 48(%rdi), %rax
; X64-AVX-NEXT:    movq 56(%rdi), %rcx
; X64-AVX-NEXT:    movl (%rsi), %esi
; X64-AVX-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
; X64-AVX-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
; X64-AVX-NEXT:    vmovaps %xmm1, -{{[0-9]+}}(%rsp)
; X64-AVX-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
; X64-AVX-NEXT:    sarq $63, %rcx
; X64-AVX-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
; X64-AVX-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
; X64-AVX-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
; X64-AVX-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
; X64-AVX-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
; X64-AVX-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
; X64-AVX-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
; X64-AVX-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
; X64-AVX-NEXT:    andl $7, %esi
; X64-AVX-NEXT:    vmovups -128(%rsp,%rsi,8), %xmm0
; X64-AVX-NEXT:    vmovups -112(%rsp,%rsi,8), %xmm1
; X64-AVX-NEXT:    vmovups -96(%rsp,%rsi,8), %xmm2
; X64-AVX-NEXT:    vmovups -80(%rsp,%rsi,8), %xmm3
; X64-AVX-NEXT:    vmovups %xmm3, 48(%rdx)
; X64-AVX-NEXT:    vmovups %xmm1, 16(%rdx)
; X64-AVX-NEXT:    vmovups %xmm2, 32(%rdx)
; X64-AVX-NEXT:    vmovups %xmm0, (%rdx)
; X64-AVX-NEXT:    popq %rax
; X64-AVX-NEXT:    vzeroupper
; X64-AVX-NEXT:    retq
;
; X86-SSE2-LABEL: ashr_64bytes_qwordOff:
; X86-SSE2:       # %bb.0:
; X86-SSE2-NEXT:    pushl %ebp
; X86-SSE2-NEXT:    pushl %ebx
; X86-SSE2-NEXT:    pushl %edi
; X86-SSE2-NEXT:    pushl %esi
; X86-SSE2-NEXT:    subl $188, %esp
; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE2-NEXT:    movl (%eax), %ecx
; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-SSE2-NEXT:    movl 4(%eax), %ecx
; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-SSE2-NEXT:    movl 8(%eax), %ecx
; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-SSE2-NEXT:    movl 12(%eax), %ecx
; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-SSE2-NEXT:    movl 16(%eax), %ecx
; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-SSE2-NEXT:    movl 20(%eax), %ecx
; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-SSE2-NEXT:    movl 24(%eax), %ecx
; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-SSE2-NEXT:    movl 28(%eax), %ecx
; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-SSE2-NEXT:    movl 32(%eax), %ecx
; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-SSE2-NEXT:    movl 36(%eax), %ecx
; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-SSE2-NEXT:    movl 40(%eax), %ebp
; X86-SSE2-NEXT:    movl 44(%eax), %ebx
; X86-SSE2-NEXT:    movl 48(%eax), %edi
; X86-SSE2-NEXT:    movl 52(%eax), %esi
; X86-SSE2-NEXT:    movl 56(%eax), %edx
; X86-SSE2-NEXT:    movl 60(%eax), %ecx
; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE2-NEXT:    movl (%eax), %eax
; X86-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT:    movl %esi, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT:    movl %edi, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X86-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X86-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X86-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X86-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X86-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X86-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X86-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X86-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X86-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X86-SSE2-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT:    sarl $31, %ecx
; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT:    andl $7, %eax
; X86-SSE2-NEXT:    movl 48(%esp,%eax,8), %ecx
; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-SSE2-NEXT:    movl 52(%esp,%eax,8), %ecx
; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-SSE2-NEXT:    movl 60(%esp,%eax,8), %ecx
; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-SSE2-NEXT:    movl 56(%esp,%eax,8), %ecx
; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-SSE2-NEXT:    movl 68(%esp,%eax,8), %ecx
; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-SSE2-NEXT:    movl 64(%esp,%eax,8), %ecx
; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-SSE2-NEXT:    movl 76(%esp,%eax,8), %ecx
; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-SSE2-NEXT:    movl 72(%esp,%eax,8), %ecx
; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-SSE2-NEXT:    movl 84(%esp,%eax,8), %ecx
; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-SSE2-NEXT:    movl 80(%esp,%eax,8), %ecx
; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-SSE2-NEXT:    movl 92(%esp,%eax,8), %ebp
; X86-SSE2-NEXT:    movl 88(%esp,%eax,8), %ebx
; X86-SSE2-NEXT:    movl 100(%esp,%eax,8), %edi
; X86-SSE2-NEXT:    movl 96(%esp,%eax,8), %esi
; X86-SSE2-NEXT:    movl 108(%esp,%eax,8), %edx
; X86-SSE2-NEXT:    movl 104(%esp,%eax,8), %ecx
; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE2-NEXT:    movl %ecx, 56(%eax)
; X86-SSE2-NEXT:    movl %edx, 60(%eax)
; X86-SSE2-NEXT:    movl %esi, 48(%eax)
; X86-SSE2-NEXT:    movl %edi, 52(%eax)
; X86-SSE2-NEXT:    movl %ebx, 40(%eax)
; X86-SSE2-NEXT:    movl %ebp, 44(%eax)
; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-SSE2-NEXT:    movl %ecx, 32(%eax)
; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-SSE2-NEXT:    movl %ecx, 36(%eax)
; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-SSE2-NEXT:    movl %ecx, 24(%eax)
; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-SSE2-NEXT:    movl %ecx, 28(%eax)
; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-SSE2-NEXT:    movl %ecx, 16(%eax)
; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-SSE2-NEXT:    movl %ecx, 20(%eax)
; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-SSE2-NEXT:    movl %ecx, 8(%eax)
; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-SSE2-NEXT:    movl %ecx, 12(%eax)
; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-SSE2-NEXT:    movl %ecx, (%eax)
; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-SSE2-NEXT:    movl %ecx, 4(%eax)
; X86-SSE2-NEXT:    addl $188, %esp
; X86-SSE2-NEXT:    popl %esi
; X86-SSE2-NEXT:    popl %edi
; X86-SSE2-NEXT:    popl %ebx
; X86-SSE2-NEXT:    popl %ebp
; X86-SSE2-NEXT:    retl
;
; X86-SSE42-LABEL: ashr_64bytes_qwordOff:
; X86-SSE42:       # %bb.0:
; X86-SSE42-NEXT:    pushl %ebx
; X86-SSE42-NEXT:    pushl %edi
; X86-SSE42-NEXT:    pushl %esi
; X86-SSE42-NEXT:    subl $128, %esp
; X86-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-SSE42-NEXT:    movups (%edx), %xmm0
; X86-SSE42-NEXT:    movups 16(%edx), %xmm1
; X86-SSE42-NEXT:    movups 32(%edx), %xmm2
; X86-SSE42-NEXT:    movl 48(%edx), %esi
; X86-SSE42-NEXT:    movl 52(%edx), %edi
; X86-SSE42-NEXT:    movl 56(%edx), %ebx
; X86-SSE42-NEXT:    movl 60(%edx), %edx
; X86-SSE42-NEXT:    movl (%ecx), %ecx
; X86-SSE42-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; X86-SSE42-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
; X86-SSE42-NEXT:    movl %edi, {{[0-9]+}}(%esp)
; X86-SSE42-NEXT:    movl %esi, {{[0-9]+}}(%esp)
; X86-SSE42-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
; X86-SSE42-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
; X86-SSE42-NEXT:    movaps %xmm0, (%esp)
; X86-SSE42-NEXT:    sarl $31, %edx
; X86-SSE42-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; X86-SSE42-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; X86-SSE42-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; X86-SSE42-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; X86-SSE42-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; X86-SSE42-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; X86-SSE42-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; X86-SSE42-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; X86-SSE42-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; X86-SSE42-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; X86-SSE42-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; X86-SSE42-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; X86-SSE42-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; X86-SSE42-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; X86-SSE42-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; X86-SSE42-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; X86-SSE42-NEXT:    andl $7, %ecx
; X86-SSE42-NEXT:    movups (%esp,%ecx,8), %xmm0
; X86-SSE42-NEXT:    movups 16(%esp,%ecx,8), %xmm1
; X86-SSE42-NEXT:    movups 32(%esp,%ecx,8), %xmm2
; X86-SSE42-NEXT:    movups 48(%esp,%ecx,8), %xmm3
; X86-SSE42-NEXT:    movups %xmm3, 48(%eax)
; X86-SSE42-NEXT:    movups %xmm2, 32(%eax)
; X86-SSE42-NEXT:    movups %xmm1, 16(%eax)
; X86-SSE42-NEXT:    movups %xmm0, (%eax)
; X86-SSE42-NEXT:    addl $128, %esp
; X86-SSE42-NEXT:    popl %esi
; X86-SSE42-NEXT:    popl %edi
; X86-SSE42-NEXT:    popl %ebx
; X86-SSE42-NEXT:    retl
;
; X86-AVX-LABEL: ashr_64bytes_qwordOff:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    pushl %ebx
; X86-AVX-NEXT:    pushl %edi
; X86-AVX-NEXT:    pushl %esi
; X86-AVX-NEXT:    subl $128, %esp
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-AVX-NEXT:    vmovups (%edx), %ymm0
; X86-AVX-NEXT:    vmovups 32(%edx), %xmm1
; X86-AVX-NEXT:    movl 48(%edx), %esi
; X86-AVX-NEXT:    movl 52(%edx), %edi
; X86-AVX-NEXT:    movl 56(%edx), %ebx
; X86-AVX-NEXT:    movl 60(%edx), %edx
; X86-AVX-NEXT:    movl (%ecx), %ecx
; X86-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; X86-AVX-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
; X86-AVX-NEXT:    movl %edi, {{[0-9]+}}(%esp)
; X86-AVX-NEXT:    movl %esi, {{[0-9]+}}(%esp)
; X86-AVX-NEXT:    vmovaps %xmm1, {{[0-9]+}}(%esp)
; X86-AVX-NEXT:    vmovups %ymm0, (%esp)
; X86-AVX-NEXT:    sarl $31, %edx
; X86-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; X86-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; X86-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; X86-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; X86-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; X86-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; X86-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; X86-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; X86-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; X86-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; X86-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; X86-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; X86-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; X86-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; X86-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; X86-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; X86-AVX-NEXT:    andl $7, %ecx
; X86-AVX-NEXT:    vmovups (%esp,%ecx,8), %xmm0
; X86-AVX-NEXT:    vmovups 16(%esp,%ecx,8), %xmm1
; X86-AVX-NEXT:    vmovups 32(%esp,%ecx,8), %xmm2
; X86-AVX-NEXT:    vmovups 48(%esp,%ecx,8), %xmm3
; X86-AVX-NEXT:    vmovups %xmm3, 48(%eax)
; X86-AVX-NEXT:    vmovups %xmm2, 32(%eax)
; X86-AVX-NEXT:    vmovups %xmm1, 16(%eax)
; X86-AVX-NEXT:    vmovups %xmm0, (%eax)
; X86-AVX-NEXT:    addl $128, %esp
; X86-AVX-NEXT:    popl %esi
; X86-AVX-NEXT:    popl %edi
; X86-AVX-NEXT:    popl %ebx
; X86-AVX-NEXT:    vzeroupper
; X86-AVX-NEXT:    retl
; IR under test; the assertions above are autogenerated from it and must not
; be edited by hand.
  %src = load i512, ptr %src.ptr, align 1
  %qwordOff = load i512, ptr %qwordOff.ptr, align 1
  %bitOff = shl i512 %qwordOff, 6
  %res = ashr i512 %src, %bitOff
  store i512 %res, ptr %dst, align 1
  ret void
}

;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; ALL: {{.*}}
; X64: {{.*}}
; X86: {{.*}}