; llvm/llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse2,-bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-NO-BMI2,X64-NO-SHLD,X64-NO-BMI2-NO-SHLD
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse2,-bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-NO-BMI2,X64-SHLD,X64-NO-BMI2-HAVE-SHLD
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse2,+bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-BMI2,X64-NO-SHLD,X64-HAVE-BMI2-NO-SHLD
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse2,+bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-BMI2,X64-SHLD,X64-HAVE-BMI2-HAVE-SHLD
; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+sse2,-bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-NO-BMI2,X86-NO-SHLD,X86-NO-BMI2-NO-SHLD
; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+sse2,-bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-NO-BMI2,X86-SHLD,X86-NO-BMI2-HAVE-SHLD
; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+sse2,+bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-BMI2,X86-NO-SHLD,X86-HAVE-BMI2-NO-SHLD
; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+sse2,+bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-BMI2,X86-SHLD,X86-HAVE-BMI2-HAVE-SHLD

; no @load_1byte_chunk_of_1byte_alloca

; Extract one byte out of a 2-byte value at a variable byte offset.
;   %src     - pointer to the 2 source bytes (loaded with align 1)
;   %byteOff - byte offset of the chunk; multiplied by 8 to form a bit shift
;   %dst     - pointer that receives the extracted byte
; The autogenerated checks below expect one zero-extending 16-bit load followed
; by a 32-bit variable right shift (shrxl when BMI2 is enabled) and a byte store.
define void @load_1byte_chunk_of_2byte_alloca(ptr %src, i64 %byteOff, ptr %dst) nounwind {
; X64-NO-BMI2-LABEL: load_1byte_chunk_of_2byte_alloca:
; X64-NO-BMI2:       # %bb.0:
; X64-NO-BMI2-NEXT:    movzwl (%rdi), %eax
; X64-NO-BMI2-NEXT:    leal (,%rsi,8), %ecx
; X64-NO-BMI2-NEXT:    # kill: def $cl killed $cl killed $ecx
; X64-NO-BMI2-NEXT:    shrl %cl, %eax
; X64-NO-BMI2-NEXT:    movb %al, (%rdx)
; X64-NO-BMI2-NEXT:    retq
;
; X64-BMI2-LABEL: load_1byte_chunk_of_2byte_alloca:
; X64-BMI2:       # %bb.0:
; X64-BMI2-NEXT:    movzwl (%rdi), %eax
; X64-BMI2-NEXT:    shll $3, %esi
; X64-BMI2-NEXT:    shrxl %esi, %eax, %eax
; X64-BMI2-NEXT:    movb %al, (%rdx)
; X64-BMI2-NEXT:    retq
;
; X86-NO-BMI2-LABEL: load_1byte_chunk_of_2byte_alloca:
; X86-NO-BMI2:       # %bb.0:
; X86-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NO-BMI2-NEXT:    movzwl (%eax), %eax
; X86-NO-BMI2-NEXT:    shll $3, %ecx
; X86-NO-BMI2-NEXT:    # kill: def $cl killed $cl killed $ecx
; X86-NO-BMI2-NEXT:    shrl %cl, %eax
; X86-NO-BMI2-NEXT:    movb %al, (%edx)
; X86-NO-BMI2-NEXT:    retl
;
; X86-BMI2-LABEL: load_1byte_chunk_of_2byte_alloca:
; X86-BMI2:       # %bb.0:
; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-BMI2-NEXT:    movzwl (%edx), %edx
; X86-BMI2-NEXT:    shll $3, %ecx
; X86-BMI2-NEXT:    shrxl %ecx, %edx, %ecx
; X86-BMI2-NEXT:    movb %cl, (%eax)
; X86-BMI2-NEXT:    retl
  ; Load both bytes as a vector, freeze the value, and reinterpret it as one i16.
  %init = load <2 x i8>, ptr %src, align 1
  %intermediate.val.frozen = freeze <2 x i8> %init
  %intermediate.val.frozen.bits = bitcast <2 x i8> %intermediate.val.frozen to i16
  ; Only the low 16 bits of %byteOff are used; shl by 3 converts bytes to bits.
  %byteOff.tr = trunc i64 %byteOff to i16
  %byteOff.numbits.wide = shl i16 %byteOff.tr, 3
  ; Move the requested byte down to bit 0 and keep just that byte.
  %intermediate.val.frozen.bits.positioned = lshr i16 %intermediate.val.frozen.bits, %byteOff.numbits.wide
  %intermediate.val.frozen.bits.positioned.extracted = trunc i16 %intermediate.val.frozen.bits.positioned to i8
  ; The byte is stored wrapped in a one-element vector.
  %1 = insertelement <1 x i8> poison, i8 %intermediate.val.frozen.bits.positioned.extracted, i64 0
  store <1 x i8> %1, ptr %dst, align 1
  ret void
}

; no @load_2byte_chunk_of_2byte_alloca

; Extract one byte out of a 4-byte value at a variable byte offset.
;   %src     - pointer to the 4 source bytes (loaded with align 1)
;   %byteOff - byte offset of the chunk; multiplied by 8 to form a bit shift
;   %dst     - pointer that receives the extracted byte
; The autogenerated checks below expect a single 32-bit load and a 32-bit
; variable right shift; with BMI2 the load is folded into shrxl's memory operand.
define void @load_1byte_chunk_of_4byte_alloca(ptr %src, i64 %byteOff, ptr %dst) nounwind {
; X64-NO-BMI2-LABEL: load_1byte_chunk_of_4byte_alloca:
; X64-NO-BMI2:       # %bb.0:
; X64-NO-BMI2-NEXT:    movl (%rdi), %eax
; X64-NO-BMI2-NEXT:    leal (,%rsi,8), %ecx
; X64-NO-BMI2-NEXT:    # kill: def $cl killed $cl killed $ecx
; X64-NO-BMI2-NEXT:    shrl %cl, %eax
; X64-NO-BMI2-NEXT:    movb %al, (%rdx)
; X64-NO-BMI2-NEXT:    retq
;
; X64-BMI2-LABEL: load_1byte_chunk_of_4byte_alloca:
; X64-BMI2:       # %bb.0:
; X64-BMI2-NEXT:    shll $3, %esi
; X64-BMI2-NEXT:    shrxl %esi, (%rdi), %eax
; X64-BMI2-NEXT:    movb %al, (%rdx)
; X64-BMI2-NEXT:    retq
;
; X86-NO-BMI2-LABEL: load_1byte_chunk_of_4byte_alloca:
; X86-NO-BMI2:       # %bb.0:
; X86-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NO-BMI2-NEXT:    movl (%eax), %eax
; X86-NO-BMI2-NEXT:    shll $3, %ecx
; X86-NO-BMI2-NEXT:    # kill: def $cl killed $cl killed $ecx
; X86-NO-BMI2-NEXT:    shrl %cl, %eax
; X86-NO-BMI2-NEXT:    movb %al, (%edx)
; X86-NO-BMI2-NEXT:    retl
;
; X86-BMI2-LABEL: load_1byte_chunk_of_4byte_alloca:
; X86-BMI2:       # %bb.0:
; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-BMI2-NEXT:    shll $3, %ecx
; X86-BMI2-NEXT:    shrxl %ecx, (%edx), %ecx
; X86-BMI2-NEXT:    movb %cl, (%eax)
; X86-BMI2-NEXT:    retl
  ; Load all four bytes as a vector, freeze the value, and reinterpret it as one i32.
  %init = load <4 x i8>, ptr %src, align 1
  %intermediate.val.frozen = freeze <4 x i8> %init
  %intermediate.val.frozen.bits = bitcast <4 x i8> %intermediate.val.frozen to i32
  ; Only the low 32 bits of %byteOff are used; shl by 3 converts bytes to bits.
  %byteOff.tr = trunc i64 %byteOff to i32
  %byteOff.numbits.wide = shl i32 %byteOff.tr, 3
  ; Move the requested byte down to bit 0 and keep just that byte.
  %intermediate.val.frozen.bits.positioned = lshr i32 %intermediate.val.frozen.bits, %byteOff.numbits.wide
  %intermediate.val.frozen.bits.positioned.extracted = trunc i32 %intermediate.val.frozen.bits.positioned to i8
  ; The byte is stored wrapped in a one-element vector.
  %1 = insertelement <1 x i8> poison, i8 %intermediate.val.frozen.bits.positioned.extracted, i64 0
  store <1 x i8> %1, ptr %dst, align 1
  ret void
}

; Extract a 2-byte chunk out of a 4-byte value at a variable byte offset.
;   %src     - pointer to the 4 source bytes (loaded with align 1)
;   %byteOff - byte offset of the chunk; multiplied by 8 to form a bit shift
;   %dst     - pointer that receives the extracted i16 (stored with align 2)
; The autogenerated checks below expect the same 32-bit load + variable shift
; as the 1-byte variant, finished with a 16-bit store (movw).
define void @load_2byte_chunk_of_4byte_alloca(ptr %src, i64 %byteOff, ptr %dst) nounwind {
; X64-NO-BMI2-LABEL: load_2byte_chunk_of_4byte_alloca:
; X64-NO-BMI2:       # %bb.0:
; X64-NO-BMI2-NEXT:    movl (%rdi), %eax
; X64-NO-BMI2-NEXT:    leal (,%rsi,8), %ecx
; X64-NO-BMI2-NEXT:    # kill: def $cl killed $cl killed $ecx
; X64-NO-BMI2-NEXT:    shrl %cl, %eax
; X64-NO-BMI2-NEXT:    movw %ax, (%rdx)
; X64-NO-BMI2-NEXT:    retq
;
; X64-BMI2-LABEL: load_2byte_chunk_of_4byte_alloca:
; X64-BMI2:       # %bb.0:
; X64-BMI2-NEXT:    shll $3, %esi
; X64-BMI2-NEXT:    shrxl %esi, (%rdi), %eax
; X64-BMI2-NEXT:    movw %ax, (%rdx)
; X64-BMI2-NEXT:    retq
;
; X86-NO-BMI2-LABEL: load_2byte_chunk_of_4byte_alloca:
; X86-NO-BMI2:       # %bb.0:
; X86-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NO-BMI2-NEXT:    movl (%edx), %edx
; X86-NO-BMI2-NEXT:    shll $3, %ecx
; X86-NO-BMI2-NEXT:    # kill: def $cl killed $cl killed $ecx
; X86-NO-BMI2-NEXT:    shrl %cl, %edx
; X86-NO-BMI2-NEXT:    movw %dx, (%eax)
; X86-NO-BMI2-NEXT:    retl
;
; X86-BMI2-LABEL: load_2byte_chunk_of_4byte_alloca:
; X86-BMI2:       # %bb.0:
; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-BMI2-NEXT:    shll $3, %ecx
; X86-BMI2-NEXT:    shrxl %ecx, (%edx), %ecx
; X86-BMI2-NEXT:    movw %cx, (%eax)
; X86-BMI2-NEXT:    retl
  ; Load all four bytes as a vector, freeze the value, and reinterpret it as one i32.
  %init = load <4 x i8>, ptr %src, align 1
  %intermediate.val.frozen = freeze <4 x i8> %init
  %intermediate.val.frozen.bits = bitcast <4 x i8> %intermediate.val.frozen to i32
  ; Only the low 32 bits of %byteOff are used; shl by 3 converts bytes to bits.
  %byteOff.tr = trunc i64 %byteOff to i32
  %byteOff.numbits.wide = shl i32 %byteOff.tr, 3
  ; Move the requested chunk down to bit 0 and keep its low 16 bits.
  %intermediate.val.frozen.bits.positioned = lshr i32 %intermediate.val.frozen.bits, %byteOff.numbits.wide
  %intermediate.val.frozen.bits.positioned.extracted = trunc i32 %intermediate.val.frozen.bits.positioned to i16
  ; Unlike the 1-byte variants, the chunk is stored as a scalar i16.
  store i16 %intermediate.val.frozen.bits.positioned.extracted, ptr %dst, align 2
  ret void
}

; no @load_4byte_chunk_of_4byte_alloca

; Extract one byte out of an 8-byte value at a variable byte offset.
;   %src     - pointer to the 8 source bytes (loaded with align 1)
;   %byteOff - byte offset of the chunk; multiplied by 8 to form a bit shift
;   %dst     - pointer that receives the extracted byte
; The autogenerated checks below expect:
;   * x86-64: one 64-bit load and one 64-bit variable right shift.
;   * i686: the value is loaded through xmm0 and split into two 32-bit halves;
;     the 64-bit shift is then built from a combined shift of the low half
;     (shrdl, or shift/not/shift/or when SHLD is slow), a plain shift of the
;     high half, and a cmov keyed on bit 5 of the shift amount (testb $32).
define void @load_1byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst) nounwind {
; X64-NO-BMI2-LABEL: load_1byte_chunk_of_8byte_alloca:
; X64-NO-BMI2:       # %bb.0:
; X64-NO-BMI2-NEXT:    movq (%rdi), %rax
; X64-NO-BMI2-NEXT:    leal (,%rsi,8), %ecx
; X64-NO-BMI2-NEXT:    # kill: def $cl killed $cl killed $ecx
; X64-NO-BMI2-NEXT:    shrq %cl, %rax
; X64-NO-BMI2-NEXT:    movb %al, (%rdx)
; X64-NO-BMI2-NEXT:    retq
;
; X64-BMI2-LABEL: load_1byte_chunk_of_8byte_alloca:
; X64-BMI2:       # %bb.0:
; X64-BMI2-NEXT:    shll $3, %esi
; X64-BMI2-NEXT:    shrxq %rsi, (%rdi), %rax
; X64-BMI2-NEXT:    movb %al, (%rdx)
; X64-BMI2-NEXT:    retq
;
; X86-NO-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_8byte_alloca:
; X86-NO-BMI2-NO-SHLD:       # %bb.0:
; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %ebx
; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NO-BMI2-NO-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; X86-NO-BMI2-NO-SHLD-NEXT:    shll $3, %eax
; X86-NO-BMI2-NO-SHLD-NEXT:    movd %xmm0, %esi
; X86-NO-BMI2-NO-SHLD-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; X86-NO-BMI2-NO-SHLD-NEXT:    movd %xmm0, %ebx
; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
; X86-NO-BMI2-NO-SHLD-NEXT:    notb %cl
; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ebx,%ebx), %edi
; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edi
; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
; X86-NO-BMI2-NO-SHLD-NEXT:    testb $32, %al
; X86-NO-BMI2-NO-SHLD-NEXT:    cmovel %edi, %ebx
; X86-NO-BMI2-NO-SHLD-NEXT:    movb %bl, (%edx)
; X86-NO-BMI2-NO-SHLD-NEXT:    popl %esi
; X86-NO-BMI2-NO-SHLD-NEXT:    popl %edi
; X86-NO-BMI2-NO-SHLD-NEXT:    popl %ebx
; X86-NO-BMI2-NO-SHLD-NEXT:    retl
;
; X86-NO-BMI2-HAVE-SHLD-LABEL: load_1byte_chunk_of_8byte_alloca:
; X86-NO-BMI2-HAVE-SHLD:       # %bb.0:
; X86-NO-BMI2-HAVE-SHLD-NEXT:    pushl %esi
; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NO-BMI2-HAVE-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; X86-NO-BMI2-HAVE-SHLD-NEXT:    shll $3, %ecx
; X86-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %esi
; X86-NO-BMI2-HAVE-SHLD-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; X86-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %edx
; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %esi
; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %edx
; X86-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
; X86-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %esi, %edx
; X86-NO-BMI2-HAVE-SHLD-NEXT:    movb %dl, (%eax)
; X86-NO-BMI2-HAVE-SHLD-NEXT:    popl %esi
; X86-NO-BMI2-HAVE-SHLD-NEXT:    retl
;
; X86-HAVE-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_8byte_alloca:
; X86-HAVE-BMI2-NO-SHLD:       # %bb.0:
; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-HAVE-BMI2-NO-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; X86-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm0, %edx
; X86-HAVE-BMI2-NO-SHLD-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; X86-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm0, %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %edx, %edx
; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %bl
; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %edi, %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edx, %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %esi, %edx
; X86-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %cl
; X86-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %edi, %edx
; X86-HAVE-BMI2-NO-SHLD-NEXT:    movb %dl, (%eax)
; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT:    retl
;
; X86-HAVE-BMI2-HAVE-SHLD-LABEL: load_1byte_chunk_of_8byte_alloca:
; X86-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %esi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shll $3, %ecx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %edx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %esi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %edx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %esi, %ebx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %edx, %ebx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movb %bl, (%eax)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %esi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %ebx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    retl
  %init = load <8 x i8>, ptr %src, align 1
  ; Bytes -> bits at full i64 width; nuw/nsw assert the multiply by 8 does not wrap.
  %byteOff.numbits = shl nuw nsw i64 %byteOff, 3
  ; Freeze the loaded vector and reinterpret it as one i64.
  %intermediate.val.frozen = freeze <8 x i8> %init
  %intermediate.val.frozen.bits = bitcast <8 x i8> %intermediate.val.frozen to i64
  ; Move the requested byte down to bit 0 and keep just that byte.
  %intermediate.val.frozen.bits.positioned = lshr i64 %intermediate.val.frozen.bits, %byteOff.numbits
  %intermediate.val.frozen.bits.positioned.extracted = trunc i64 %intermediate.val.frozen.bits.positioned to i8
  ; The byte is stored wrapped in a one-element vector.
  %1 = insertelement <1 x i8> poison, i8 %intermediate.val.frozen.bits.positioned.extracted, i64 0
  store <1 x i8> %1, ptr %dst, align 1
  ret void
}

; Extract a 2-byte chunk out of an 8-byte value at a variable byte offset.
;   %src     - pointer to the 8 source bytes (loaded with align 1)
;   %byteOff - byte offset of the chunk; multiplied by 8 to form a bit shift
;   %dst     - pointer that receives the extracted i16 (stored with align 2)
; The autogenerated checks below expect a single 64-bit shift on x86-64 and a
; two-half emulated 64-bit shift on i686 (value split via xmm0, cmov keyed on
; testb $32), finished with a 16-bit store (movw).
define void @load_2byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst) nounwind {
; X64-NO-BMI2-LABEL: load_2byte_chunk_of_8byte_alloca:
; X64-NO-BMI2:       # %bb.0:
; X64-NO-BMI2-NEXT:    movq (%rdi), %rax
; X64-NO-BMI2-NEXT:    leal (,%rsi,8), %ecx
; X64-NO-BMI2-NEXT:    # kill: def $cl killed $cl killed $ecx
; X64-NO-BMI2-NEXT:    shrq %cl, %rax
; X64-NO-BMI2-NEXT:    movw %ax, (%rdx)
; X64-NO-BMI2-NEXT:    retq
;
; X64-BMI2-LABEL: load_2byte_chunk_of_8byte_alloca:
; X64-BMI2:       # %bb.0:
; X64-BMI2-NEXT:    shll $3, %esi
; X64-BMI2-NEXT:    shrxq %rsi, (%rdi), %rax
; X64-BMI2-NEXT:    movw %ax, (%rdx)
; X64-BMI2-NEXT:    retq
;
; X86-NO-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_8byte_alloca:
; X86-NO-BMI2-NO-SHLD:       # %bb.0:
; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %ebx
; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NO-BMI2-NO-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; X86-NO-BMI2-NO-SHLD-NEXT:    shll $3, %eax
; X86-NO-BMI2-NO-SHLD-NEXT:    movd %xmm0, %edi
; X86-NO-BMI2-NO-SHLD-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; X86-NO-BMI2-NO-SHLD-NEXT:    movd %xmm0, %esi
; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
; X86-NO-BMI2-NO-SHLD-NEXT:    notb %cl
; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %ebx
; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %ebx
; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
; X86-NO-BMI2-NO-SHLD-NEXT:    testb $32, %al
; X86-NO-BMI2-NO-SHLD-NEXT:    cmovel %ebx, %esi
; X86-NO-BMI2-NO-SHLD-NEXT:    movw %si, (%edx)
; X86-NO-BMI2-NO-SHLD-NEXT:    popl %esi
; X86-NO-BMI2-NO-SHLD-NEXT:    popl %edi
; X86-NO-BMI2-NO-SHLD-NEXT:    popl %ebx
; X86-NO-BMI2-NO-SHLD-NEXT:    retl
;
; X86-NO-BMI2-HAVE-SHLD-LABEL: load_2byte_chunk_of_8byte_alloca:
; X86-NO-BMI2-HAVE-SHLD:       # %bb.0:
; X86-NO-BMI2-HAVE-SHLD-NEXT:    pushl %esi
; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NO-BMI2-HAVE-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; X86-NO-BMI2-HAVE-SHLD-NEXT:    shll $3, %ecx
; X86-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %edx
; X86-NO-BMI2-HAVE-SHLD-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; X86-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %esi
; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %edx
; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %esi
; X86-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
; X86-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %edx, %esi
; X86-NO-BMI2-HAVE-SHLD-NEXT:    movw %si, (%eax)
; X86-NO-BMI2-HAVE-SHLD-NEXT:    popl %esi
; X86-NO-BMI2-HAVE-SHLD-NEXT:    retl
;
; X86-HAVE-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_8byte_alloca:
; X86-HAVE-BMI2-NO-SHLD:       # %bb.0:
; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-HAVE-BMI2-NO-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; X86-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm0, %edx
; X86-HAVE-BMI2-NO-SHLD-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; X86-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm0, %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %edx, %edx
; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %bl
; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %edi, %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edx, %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %esi, %edx
; X86-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %cl
; X86-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %edi, %edx
; X86-HAVE-BMI2-NO-SHLD-NEXT:    movw %dx, (%eax)
; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT:    retl
;
; X86-HAVE-BMI2-HAVE-SHLD-LABEL: load_2byte_chunk_of_8byte_alloca:
; X86-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %esi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shll $3, %ecx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %edx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %esi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %edx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %esi, %esi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %edx, %esi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movw %si, (%eax)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %esi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    retl
  %init = load <8 x i8>, ptr %src, align 1
  ; Bytes -> bits at full i64 width; nuw/nsw assert the multiply by 8 does not wrap.
  %byteOff.numbits = shl nuw nsw i64 %byteOff, 3
  ; Freeze the loaded vector and reinterpret it as one i64.
  %intermediate.val.frozen = freeze <8 x i8> %init
  %intermediate.val.frozen.bits = bitcast <8 x i8> %intermediate.val.frozen to i64
  ; Move the requested chunk down to bit 0 and keep its low 16 bits.
  %intermediate.val.frozen.bits.positioned = lshr i64 %intermediate.val.frozen.bits, %byteOff.numbits
  %intermediate.val.frozen.bits.positioned.extracted = trunc i64 %intermediate.val.frozen.bits.positioned to i16
  ; The chunk is stored as a scalar i16.
  store i16 %intermediate.val.frozen.bits.positioned.extracted, ptr %dst, align 2
  ret void
}

; Extract a 4-byte chunk out of an 8-byte value at a variable byte offset.
;   %src     - pointer to the 8 source bytes (loaded with align 1)
;   %byteOff - byte offset of the chunk; multiplied by 8 to form a bit shift
;   %dst     - pointer that receives the extracted i32 (stored with align 4)
; The autogenerated checks below expect a single 64-bit shift on x86-64 and a
; two-half emulated 64-bit shift on i686 (value split via xmm0, cmov keyed on
; testb $32), finished with a 32-bit store (movl).
define void @load_4byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst) nounwind {
; X64-NO-BMI2-LABEL: load_4byte_chunk_of_8byte_alloca:
; X64-NO-BMI2:       # %bb.0:
; X64-NO-BMI2-NEXT:    movq (%rdi), %rax
; X64-NO-BMI2-NEXT:    leal (,%rsi,8), %ecx
; X64-NO-BMI2-NEXT:    # kill: def $cl killed $cl killed $ecx
; X64-NO-BMI2-NEXT:    shrq %cl, %rax
; X64-NO-BMI2-NEXT:    movl %eax, (%rdx)
; X64-NO-BMI2-NEXT:    retq
;
; X64-BMI2-LABEL: load_4byte_chunk_of_8byte_alloca:
; X64-BMI2:       # %bb.0:
; X64-BMI2-NEXT:    shll $3, %esi
; X64-BMI2-NEXT:    shrxq %rsi, (%rdi), %rax
; X64-BMI2-NEXT:    movl %eax, (%rdx)
; X64-BMI2-NEXT:    retq
;
; X86-NO-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_8byte_alloca:
; X86-NO-BMI2-NO-SHLD:       # %bb.0:
; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %ebx
; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NO-BMI2-NO-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; X86-NO-BMI2-NO-SHLD-NEXT:    shll $3, %eax
; X86-NO-BMI2-NO-SHLD-NEXT:    movd %xmm0, %edi
; X86-NO-BMI2-NO-SHLD-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; X86-NO-BMI2-NO-SHLD-NEXT:    movd %xmm0, %esi
; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
; X86-NO-BMI2-NO-SHLD-NEXT:    notb %cl
; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %ebx
; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %ebx
; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
; X86-NO-BMI2-NO-SHLD-NEXT:    testb $32, %al
; X86-NO-BMI2-NO-SHLD-NEXT:    cmovel %ebx, %esi
; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, (%edx)
; X86-NO-BMI2-NO-SHLD-NEXT:    popl %esi
; X86-NO-BMI2-NO-SHLD-NEXT:    popl %edi
; X86-NO-BMI2-NO-SHLD-NEXT:    popl %ebx
; X86-NO-BMI2-NO-SHLD-NEXT:    retl
;
; X86-NO-BMI2-HAVE-SHLD-LABEL: load_4byte_chunk_of_8byte_alloca:
; X86-NO-BMI2-HAVE-SHLD:       # %bb.0:
; X86-NO-BMI2-HAVE-SHLD-NEXT:    pushl %esi
; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NO-BMI2-HAVE-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; X86-NO-BMI2-HAVE-SHLD-NEXT:    shll $3, %ecx
; X86-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %edx
; X86-NO-BMI2-HAVE-SHLD-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; X86-NO-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %esi
; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %edx
; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrl %cl, %esi
; X86-NO-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
; X86-NO-BMI2-HAVE-SHLD-NEXT:    cmovel %edx, %esi
; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, (%eax)
; X86-NO-BMI2-HAVE-SHLD-NEXT:    popl %esi
; X86-NO-BMI2-HAVE-SHLD-NEXT:    retl
;
; X86-HAVE-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_8byte_alloca:
; X86-HAVE-BMI2-NO-SHLD:       # %bb.0:
; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-HAVE-BMI2-NO-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; X86-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm0, %edx
; X86-HAVE-BMI2-NO-SHLD-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; X86-HAVE-BMI2-NO-SHLD-NEXT:    movd %xmm0, %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %edx, %edx
; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %bl
; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %edi, %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edx, %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %esi, %edx
; X86-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %cl
; X86-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %edi, %edx
; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, (%eax)
; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT:    retl
;
; X86-HAVE-BMI2-HAVE-SHLD-LABEL: load_4byte_chunk_of_8byte_alloca:
; X86-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %esi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shll $3, %ecx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %edx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movd %xmm0, %esi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %edx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %esi, %esi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $32, %cl
; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    cmovel %edx, %esi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, (%eax)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %esi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    retl
  %init = load <8 x i8>, ptr %src, align 1
  ; Bytes -> bits at full i64 width; nuw/nsw assert the multiply by 8 does not wrap.
  %byteOff.numbits = shl nuw nsw i64 %byteOff, 3
  ; Freeze the loaded vector and reinterpret it as one i64.
  %intermediate.val.frozen = freeze <8 x i8> %init
  %intermediate.val.frozen.bits = bitcast <8 x i8> %intermediate.val.frozen to i64
  ; Move the requested chunk down to bit 0 and keep its low 32 bits.
  %intermediate.val.frozen.bits.positioned = lshr i64 %intermediate.val.frozen.bits, %byteOff.numbits
  %intermediate.val.frozen.bits.positioned.extracted = trunc i64 %intermediate.val.frozen.bits.positioned to i32
  ; The chunk is stored as a scalar i32.
  store i32 %intermediate.val.frozen.bits.positioned.extracted, ptr %dst, align 4
  ret void
}

; no @load_8byte_chunk_of_8byte_alloca

; Extract one byte out of a 16-byte value at a variable byte offset.
;   %src     - pointer to the 16 source bytes (loaded with align 1)
;   %byteOff - byte offset of the chunk; multiplied by 8 to form a bit shift
;   %dst     - pointer that receives the extracted byte
; The autogenerated checks below expect:
;   * x86-64: an unaligned 16-byte vector load split into two 64-bit halves,
;     a 128-bit shift emulated from 64-bit shifts, and a cmov keyed on bit 6
;     of the shift amount (testb $64).
;   * i686: the value and a zeroed xmm register are both spilled to the stack;
;     a dword-aligned index derived from the shift amount (shrb $3 / andb $12)
;     selects two adjacent 32-bit words, which are then shifted as a pair.
;     Both fast-SHLD configurations share one set of checks here.
define void @load_1byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst) nounwind {
; X64-NO-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_16byte_alloca:
; X64-NO-BMI2-NO-SHLD:       # %bb.0:
; X64-NO-BMI2-NO-SHLD-NEXT:    movdqu (%rdi), %xmm0
; X64-NO-BMI2-NO-SHLD-NEXT:    shll $3, %esi
; X64-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-NO-BMI2-NO-SHLD-NEXT:    movq %xmm1, %rax
; X64-NO-BMI2-NO-SHLD-NEXT:    movq %xmm0, %rdi
; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rdi
; X64-NO-BMI2-NO-SHLD-NEXT:    leal (%rax,%rax), %r8d
; X64-NO-BMI2-NO-SHLD-NEXT:    notb %cl
; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r8
; X64-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %r8d
; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rax
; X64-NO-BMI2-NO-SHLD-NEXT:    testb $64, %sil
; X64-NO-BMI2-NO-SHLD-NEXT:    cmoveq %r8, %rax
; X64-NO-BMI2-NO-SHLD-NEXT:    movb %al, (%rdx)
; X64-NO-BMI2-NO-SHLD-NEXT:    retq
;
; X64-NO-BMI2-HAVE-SHLD-LABEL: load_1byte_chunk_of_16byte_alloca:
; X64-NO-BMI2-HAVE-SHLD:       # %bb.0:
; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rsi, %rcx
; X64-NO-BMI2-HAVE-SHLD-NEXT:    movdqu (%rdi), %xmm0
; X64-NO-BMI2-HAVE-SHLD-NEXT:    shll $3, %ecx
; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rax
; X64-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rsi
; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rsi, %rax
; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %cl, %rsi
; X64-NO-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmoveq %rax, %rsi
; X64-NO-BMI2-HAVE-SHLD-NEXT:    movb %sil, (%rdx)
; X64-NO-BMI2-HAVE-SHLD-NEXT:    retq
;
; X64-HAVE-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_16byte_alloca:
; X64-HAVE-BMI2-NO-SHLD:       # %bb.0:
; X64-HAVE-BMI2-NO-SHLD-NEXT:    movdqu (%rdi), %xmm0
; X64-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %esi
; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %xmm0, %rax
; X64-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %xmm0, %rcx
; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %edi
; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %dil
; X64-HAVE-BMI2-NO-SHLD-NEXT:    leal (%rcx,%rcx), %r8d
; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rdi, %r8, %rdi
; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rsi, %rax, %rax
; X64-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %eax
; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rsi, %rcx, %rcx
; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb $64, %sil
; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmoveq %rax, %rcx
; X64-HAVE-BMI2-NO-SHLD-NEXT:    movb %cl, (%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT:    retq
;
; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_1byte_chunk_of_16byte_alloca:
; X64-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rsi, %rcx
; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movdqu (%rdi), %xmm0
; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shll $3, %ecx
; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rax
; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rsi
; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rsi, %rax
; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rcx, %rsi, %rsi
; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmoveq %rax, %rsi
; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movb %sil, (%rdx)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    retq
;
; X86-NO-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_16byte_alloca:
; X86-NO-BMI2-NO-SHLD:       # %bb.0:
; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
; X86-NO-BMI2-NO-SHLD-NEXT:    subl $40, %esp
; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NO-BMI2-NO-SHLD-NEXT:    movups (%edx), %xmm0
; X86-NO-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
; X86-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm1, %xmm1
; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%esp)
; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
; X86-NO-BMI2-NO-SHLD-NEXT:    shrb $3, %dl
; X86-NO-BMI2-NO-SHLD-NEXT:    andb $12, %dl
; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl %dl, %edx
; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp,%edx), %esi
; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%edx), %edx
; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
; X86-NO-BMI2-NO-SHLD-NEXT:    notb %cl
; X86-NO-BMI2-NO-SHLD-NEXT:    addl %edx, %edx
; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edx
; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, (%eax)
; X86-NO-BMI2-NO-SHLD-NEXT:    addl $40, %esp
; X86-NO-BMI2-NO-SHLD-NEXT:    popl %esi
; X86-NO-BMI2-NO-SHLD-NEXT:    retl
;
; X86-SHLD-LABEL: load_1byte_chunk_of_16byte_alloca:
; X86-SHLD:       # %bb.0:
; X86-SHLD-NEXT:    pushl %ebx
; X86-SHLD-NEXT:    subl $40, %esp
; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-SHLD-NEXT:    movups (%edx), %xmm0
; X86-SHLD-NEXT:    shll $3, %ecx
; X86-SHLD-NEXT:    xorps %xmm1, %xmm1
; X86-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
; X86-SHLD-NEXT:    movaps %xmm0, (%esp)
; X86-SHLD-NEXT:    movl %ecx, %edx
; X86-SHLD-NEXT:    shrb $3, %dl
; X86-SHLD-NEXT:    andb $12, %dl
; X86-SHLD-NEXT:    movzbl %dl, %edx
; X86-SHLD-NEXT:    movl (%esp,%edx), %ebx
; X86-SHLD-NEXT:    movl 4(%esp,%edx), %edx
; X86-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
; X86-SHLD-NEXT:    shrdl %cl, %edx, %ebx
; X86-SHLD-NEXT:    movb %bl, (%eax)
; X86-SHLD-NEXT:    addl $40, %esp
; X86-SHLD-NEXT:    popl %ebx
; X86-SHLD-NEXT:    retl
;
; X86-HAVE-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_16byte_alloca:
; X86-HAVE-BMI2-NO-SHLD:       # %bb.0:
; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $40, %esp
; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups (%edx), %xmm0
; X86-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm1, %xmm1
; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrb $3, %dl
; X86-HAVE-BMI2-NO-SHLD-NEXT:    andb $12, %dl
; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %dl, %edx
; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, (%esp,%edx), %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %cl
; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%edx), %edx
; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %edx, %edx
; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %edx, %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT:    movb %cl, (%eax)
; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $40, %esp
; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT:    retl
  %init = load <16 x i8>, ptr %src, align 1
  ; Bytes -> bits at i64 width; nuw/nsw assert the multiply by 8 does not wrap.
  %byteOff.numbits = shl nuw nsw i64 %byteOff, 3
  ; Freeze the loaded vector and reinterpret it as one i128.
  %intermediate.val.frozen = freeze <16 x i8> %init
  %intermediate.val.frozen.bits = bitcast <16 x i8> %intermediate.val.frozen to i128
  ; The shift amount must have the same type as the shifted value, so widen it to i128.
  %byteOff.numbits.wide = zext i64 %byteOff.numbits to i128
  ; Move the requested byte down to bit 0 and keep just that byte.
  %intermediate.val.frozen.bits.positioned = lshr i128 %intermediate.val.frozen.bits, %byteOff.numbits.wide
  %intermediate.val.frozen.bits.positioned.extracted = trunc i128 %intermediate.val.frozen.bits.positioned to i8
  ; The byte is stored wrapped in a one-element vector.
  %1 = insertelement <1 x i8> poison, i8 %intermediate.val.frozen.bits.positioned.extracted, i64 0
  store <1 x i8> %1, ptr %dst, align 1
  ret void
}

; Variable-offset extraction of a 2-byte chunk from a 16-byte value: the 16
; bytes at %src are loaded, frozen, viewed as an i128, shifted right by
; %byteOff * 8 bits, truncated to i16 and stored to %dst.
; The assertion lines below are autogenerated (see the NOTE at the top of this
; file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
define void @load_2byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst) nounwind {
; X64-NO-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_16byte_alloca:
; X64-NO-BMI2-NO-SHLD:       # %bb.0:
; X64-NO-BMI2-NO-SHLD-NEXT:    movdqu (%rdi), %xmm0
; X64-NO-BMI2-NO-SHLD-NEXT:    shll $3, %esi
; X64-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-NO-BMI2-NO-SHLD-NEXT:    movq %xmm1, %rax
; X64-NO-BMI2-NO-SHLD-NEXT:    movq %xmm0, %rdi
; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rdi
; X64-NO-BMI2-NO-SHLD-NEXT:    leal (%rax,%rax), %r8d
; X64-NO-BMI2-NO-SHLD-NEXT:    notb %cl
; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r8
; X64-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %r8d
; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rax
; X64-NO-BMI2-NO-SHLD-NEXT:    testb $64, %sil
; X64-NO-BMI2-NO-SHLD-NEXT:    cmoveq %r8, %rax
; X64-NO-BMI2-NO-SHLD-NEXT:    movw %ax, (%rdx)
; X64-NO-BMI2-NO-SHLD-NEXT:    retq
;
; X64-NO-BMI2-HAVE-SHLD-LABEL: load_2byte_chunk_of_16byte_alloca:
; X64-NO-BMI2-HAVE-SHLD:       # %bb.0:
; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rsi, %rcx
; X64-NO-BMI2-HAVE-SHLD-NEXT:    movdqu (%rdi), %xmm0
; X64-NO-BMI2-HAVE-SHLD-NEXT:    shll $3, %ecx
; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rax
; X64-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rsi
; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rsi, %rax
; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %cl, %rsi
; X64-NO-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmoveq %rax, %rsi
; X64-NO-BMI2-HAVE-SHLD-NEXT:    movw %si, (%rdx)
; X64-NO-BMI2-HAVE-SHLD-NEXT:    retq
;
; X64-HAVE-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_16byte_alloca:
; X64-HAVE-BMI2-NO-SHLD:       # %bb.0:
; X64-HAVE-BMI2-NO-SHLD-NEXT:    movdqu (%rdi), %xmm0
; X64-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %esi
; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %xmm0, %rax
; X64-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %xmm0, %rcx
; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %edi
; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %dil
; X64-HAVE-BMI2-NO-SHLD-NEXT:    leal (%rcx,%rcx), %r8d
; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rdi, %r8, %rdi
; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rsi, %rax, %rax
; X64-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %eax
; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rsi, %rcx, %rcx
; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb $64, %sil
; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmoveq %rax, %rcx
; X64-HAVE-BMI2-NO-SHLD-NEXT:    movw %cx, (%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT:    retq
;
; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_2byte_chunk_of_16byte_alloca:
; X64-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rsi, %rcx
; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movdqu (%rdi), %xmm0
; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shll $3, %ecx
; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rax
; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rsi
; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rsi, %rax
; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rcx, %rsi, %rsi
; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmoveq %rax, %rsi
; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movw %si, (%rdx)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    retq
;
; X86-NO-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_16byte_alloca:
; X86-NO-BMI2-NO-SHLD:       # %bb.0:
; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
; X86-NO-BMI2-NO-SHLD-NEXT:    subl $40, %esp
; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NO-BMI2-NO-SHLD-NEXT:    movups (%edx), %xmm0
; X86-NO-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
; X86-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm1, %xmm1
; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%esp)
; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
; X86-NO-BMI2-NO-SHLD-NEXT:    shrb $3, %dl
; X86-NO-BMI2-NO-SHLD-NEXT:    andb $12, %dl
; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl %dl, %edx
; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp,%edx), %esi
; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%edx), %edx
; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
; X86-NO-BMI2-NO-SHLD-NEXT:    notb %cl
; X86-NO-BMI2-NO-SHLD-NEXT:    addl %edx, %edx
; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edx
; X86-NO-BMI2-NO-SHLD-NEXT:    movw %dx, (%eax)
; X86-NO-BMI2-NO-SHLD-NEXT:    addl $40, %esp
; X86-NO-BMI2-NO-SHLD-NEXT:    popl %esi
; X86-NO-BMI2-NO-SHLD-NEXT:    retl
;
; X86-SHLD-LABEL: load_2byte_chunk_of_16byte_alloca:
; X86-SHLD:       # %bb.0:
; X86-SHLD-NEXT:    pushl %esi
; X86-SHLD-NEXT:    subl $40, %esp
; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-SHLD-NEXT:    movups (%edx), %xmm0
; X86-SHLD-NEXT:    shll $3, %ecx
; X86-SHLD-NEXT:    xorps %xmm1, %xmm1
; X86-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
; X86-SHLD-NEXT:    movaps %xmm0, (%esp)
; X86-SHLD-NEXT:    movl %ecx, %edx
; X86-SHLD-NEXT:    shrb $3, %dl
; X86-SHLD-NEXT:    andb $12, %dl
; X86-SHLD-NEXT:    movzbl %dl, %edx
; X86-SHLD-NEXT:    movl (%esp,%edx), %esi
; X86-SHLD-NEXT:    movl 4(%esp,%edx), %edx
; X86-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
; X86-SHLD-NEXT:    shrdl %cl, %edx, %esi
; X86-SHLD-NEXT:    movw %si, (%eax)
; X86-SHLD-NEXT:    addl $40, %esp
; X86-SHLD-NEXT:    popl %esi
; X86-SHLD-NEXT:    retl
;
; X86-HAVE-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_16byte_alloca:
; X86-HAVE-BMI2-NO-SHLD:       # %bb.0:
; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $40, %esp
; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups (%edx), %xmm0
; X86-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm1, %xmm1
; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrb $3, %dl
; X86-HAVE-BMI2-NO-SHLD-NEXT:    andb $12, %dl
; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %dl, %edx
; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, (%esp,%edx), %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %cl
; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%edx), %edx
; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %edx, %edx
; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %edx, %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT:    movw %cx, (%eax)
; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $40, %esp
; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT:    retl
  ; Load all 16 source bytes; only byte alignment is assumed.
  %init = load <16 x i8>, ptr %src, align 1
  ; Convert the byte offset into a bit count (byteOff * 8).
  %byteOff.numbits = shl nuw nsw i64 %byteOff, 3
  ; Freeze the loaded vector and reinterpret it as one 128-bit integer.
  %intermediate.val.frozen = freeze <16 x i8> %init
  %intermediate.val.frozen.bits = bitcast <16 x i8> %intermediate.val.frozen to i128
  ; Widen the shift amount to i128 and shift the wanted chunk down to bit 0.
  %byteOff.numbits.wide = zext i64 %byteOff.numbits to i128
  %intermediate.val.frozen.bits.positioned = lshr i128 %intermediate.val.frozen.bits, %byteOff.numbits.wide
  ; Keep the low 16 bits and store them to %dst.
  %intermediate.val.frozen.bits.positioned.extracted = trunc i128 %intermediate.val.frozen.bits.positioned to i16
  store i16 %intermediate.val.frozen.bits.positioned.extracted, ptr %dst, align 2
  ret void
}

; Variable-offset extraction of a 4-byte chunk from a 16-byte value: the 16
; bytes at %src are loaded, frozen, viewed as an i128, shifted right by
; %byteOff * 8 bits, truncated to i32 and stored to %dst.
; The assertion lines below are autogenerated (see the NOTE at the top of this
; file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
define void @load_4byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst) nounwind {
; X64-NO-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_16byte_alloca:
; X64-NO-BMI2-NO-SHLD:       # %bb.0:
; X64-NO-BMI2-NO-SHLD-NEXT:    movdqu (%rdi), %xmm0
; X64-NO-BMI2-NO-SHLD-NEXT:    shll $3, %esi
; X64-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-NO-BMI2-NO-SHLD-NEXT:    movq %xmm1, %rax
; X64-NO-BMI2-NO-SHLD-NEXT:    movq %xmm0, %rdi
; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rdi
; X64-NO-BMI2-NO-SHLD-NEXT:    leal (%rax,%rax), %r8d
; X64-NO-BMI2-NO-SHLD-NEXT:    notb %cl
; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r8
; X64-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %r8d
; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rax
; X64-NO-BMI2-NO-SHLD-NEXT:    testb $64, %sil
; X64-NO-BMI2-NO-SHLD-NEXT:    cmoveq %r8, %rax
; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, (%rdx)
; X64-NO-BMI2-NO-SHLD-NEXT:    retq
;
; X64-NO-BMI2-HAVE-SHLD-LABEL: load_4byte_chunk_of_16byte_alloca:
; X64-NO-BMI2-HAVE-SHLD:       # %bb.0:
; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rsi, %rcx
; X64-NO-BMI2-HAVE-SHLD-NEXT:    movdqu (%rdi), %xmm0
; X64-NO-BMI2-HAVE-SHLD-NEXT:    shll $3, %ecx
; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rax
; X64-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rsi
; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rsi, %rax
; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %cl, %rsi
; X64-NO-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmoveq %rax, %rsi
; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, (%rdx)
; X64-NO-BMI2-HAVE-SHLD-NEXT:    retq
;
; X64-HAVE-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_16byte_alloca:
; X64-HAVE-BMI2-NO-SHLD:       # %bb.0:
; X64-HAVE-BMI2-NO-SHLD-NEXT:    movdqu (%rdi), %xmm0
; X64-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %esi
; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %xmm0, %rax
; X64-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %xmm0, %rcx
; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %edi
; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %dil
; X64-HAVE-BMI2-NO-SHLD-NEXT:    leal (%rcx,%rcx), %r8d
; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rdi, %r8, %rdi
; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rsi, %rax, %rax
; X64-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %eax
; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rsi, %rcx, %rcx
; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb $64, %sil
; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmoveq %rax, %rcx
; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, (%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT:    retq
;
; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_4byte_chunk_of_16byte_alloca:
; X64-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rsi, %rcx
; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movdqu (%rdi), %xmm0
; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shll $3, %ecx
; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rax
; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rsi
; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rsi, %rax
; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rcx, %rsi, %rsi
; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmoveq %rax, %rsi
; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, (%rdx)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    retq
;
; X86-NO-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_16byte_alloca:
; X86-NO-BMI2-NO-SHLD:       # %bb.0:
; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
; X86-NO-BMI2-NO-SHLD-NEXT:    subl $40, %esp
; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NO-BMI2-NO-SHLD-NEXT:    movups (%edx), %xmm0
; X86-NO-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
; X86-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm1, %xmm1
; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%esp)
; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
; X86-NO-BMI2-NO-SHLD-NEXT:    shrb $3, %dl
; X86-NO-BMI2-NO-SHLD-NEXT:    andb $12, %dl
; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl %dl, %edx
; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp,%edx), %esi
; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%edx), %edx
; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
; X86-NO-BMI2-NO-SHLD-NEXT:    notb %cl
; X86-NO-BMI2-NO-SHLD-NEXT:    addl %edx, %edx
; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edx
; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, (%eax)
; X86-NO-BMI2-NO-SHLD-NEXT:    addl $40, %esp
; X86-NO-BMI2-NO-SHLD-NEXT:    popl %esi
; X86-NO-BMI2-NO-SHLD-NEXT:    retl
;
; X86-SHLD-LABEL: load_4byte_chunk_of_16byte_alloca:
; X86-SHLD:       # %bb.0:
; X86-SHLD-NEXT:    pushl %esi
; X86-SHLD-NEXT:    subl $40, %esp
; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-SHLD-NEXT:    movups (%edx), %xmm0
; X86-SHLD-NEXT:    shll $3, %ecx
; X86-SHLD-NEXT:    xorps %xmm1, %xmm1
; X86-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
; X86-SHLD-NEXT:    movaps %xmm0, (%esp)
; X86-SHLD-NEXT:    movl %ecx, %edx
; X86-SHLD-NEXT:    shrb $3, %dl
; X86-SHLD-NEXT:    andb $12, %dl
; X86-SHLD-NEXT:    movzbl %dl, %edx
; X86-SHLD-NEXT:    movl (%esp,%edx), %esi
; X86-SHLD-NEXT:    movl 4(%esp,%edx), %edx
; X86-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
; X86-SHLD-NEXT:    shrdl %cl, %edx, %esi
; X86-SHLD-NEXT:    movl %esi, (%eax)
; X86-SHLD-NEXT:    addl $40, %esp
; X86-SHLD-NEXT:    popl %esi
; X86-SHLD-NEXT:    retl
;
; X86-HAVE-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_16byte_alloca:
; X86-HAVE-BMI2-NO-SHLD:       # %bb.0:
; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $40, %esp
; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups (%edx), %xmm0
; X86-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm1, %xmm1
; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrb $3, %dl
; X86-HAVE-BMI2-NO-SHLD-NEXT:    andb $12, %dl
; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %dl, %edx
; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, (%esp,%edx), %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %cl
; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%edx), %edx
; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %edx, %edx
; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %edx, %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, (%eax)
; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $40, %esp
; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT:    retl
  ; Load all 16 source bytes; only byte alignment is assumed.
  %init = load <16 x i8>, ptr %src, align 1
  ; Convert the byte offset into a bit count (byteOff * 8).
  %byteOff.numbits = shl nuw nsw i64 %byteOff, 3
  ; Freeze the loaded vector and reinterpret it as one 128-bit integer.
  %intermediate.val.frozen = freeze <16 x i8> %init
  %intermediate.val.frozen.bits = bitcast <16 x i8> %intermediate.val.frozen to i128
  ; Widen the shift amount to i128 and shift the wanted chunk down to bit 0.
  %byteOff.numbits.wide = zext i64 %byteOff.numbits to i128
  %intermediate.val.frozen.bits.positioned = lshr i128 %intermediate.val.frozen.bits, %byteOff.numbits.wide
  ; Keep the low 32 bits and store them to %dst.
  %intermediate.val.frozen.bits.positioned.extracted = trunc i128 %intermediate.val.frozen.bits.positioned to i32
  store i32 %intermediate.val.frozen.bits.positioned.extracted, ptr %dst, align 4
  ret void
}

; Variable-offset extraction of an 8-byte chunk from a 16-byte value: the 16
; bytes at %src are loaded, frozen, viewed as an i128, shifted right by
; %byteOff * 8 bits, truncated to i64 and stored to %dst.
; The assertion lines below are autogenerated (see the NOTE at the top of this
; file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
define void @load_8byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst) nounwind {
; X64-NO-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_16byte_alloca:
; X64-NO-BMI2-NO-SHLD:       # %bb.0:
; X64-NO-BMI2-NO-SHLD-NEXT:    movdqu (%rdi), %xmm0
; X64-NO-BMI2-NO-SHLD-NEXT:    shll $3, %esi
; X64-NO-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-NO-BMI2-NO-SHLD-NEXT:    movq %xmm1, %rax
; X64-NO-BMI2-NO-SHLD-NEXT:    movq %xmm0, %rdi
; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rdi
; X64-NO-BMI2-NO-SHLD-NEXT:    notb %cl
; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%rax,%rax), %r8
; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r8
; X64-NO-BMI2-NO-SHLD-NEXT:    orq %rdi, %r8
; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rax
; X64-NO-BMI2-NO-SHLD-NEXT:    testb $64, %sil
; X64-NO-BMI2-NO-SHLD-NEXT:    cmoveq %r8, %rax
; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rax, (%rdx)
; X64-NO-BMI2-NO-SHLD-NEXT:    retq
;
; X64-NO-BMI2-HAVE-SHLD-LABEL: load_8byte_chunk_of_16byte_alloca:
; X64-NO-BMI2-HAVE-SHLD:       # %bb.0:
; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rsi, %rcx
; X64-NO-BMI2-HAVE-SHLD-NEXT:    movdqu (%rdi), %xmm0
; X64-NO-BMI2-HAVE-SHLD-NEXT:    shll $3, %ecx
; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rax
; X64-NO-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rsi
; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rsi, %rax
; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %cl, %rsi
; X64-NO-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
; X64-NO-BMI2-HAVE-SHLD-NEXT:    cmoveq %rax, %rsi
; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rsi, (%rdx)
; X64-NO-BMI2-HAVE-SHLD-NEXT:    retq
;
; X64-HAVE-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_16byte_alloca:
; X64-HAVE-BMI2-NO-SHLD:       # %bb.0:
; X64-HAVE-BMI2-NO-SHLD-NEXT:    movdqu (%rdi), %xmm0
; X64-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %esi
; X64-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %xmm1, %rax
; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %xmm0, %rcx
; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rsi, %rcx, %rcx
; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %edi
; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %dil
; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%rax,%rax), %r8
; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rdi, %r8, %rdi
; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rcx, %rdi
; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rsi, %rax, %rax
; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb $64, %sil
; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmoveq %rdi, %rax
; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rax, (%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT:    retq
;
; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_8byte_chunk_of_16byte_alloca:
; X64-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rsi, %rcx
; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movdqu (%rdi), %xmm0
; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shll $3, %ecx
; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rax
; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %xmm0, %rsi
; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rsi, %rax
; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rcx, %rsi, %rsi
; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    testb $64, %cl
; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    cmoveq %rax, %rsi
; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rsi, (%rdx)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    retq
;
; X86-NO-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_16byte_alloca:
; X86-NO-BMI2-NO-SHLD:       # %bb.0:
; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %ebp
; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %ebx
; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
; X86-NO-BMI2-NO-SHLD-NEXT:    subl $44, %esp
; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NO-BMI2-NO-SHLD-NEXT:    movups (%ecx), %xmm0
; X86-NO-BMI2-NO-SHLD-NEXT:    shll $3, %eax
; X86-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm1, %xmm1
; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%esp)
; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
; X86-NO-BMI2-NO-SHLD-NEXT:    shrb $3, %cl
; X86-NO-BMI2-NO-SHLD-NEXT:    andb $12, %cl
; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl %cl, %ebx
; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp,%ebx), %ebp
; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%ebx), %esi
; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
; X86-NO-BMI2-NO-SHLD-NEXT:    notb %cl
; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %edi
; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebp, %edi
; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
; X86-NO-BMI2-NO-SHLD-NEXT:    andb $24, %al
; X86-NO-BMI2-NO-SHLD-NEXT:    notb %al
; X86-NO-BMI2-NO-SHLD-NEXT:    movl 8(%esp,%ebx), %ebx
; X86-NO-BMI2-NO-SHLD-NEXT:    addl %ebx, %ebx
; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %ebx
; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, 4(%edx)
; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, (%edx)
; X86-NO-BMI2-NO-SHLD-NEXT:    addl $44, %esp
; X86-NO-BMI2-NO-SHLD-NEXT:    popl %esi
; X86-NO-BMI2-NO-SHLD-NEXT:    popl %edi
; X86-NO-BMI2-NO-SHLD-NEXT:    popl %ebx
; X86-NO-BMI2-NO-SHLD-NEXT:    popl %ebp
; X86-NO-BMI2-NO-SHLD-NEXT:    retl
;
; X86-SHLD-LABEL: load_8byte_chunk_of_16byte_alloca:
; X86-SHLD:       # %bb.0:
; X86-SHLD-NEXT:    pushl %ebx
; X86-SHLD-NEXT:    pushl %edi
; X86-SHLD-NEXT:    pushl %esi
; X86-SHLD-NEXT:    subl $32, %esp
; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-SHLD-NEXT:    movups (%edx), %xmm0
; X86-SHLD-NEXT:    shll $3, %ecx
; X86-SHLD-NEXT:    xorps %xmm1, %xmm1
; X86-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
; X86-SHLD-NEXT:    movaps %xmm0, (%esp)
; X86-SHLD-NEXT:    movl %ecx, %edx
; X86-SHLD-NEXT:    shrb $3, %dl
; X86-SHLD-NEXT:    andb $12, %dl
; X86-SHLD-NEXT:    movzbl %dl, %edx
; X86-SHLD-NEXT:    movl 8(%esp,%edx), %esi
; X86-SHLD-NEXT:    movl (%esp,%edx), %edi
; X86-SHLD-NEXT:    movl 4(%esp,%edx), %edx
; X86-SHLD-NEXT:    movl %edx, %ebx
; X86-SHLD-NEXT:    shrdl %cl, %esi, %ebx
; X86-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
; X86-SHLD-NEXT:    shrdl %cl, %edx, %edi
; X86-SHLD-NEXT:    movl %ebx, 4(%eax)
; X86-SHLD-NEXT:    movl %edi, (%eax)
; X86-SHLD-NEXT:    addl $32, %esp
; X86-SHLD-NEXT:    popl %esi
; X86-SHLD-NEXT:    popl %edi
; X86-SHLD-NEXT:    popl %ebx
; X86-SHLD-NEXT:    retl
;
; X86-HAVE-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_16byte_alloca:
; X86-HAVE-BMI2-NO-SHLD:       # %bb.0:
; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebp
; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $44, %esp
; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups (%edx), %xmm0
; X86-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm1, %xmm1
; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrb $3, %dl
; X86-HAVE-BMI2-NO-SHLD-NEXT:    andb $12, %dl
; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %dl, %edx
; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, (%esp,%edx), %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %bl
; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%edx), %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 8(%esp,%edx), %ebp
; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%edi,%edi), %edx
; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %edx, %edx
; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %edx
; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %edi, %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT:    andb $24, %cl
; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %cl
; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %ebp, %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 4(%eax)
; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, (%eax)
; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $44, %esp
; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebp
; X86-HAVE-BMI2-NO-SHLD-NEXT:    retl
  ; Load all 16 source bytes; only byte alignment is assumed.
  %init = load <16 x i8>, ptr %src, align 1
  ; Convert the byte offset into a bit count (byteOff * 8).
  %byteOff.numbits = shl nuw nsw i64 %byteOff, 3
  ; Freeze the loaded vector and reinterpret it as one 128-bit integer.
  %intermediate.val.frozen = freeze <16 x i8> %init
  %intermediate.val.frozen.bits = bitcast <16 x i8> %intermediate.val.frozen to i128
  ; Widen the shift amount to i128 and shift the wanted chunk down to bit 0.
  %byteOff.numbits.wide = zext i64 %byteOff.numbits to i128
  %intermediate.val.frozen.bits.positioned = lshr i128 %intermediate.val.frozen.bits, %byteOff.numbits.wide
  ; Keep the low 64 bits and store them to %dst.
  %intermediate.val.frozen.bits.positioned.extracted = trunc i128 %intermediate.val.frozen.bits.positioned to i64
  store i64 %intermediate.val.frozen.bits.positioned.extracted, ptr %dst, align 8
  ret void
}

; no @load_16byte_chunk_of_16byte_alloca

; Variable-offset extraction of a 1-byte chunk from a 32-byte value: the 32
; bytes at %src are loaded, frozen, viewed as an i256, shifted right by
; %byteOff * 8 bits, truncated to i8 and stored to %dst as a <1 x i8> vector.
; The assertion lines below are autogenerated (see the NOTE at the top of this
; file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
define void @load_1byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst) nounwind {
; X64-NO-BMI2-LABEL: load_1byte_chunk_of_32byte_alloca:
; X64-NO-BMI2:       # %bb.0:
; X64-NO-BMI2-NEXT:    movups (%rdi), %xmm0
; X64-NO-BMI2-NEXT:    movups 16(%rdi), %xmm1
; X64-NO-BMI2-NEXT:    leal (,%rsi,8), %ecx
; X64-NO-BMI2-NEXT:    xorps %xmm2, %xmm2
; X64-NO-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-NEXT:    movl %ecx, %eax
; X64-NO-BMI2-NEXT:    shrb $6, %al
; X64-NO-BMI2-NEXT:    movzbl %al, %eax
; X64-NO-BMI2-NEXT:    movq -72(%rsp,%rax,8), %rax
; X64-NO-BMI2-NEXT:    # kill: def $cl killed $cl killed $ecx
; X64-NO-BMI2-NEXT:    shrq %cl, %rax
; X64-NO-BMI2-NEXT:    movb %al, (%rdx)
; X64-NO-BMI2-NEXT:    retq
;
; X64-BMI2-LABEL: load_1byte_chunk_of_32byte_alloca:
; X64-BMI2:       # %bb.0:
; X64-BMI2-NEXT:    movups (%rdi), %xmm0
; X64-BMI2-NEXT:    movups 16(%rdi), %xmm1
; X64-BMI2-NEXT:    shll $3, %esi
; X64-BMI2-NEXT:    xorps %xmm2, %xmm2
; X64-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
; X64-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
; X64-BMI2-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
; X64-BMI2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-BMI2-NEXT:    movl %esi, %eax
; X64-BMI2-NEXT:    shrb $6, %al
; X64-BMI2-NEXT:    movzbl %al, %eax
; X64-BMI2-NEXT:    shrxq %rsi, -72(%rsp,%rax,8), %rax
; X64-BMI2-NEXT:    movb %al, (%rdx)
; X64-BMI2-NEXT:    retq
;
; X86-NO-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_32byte_alloca:
; X86-NO-BMI2-NO-SHLD:       # %bb.0:
; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
; X86-NO-BMI2-NO-SHLD-NEXT:    subl $72, %esp
; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NO-BMI2-NO-SHLD-NEXT:    movups (%edx), %xmm0
; X86-NO-BMI2-NO-SHLD-NEXT:    movups 16(%edx), %xmm1
; X86-NO-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
; X86-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm2, %xmm2
; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%esp)
; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
; X86-NO-BMI2-NO-SHLD-NEXT:    shrb $5, %dl
; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl %dl, %edx
; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp,%edx,4), %esi
; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%edx,4), %edx
; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
; X86-NO-BMI2-NO-SHLD-NEXT:    notb %cl
; X86-NO-BMI2-NO-SHLD-NEXT:    addl %edx, %edx
; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edx
; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, (%eax)
; X86-NO-BMI2-NO-SHLD-NEXT:    addl $72, %esp
; X86-NO-BMI2-NO-SHLD-NEXT:    popl %esi
; X86-NO-BMI2-NO-SHLD-NEXT:    retl
;
; X86-SHLD-LABEL: load_1byte_chunk_of_32byte_alloca:
; X86-SHLD:       # %bb.0:
; X86-SHLD-NEXT:    pushl %ebx
; X86-SHLD-NEXT:    subl $72, %esp
; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-SHLD-NEXT:    movups (%edx), %xmm0
; X86-SHLD-NEXT:    movups 16(%edx), %xmm1
; X86-SHLD-NEXT:    shll $3, %ecx
; X86-SHLD-NEXT:    xorps %xmm2, %xmm2
; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
; X86-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
; X86-SHLD-NEXT:    movaps %xmm0, (%esp)
; X86-SHLD-NEXT:    movl %ecx, %edx
; X86-SHLD-NEXT:    shrb $5, %dl
; X86-SHLD-NEXT:    movzbl %dl, %edx
; X86-SHLD-NEXT:    movl (%esp,%edx,4), %ebx
; X86-SHLD-NEXT:    movl 4(%esp,%edx,4), %edx
; X86-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
; X86-SHLD-NEXT:    shrdl %cl, %edx, %ebx
; X86-SHLD-NEXT:    movb %bl, (%eax)
; X86-SHLD-NEXT:    addl $72, %esp
; X86-SHLD-NEXT:    popl %ebx
; X86-SHLD-NEXT:    retl
;
; X86-HAVE-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_32byte_alloca:
; X86-HAVE-BMI2-NO-SHLD:       # %bb.0:
; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $72, %esp
; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups (%edx), %xmm0
; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups 16(%edx), %xmm1
; X86-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm2, %xmm2
; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrb $5, %dl
; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %dl, %edx
; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, (%esp,%edx,4), %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %cl
; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%edx,4), %edx
; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %edx, %edx
; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %edx, %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT:    movb %cl, (%eax)
; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $72, %esp
; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT:    retl
  ; Load all 32 source bytes; only byte alignment is assumed.
  %init = load <32 x i8>, ptr %src, align 1
  ; Convert the byte offset into a bit count (byteOff * 8).
  %byteOff.numbits = shl nuw nsw i64 %byteOff, 3
  ; Freeze the loaded vector and reinterpret it as one 256-bit integer.
  %intermediate.val.frozen = freeze <32 x i8> %init
  %intermediate.val.frozen.bits = bitcast <32 x i8> %intermediate.val.frozen to i256
  ; Widen the shift amount to i256 and shift the wanted chunk down to bit 0.
  %byteOff.numbits.wide = zext i64 %byteOff.numbits to i256
  %intermediate.val.frozen.bits.positioned = lshr i256 %intermediate.val.frozen.bits, %byteOff.numbits.wide
  ; Keep the low 8 bits, wrap them in a single-element vector and store it to %dst.
  %intermediate.val.frozen.bits.positioned.extracted = trunc i256 %intermediate.val.frozen.bits.positioned to i8
  %1 = insertelement <1 x i8> poison, i8 %intermediate.val.frozen.bits.positioned.extracted, i64 0
  store <1 x i8> %1, ptr %dst, align 1
  ret void
}

; Extracts a 2-byte chunk at a variable byte offset from a 32-byte value.
; The IR loads <32 x i8> from %src (align 1), freezes it, reinterprets it as an
; i256, logically shifts it right by 8 * %byteOff bits, truncates the result to
; i16 and stores that to %dst (align 2).
; The assertions below are autogenerated and pin the exact llc output for each
; RUN configuration, so they should be regenerated with the update script
; rather than edited by hand.
; NOTE(review): no alloca appears in this function despite its name, so the IR
; presumably models the output of an earlier transform - confirm.
define void @load_2byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst) nounwind {
; X64-NO-BMI2-LABEL: load_2byte_chunk_of_32byte_alloca:
; X64-NO-BMI2:       # %bb.0:
; X64-NO-BMI2-NEXT:    movups (%rdi), %xmm0
; X64-NO-BMI2-NEXT:    movups 16(%rdi), %xmm1
; X64-NO-BMI2-NEXT:    leal (,%rsi,8), %ecx
; X64-NO-BMI2-NEXT:    xorps %xmm2, %xmm2
; X64-NO-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-NEXT:    movl %ecx, %eax
; X64-NO-BMI2-NEXT:    shrb $6, %al
; X64-NO-BMI2-NEXT:    movzbl %al, %eax
; X64-NO-BMI2-NEXT:    movq -72(%rsp,%rax,8), %rsi
; X64-NO-BMI2-NEXT:    shrq %cl, %rsi
; X64-NO-BMI2-NEXT:    movl -64(%rsp,%rax,8), %eax
; X64-NO-BMI2-NEXT:    addl %eax, %eax
; X64-NO-BMI2-NEXT:    andb $56, %cl
; X64-NO-BMI2-NEXT:    notb %cl
; X64-NO-BMI2-NEXT:    # kill: def $cl killed $cl killed $ecx
; X64-NO-BMI2-NEXT:    shlq %cl, %rax
; X64-NO-BMI2-NEXT:    orl %esi, %eax
; X64-NO-BMI2-NEXT:    movw %ax, (%rdx)
; X64-NO-BMI2-NEXT:    retq
;
; X64-BMI2-LABEL: load_2byte_chunk_of_32byte_alloca:
; X64-BMI2:       # %bb.0:
; X64-BMI2-NEXT:    movups (%rdi), %xmm0
; X64-BMI2-NEXT:    movups 16(%rdi), %xmm1
; X64-BMI2-NEXT:    shll $3, %esi
; X64-BMI2-NEXT:    xorps %xmm2, %xmm2
; X64-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
; X64-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
; X64-BMI2-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
; X64-BMI2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-BMI2-NEXT:    movl %esi, %eax
; X64-BMI2-NEXT:    shrb $6, %al
; X64-BMI2-NEXT:    movzbl %al, %eax
; X64-BMI2-NEXT:    shrxq %rsi, -72(%rsp,%rax,8), %rcx
; X64-BMI2-NEXT:    # kill: def $sil killed $sil killed $rsi def $rsi
; X64-BMI2-NEXT:    andb $56, %sil
; X64-BMI2-NEXT:    notb %sil
; X64-BMI2-NEXT:    movl -64(%rsp,%rax,8), %eax
; X64-BMI2-NEXT:    addl %eax, %eax
; X64-BMI2-NEXT:    shlxq %rsi, %rax, %rax
; X64-BMI2-NEXT:    orl %eax, %ecx
; X64-BMI2-NEXT:    movw %cx, (%rdx)
; X64-BMI2-NEXT:    retq
;
; X86-NO-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_32byte_alloca:
; X86-NO-BMI2-NO-SHLD:       # %bb.0:
; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
; X86-NO-BMI2-NO-SHLD-NEXT:    subl $72, %esp
; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NO-BMI2-NO-SHLD-NEXT:    movups (%edx), %xmm0
; X86-NO-BMI2-NO-SHLD-NEXT:    movups 16(%edx), %xmm1
; X86-NO-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
; X86-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm2, %xmm2
; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%esp)
; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
; X86-NO-BMI2-NO-SHLD-NEXT:    shrb $5, %dl
; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl %dl, %edx
; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp,%edx,4), %esi
; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%edx,4), %edx
; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
; X86-NO-BMI2-NO-SHLD-NEXT:    notb %cl
; X86-NO-BMI2-NO-SHLD-NEXT:    addl %edx, %edx
; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edx
; X86-NO-BMI2-NO-SHLD-NEXT:    movw %dx, (%eax)
; X86-NO-BMI2-NO-SHLD-NEXT:    addl $72, %esp
; X86-NO-BMI2-NO-SHLD-NEXT:    popl %esi
; X86-NO-BMI2-NO-SHLD-NEXT:    retl
;
; X86-SHLD-LABEL: load_2byte_chunk_of_32byte_alloca:
; X86-SHLD:       # %bb.0:
; X86-SHLD-NEXT:    pushl %esi
; X86-SHLD-NEXT:    subl $72, %esp
; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-SHLD-NEXT:    movups (%edx), %xmm0
; X86-SHLD-NEXT:    movups 16(%edx), %xmm1
; X86-SHLD-NEXT:    shll $3, %ecx
; X86-SHLD-NEXT:    xorps %xmm2, %xmm2
; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
; X86-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
; X86-SHLD-NEXT:    movaps %xmm0, (%esp)
; X86-SHLD-NEXT:    movl %ecx, %edx
; X86-SHLD-NEXT:    shrb $5, %dl
; X86-SHLD-NEXT:    movzbl %dl, %edx
; X86-SHLD-NEXT:    movl (%esp,%edx,4), %esi
; X86-SHLD-NEXT:    movl 4(%esp,%edx,4), %edx
; X86-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
; X86-SHLD-NEXT:    shrdl %cl, %edx, %esi
; X86-SHLD-NEXT:    movw %si, (%eax)
; X86-SHLD-NEXT:    addl $72, %esp
; X86-SHLD-NEXT:    popl %esi
; X86-SHLD-NEXT:    retl
;
; X86-HAVE-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_32byte_alloca:
; X86-HAVE-BMI2-NO-SHLD:       # %bb.0:
; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $72, %esp
; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups (%edx), %xmm0
; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups 16(%edx), %xmm1
; X86-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm2, %xmm2
; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrb $5, %dl
; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %dl, %edx
; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, (%esp,%edx,4), %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %cl
; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%edx,4), %edx
; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %edx, %edx
; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %edx, %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT:    movw %cx, (%eax)
; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $72, %esp
; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT:    retl
  ; Read all 32 source bytes with no alignment assumption beyond 1.
  %init = load <32 x i8>, ptr %src, align 1
  ; Convert the byte offset into a bit count (multiply by 8).
  %byteOff.numbits = shl nuw nsw i64 %byteOff, 3
  ; Freeze the loaded vector before it is reinterpreted as a single wide integer.
  %intermediate.val.frozen = freeze <32 x i8> %init
  %intermediate.val.frozen.bits = bitcast <32 x i8> %intermediate.val.frozen to i256
  ; Widen the shift amount to match the i256 operand, then shift the wanted
  ; bytes down to bit 0 and keep only the low 16 bits.
  %byteOff.numbits.wide = zext i64 %byteOff.numbits to i256
  %intermediate.val.frozen.bits.positioned = lshr i256 %intermediate.val.frozen.bits, %byteOff.numbits.wide
  %intermediate.val.frozen.bits.positioned.extracted = trunc i256 %intermediate.val.frozen.bits.positioned to i16
  store i16 %intermediate.val.frozen.bits.positioned.extracted, ptr %dst, align 2
  ret void
}

; Extracts a 4-byte chunk at a variable byte offset from a 32-byte value.
; The IR loads <32 x i8> from %src (align 1), freezes it, reinterprets it as an
; i256, logically shifts it right by 8 * %byteOff bits, truncates the result to
; i32 and stores that to %dst (align 4).
; The assertions below are autogenerated and pin the exact llc output for each
; RUN configuration, so they should be regenerated with the update script
; rather than edited by hand.
; NOTE(review): no alloca appears in this function despite its name, so the IR
; presumably models the output of an earlier transform - confirm.
define void @load_4byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst) nounwind {
; X64-NO-BMI2-LABEL: load_4byte_chunk_of_32byte_alloca:
; X64-NO-BMI2:       # %bb.0:
; X64-NO-BMI2-NEXT:    movups (%rdi), %xmm0
; X64-NO-BMI2-NEXT:    movups 16(%rdi), %xmm1
; X64-NO-BMI2-NEXT:    leal (,%rsi,8), %ecx
; X64-NO-BMI2-NEXT:    xorps %xmm2, %xmm2
; X64-NO-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-NEXT:    movl %ecx, %eax
; X64-NO-BMI2-NEXT:    shrb $6, %al
; X64-NO-BMI2-NEXT:    movzbl %al, %eax
; X64-NO-BMI2-NEXT:    movq -72(%rsp,%rax,8), %rsi
; X64-NO-BMI2-NEXT:    shrq %cl, %rsi
; X64-NO-BMI2-NEXT:    movl -64(%rsp,%rax,8), %eax
; X64-NO-BMI2-NEXT:    addl %eax, %eax
; X64-NO-BMI2-NEXT:    andb $56, %cl
; X64-NO-BMI2-NEXT:    notb %cl
; X64-NO-BMI2-NEXT:    # kill: def $cl killed $cl killed $ecx
; X64-NO-BMI2-NEXT:    shlq %cl, %rax
; X64-NO-BMI2-NEXT:    orl %esi, %eax
; X64-NO-BMI2-NEXT:    movl %eax, (%rdx)
; X64-NO-BMI2-NEXT:    retq
;
; X64-BMI2-LABEL: load_4byte_chunk_of_32byte_alloca:
; X64-BMI2:       # %bb.0:
; X64-BMI2-NEXT:    movups (%rdi), %xmm0
; X64-BMI2-NEXT:    movups 16(%rdi), %xmm1
; X64-BMI2-NEXT:    shll $3, %esi
; X64-BMI2-NEXT:    xorps %xmm2, %xmm2
; X64-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
; X64-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
; X64-BMI2-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
; X64-BMI2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-BMI2-NEXT:    movl %esi, %eax
; X64-BMI2-NEXT:    shrb $6, %al
; X64-BMI2-NEXT:    movzbl %al, %eax
; X64-BMI2-NEXT:    shrxq %rsi, -72(%rsp,%rax,8), %rcx
; X64-BMI2-NEXT:    # kill: def $sil killed $sil killed $rsi def $rsi
; X64-BMI2-NEXT:    andb $56, %sil
; X64-BMI2-NEXT:    notb %sil
; X64-BMI2-NEXT:    movl -64(%rsp,%rax,8), %eax
; X64-BMI2-NEXT:    addl %eax, %eax
; X64-BMI2-NEXT:    shlxq %rsi, %rax, %rax
; X64-BMI2-NEXT:    orl %eax, %ecx
; X64-BMI2-NEXT:    movl %ecx, (%rdx)
; X64-BMI2-NEXT:    retq
;
; X86-NO-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_32byte_alloca:
; X86-NO-BMI2-NO-SHLD:       # %bb.0:
; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
; X86-NO-BMI2-NO-SHLD-NEXT:    subl $72, %esp
; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NO-BMI2-NO-SHLD-NEXT:    movups (%edx), %xmm0
; X86-NO-BMI2-NO-SHLD-NEXT:    movups 16(%edx), %xmm1
; X86-NO-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
; X86-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm2, %xmm2
; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%esp)
; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
; X86-NO-BMI2-NO-SHLD-NEXT:    shrb $5, %dl
; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl %dl, %edx
; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp,%edx,4), %esi
; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%edx,4), %edx
; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
; X86-NO-BMI2-NO-SHLD-NEXT:    notb %cl
; X86-NO-BMI2-NO-SHLD-NEXT:    addl %edx, %edx
; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edx
; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, (%eax)
; X86-NO-BMI2-NO-SHLD-NEXT:    addl $72, %esp
; X86-NO-BMI2-NO-SHLD-NEXT:    popl %esi
; X86-NO-BMI2-NO-SHLD-NEXT:    retl
;
; X86-SHLD-LABEL: load_4byte_chunk_of_32byte_alloca:
; X86-SHLD:       # %bb.0:
; X86-SHLD-NEXT:    pushl %esi
; X86-SHLD-NEXT:    subl $72, %esp
; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-SHLD-NEXT:    movups (%edx), %xmm0
; X86-SHLD-NEXT:    movups 16(%edx), %xmm1
; X86-SHLD-NEXT:    shll $3, %ecx
; X86-SHLD-NEXT:    xorps %xmm2, %xmm2
; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
; X86-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
; X86-SHLD-NEXT:    movaps %xmm0, (%esp)
; X86-SHLD-NEXT:    movl %ecx, %edx
; X86-SHLD-NEXT:    shrb $5, %dl
; X86-SHLD-NEXT:    movzbl %dl, %edx
; X86-SHLD-NEXT:    movl (%esp,%edx,4), %esi
; X86-SHLD-NEXT:    movl 4(%esp,%edx,4), %edx
; X86-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
; X86-SHLD-NEXT:    shrdl %cl, %edx, %esi
; X86-SHLD-NEXT:    movl %esi, (%eax)
; X86-SHLD-NEXT:    addl $72, %esp
; X86-SHLD-NEXT:    popl %esi
; X86-SHLD-NEXT:    retl
;
; X86-HAVE-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_32byte_alloca:
; X86-HAVE-BMI2-NO-SHLD:       # %bb.0:
; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $72, %esp
; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups (%edx), %xmm0
; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups 16(%edx), %xmm1
; X86-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm2, %xmm2
; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrb $5, %dl
; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %dl, %edx
; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, (%esp,%edx,4), %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %cl
; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%edx,4), %edx
; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %edx, %edx
; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %edx, %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, (%eax)
; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $72, %esp
; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT:    retl
  ; Read all 32 source bytes with no alignment assumption beyond 1.
  %init = load <32 x i8>, ptr %src, align 1
  ; Convert the byte offset into a bit count (multiply by 8).
  %byteOff.numbits = shl nuw nsw i64 %byteOff, 3
  ; Freeze the loaded vector before it is reinterpreted as a single wide integer.
  %intermediate.val.frozen = freeze <32 x i8> %init
  %intermediate.val.frozen.bits = bitcast <32 x i8> %intermediate.val.frozen to i256
  ; Widen the shift amount to match the i256 operand, then shift the wanted
  ; bytes down to bit 0 and keep only the low 32 bits.
  %byteOff.numbits.wide = zext i64 %byteOff.numbits to i256
  %intermediate.val.frozen.bits.positioned = lshr i256 %intermediate.val.frozen.bits, %byteOff.numbits.wide
  %intermediate.val.frozen.bits.positioned.extracted = trunc i256 %intermediate.val.frozen.bits.positioned to i32
  store i32 %intermediate.val.frozen.bits.positioned.extracted, ptr %dst, align 4
  ret void
}

; Extracts an 8-byte chunk at a variable byte offset from a 32-byte value.
; The IR loads <32 x i8> from %src (align 1), freezes it, reinterprets it as an
; i256, logically shifts it right by 8 * %byteOff bits, truncates the result to
; i64 and stores that to %dst (align 8).
; The assertions below are autogenerated and pin the exact llc output for each
; RUN configuration, so they should be regenerated with the update script
; rather than edited by hand. In the i686 expectations the 64-bit result is
; written as two 32-bit stores, at offsets 0 and 4 from the destination pointer.
; NOTE(review): no alloca appears in this function despite its name, so the IR
; presumably models the output of an earlier transform - confirm.
define void @load_8byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst) nounwind {
; X64-NO-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_32byte_alloca:
; X64-NO-BMI2-NO-SHLD:       # %bb.0:
; X64-NO-BMI2-NO-SHLD-NEXT:    movups (%rdi), %xmm0
; X64-NO-BMI2-NO-SHLD-NEXT:    movups 16(%rdi), %xmm1
; X64-NO-BMI2-NO-SHLD-NEXT:    leal (,%rsi,8), %ecx
; X64-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm2, %xmm2
; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %eax
; X64-NO-BMI2-NO-SHLD-NEXT:    shrb $6, %al
; X64-NO-BMI2-NO-SHLD-NEXT:    movzbl %al, %eax
; X64-NO-BMI2-NO-SHLD-NEXT:    movq -72(%rsp,%rax,8), %rsi
; X64-NO-BMI2-NO-SHLD-NEXT:    movq -64(%rsp,%rax,8), %rax
; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rsi
; X64-NO-BMI2-NO-SHLD-NEXT:    notb %cl
; X64-NO-BMI2-NO-SHLD-NEXT:    addq %rax, %rax
; X64-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %rax
; X64-NO-BMI2-NO-SHLD-NEXT:    orq %rsi, %rax
; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rax, (%rdx)
; X64-NO-BMI2-NO-SHLD-NEXT:    retq
;
; X64-SHLD-LABEL: load_8byte_chunk_of_32byte_alloca:
; X64-SHLD:       # %bb.0:
; X64-SHLD-NEXT:    movups (%rdi), %xmm0
; X64-SHLD-NEXT:    movups 16(%rdi), %xmm1
; X64-SHLD-NEXT:    leal (,%rsi,8), %ecx
; X64-SHLD-NEXT:    xorps %xmm2, %xmm2
; X64-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
; X64-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
; X64-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
; X64-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-SHLD-NEXT:    movl %ecx, %eax
; X64-SHLD-NEXT:    shrb $6, %al
; X64-SHLD-NEXT:    movzbl %al, %eax
; X64-SHLD-NEXT:    movq -72(%rsp,%rax,8), %rsi
; X64-SHLD-NEXT:    movq -64(%rsp,%rax,8), %rax
; X64-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
; X64-SHLD-NEXT:    shrdq %cl, %rax, %rsi
; X64-SHLD-NEXT:    movq %rsi, (%rdx)
; X64-SHLD-NEXT:    retq
;
; X64-HAVE-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_32byte_alloca:
; X64-HAVE-BMI2-NO-SHLD:       # %bb.0:
; X64-HAVE-BMI2-NO-SHLD-NEXT:    movups (%rdi), %xmm0
; X64-HAVE-BMI2-NO-SHLD-NEXT:    movups 16(%rdi), %xmm1
; X64-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %esi
; X64-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm2, %xmm2
; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %eax
; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrb $6, %al
; X64-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %al, %eax
; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rsi, -72(%rsp,%rax,8), %rcx
; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %sil
; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -64(%rsp,%rax,8), %rax
; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %rax, %rax
; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rsi, %rax, %rax
; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rcx, %rax
; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rax, (%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT:    retq
;
; X86-NO-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_32byte_alloca:
; X86-NO-BMI2-NO-SHLD:       # %bb.0:
; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %ebp
; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %ebx
; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
; X86-NO-BMI2-NO-SHLD-NEXT:    subl $76, %esp
; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NO-BMI2-NO-SHLD-NEXT:    movups (%ecx), %xmm0
; X86-NO-BMI2-NO-SHLD-NEXT:    movups 16(%ecx), %xmm1
; X86-NO-BMI2-NO-SHLD-NEXT:    shll $3, %eax
; X86-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm2, %xmm2
; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%esp)
; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
; X86-NO-BMI2-NO-SHLD-NEXT:    shrb $5, %cl
; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl %cl, %ebx
; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp,%ebx,4), %ebp
; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%ebx,4), %esi
; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
; X86-NO-BMI2-NO-SHLD-NEXT:    notb %cl
; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %edi
; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebp, %edi
; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
; X86-NO-BMI2-NO-SHLD-NEXT:    andb $24, %al
; X86-NO-BMI2-NO-SHLD-NEXT:    notb %al
; X86-NO-BMI2-NO-SHLD-NEXT:    movl 8(%esp,%ebx,4), %ebx
; X86-NO-BMI2-NO-SHLD-NEXT:    addl %ebx, %ebx
; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %ebx
; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, 4(%edx)
; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, (%edx)
; X86-NO-BMI2-NO-SHLD-NEXT:    addl $76, %esp
; X86-NO-BMI2-NO-SHLD-NEXT:    popl %esi
; X86-NO-BMI2-NO-SHLD-NEXT:    popl %edi
; X86-NO-BMI2-NO-SHLD-NEXT:    popl %ebx
; X86-NO-BMI2-NO-SHLD-NEXT:    popl %ebp
; X86-NO-BMI2-NO-SHLD-NEXT:    retl
;
; X86-SHLD-LABEL: load_8byte_chunk_of_32byte_alloca:
; X86-SHLD:       # %bb.0:
; X86-SHLD-NEXT:    pushl %ebx
; X86-SHLD-NEXT:    pushl %edi
; X86-SHLD-NEXT:    pushl %esi
; X86-SHLD-NEXT:    subl $64, %esp
; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-SHLD-NEXT:    movups (%edx), %xmm0
; X86-SHLD-NEXT:    movups 16(%edx), %xmm1
; X86-SHLD-NEXT:    shll $3, %ecx
; X86-SHLD-NEXT:    xorps %xmm2, %xmm2
; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
; X86-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
; X86-SHLD-NEXT:    movaps %xmm0, (%esp)
; X86-SHLD-NEXT:    movl %ecx, %edx
; X86-SHLD-NEXT:    shrb $5, %dl
; X86-SHLD-NEXT:    movzbl %dl, %edx
; X86-SHLD-NEXT:    movl 8(%esp,%edx,4), %esi
; X86-SHLD-NEXT:    movl (%esp,%edx,4), %edi
; X86-SHLD-NEXT:    movl 4(%esp,%edx,4), %edx
; X86-SHLD-NEXT:    movl %edx, %ebx
; X86-SHLD-NEXT:    shrdl %cl, %esi, %ebx
; X86-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
; X86-SHLD-NEXT:    shrdl %cl, %edx, %edi
; X86-SHLD-NEXT:    movl %ebx, 4(%eax)
; X86-SHLD-NEXT:    movl %edi, (%eax)
; X86-SHLD-NEXT:    addl $64, %esp
; X86-SHLD-NEXT:    popl %esi
; X86-SHLD-NEXT:    popl %edi
; X86-SHLD-NEXT:    popl %ebx
; X86-SHLD-NEXT:    retl
;
; X86-HAVE-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_32byte_alloca:
; X86-HAVE-BMI2-NO-SHLD:       # %bb.0:
; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebp
; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $76, %esp
; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups (%edx), %xmm0
; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups 16(%edx), %xmm1
; X86-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm2, %xmm2
; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrb $5, %dl
; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %dl, %edx
; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, (%esp,%edx,4), %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %bl
; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%edx,4), %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 8(%esp,%edx,4), %ebp
; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%edi,%edi), %edx
; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %edx, %edx
; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %edx
; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %edi, %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT:    andb $24, %cl
; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %cl
; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %ebp, %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 4(%eax)
; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, (%eax)
; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $76, %esp
; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebp
; X86-HAVE-BMI2-NO-SHLD-NEXT:    retl
  ; Read all 32 source bytes with no alignment assumption beyond 1.
  %init = load <32 x i8>, ptr %src, align 1
  ; Convert the byte offset into a bit count (multiply by 8).
  %byteOff.numbits = shl nuw nsw i64 %byteOff, 3
  ; Freeze the loaded vector before it is reinterpreted as a single wide integer.
  %intermediate.val.frozen = freeze <32 x i8> %init
  %intermediate.val.frozen.bits = bitcast <32 x i8> %intermediate.val.frozen to i256
  ; Widen the shift amount to match the i256 operand, then shift the wanted
  ; bytes down to bit 0 and keep only the low 64 bits.
  %byteOff.numbits.wide = zext i64 %byteOff.numbits to i256
  %intermediate.val.frozen.bits.positioned = lshr i256 %intermediate.val.frozen.bits, %byteOff.numbits.wide
  %intermediate.val.frozen.bits.positioned.extracted = trunc i256 %intermediate.val.frozen.bits.positioned to i64
  store i64 %intermediate.val.frozen.bits.positioned.extracted, ptr %dst, align 8
  ret void
}

define void @load_16byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst) nounwind {
; X64-NO-BMI2-NO-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca:
; X64-NO-BMI2-NO-SHLD:       # %bb.0:
; X64-NO-BMI2-NO-SHLD-NEXT:    movups (%rdi), %xmm0
; X64-NO-BMI2-NO-SHLD-NEXT:    movups 16(%rdi), %xmm1
; X64-NO-BMI2-NO-SHLD-NEXT:    leal (,%rsi,8), %eax
; X64-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm2, %xmm2
; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
; X64-NO-BMI2-NO-SHLD-NEXT:    shrb $6, %cl
; X64-NO-BMI2-NO-SHLD-NEXT:    movzbl %cl, %edi
; X64-NO-BMI2-NO-SHLD-NEXT:    movq -72(%rsp,%rdi,8), %r8
; X64-NO-BMI2-NO-SHLD-NEXT:    movq -64(%rsp,%rdi,8), %r9
; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r8
; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %esi
; X64-NO-BMI2-NO-SHLD-NEXT:    notb %sil
; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%r9,%r9), %r10
; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r10
; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r8, %r10
; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r9
; X64-NO-BMI2-NO-SHLD-NEXT:    movq -56(%rsp,%rdi,8), %rax
; X64-NO-BMI2-NO-SHLD-NEXT:    addq %rax, %rax
; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %rax
; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r9, %rax
; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rax, 8(%rdx)
; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r10, (%rdx)
; X64-NO-BMI2-NO-SHLD-NEXT:    retq
;
; X64-NO-BMI2-HAVE-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca:
; X64-NO-BMI2-HAVE-SHLD:       # %bb.0:
; X64-NO-BMI2-HAVE-SHLD-NEXT:    movups (%rdi), %xmm0
; X64-NO-BMI2-HAVE-SHLD-NEXT:    movups 16(%rdi), %xmm1
; X64-NO-BMI2-HAVE-SHLD-NEXT:    leal (,%rsi,8), %eax
; X64-NO-BMI2-HAVE-SHLD-NEXT:    xorps %xmm2, %xmm2
; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrb $6, %cl
; X64-NO-BMI2-HAVE-SHLD-NEXT:    movzbl %cl, %esi
; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -72(%rsp,%rsi,8), %rdi
; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -64(%rsp,%rsi,8), %r8
; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r8, %r9
; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %cl, %r9
; X64-NO-BMI2-HAVE-SHLD-NEXT:    notb %cl
; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -56(%rsp,%rsi,8), %rsi
; X64-NO-BMI2-HAVE-SHLD-NEXT:    addq %rsi, %rsi
; X64-NO-BMI2-HAVE-SHLD-NEXT:    shlq %cl, %rsi
; X64-NO-BMI2-HAVE-SHLD-NEXT:    orq %r9, %rsi
; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r8, %rdi
; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, (%rdx)
; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rsi, 8(%rdx)
; X64-NO-BMI2-HAVE-SHLD-NEXT:    retq
;
; X64-HAVE-BMI2-NO-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca:
; X64-HAVE-BMI2-NO-SHLD:       # %bb.0:
; X64-HAVE-BMI2-NO-SHLD-NEXT:    movups (%rdi), %xmm0
; X64-HAVE-BMI2-NO-SHLD-NEXT:    movups 16(%rdi), %xmm1
; X64-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %esi
; X64-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm2, %xmm2
; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %eax
; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrb $6, %al
; X64-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %al, %eax
; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rsi, -72(%rsp,%rax,8), %rcx
; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -64(%rsp,%rax,8), %rdi
; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rsi, %rdi, %r8
; X64-HAVE-BMI2-NO-SHLD-NEXT:    # kill: def $sil killed $sil killed $rsi def $rsi
; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %sil
; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -56(%rsp,%rax,8), %rax
; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %rdi, %rdi
; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rsi, %rdi, %rdi
; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rcx, %rdi
; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %rax, %rax
; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rsi, %rax, %rax
; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r8, %rax
; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rax, 8(%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, (%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT:    retq
;
; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca:
; X64-HAVE-BMI2-HAVE-SHLD:       # %bb.0:
; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movups (%rdi), %xmm0
; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movups 16(%rdi), %xmm1
; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    leal (,%rsi,8), %ecx
; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    xorps %xmm2, %xmm2
; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %eax
; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrb $6, %al
; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movzbl %al, %eax
; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -72(%rsp,%rax,8), %rsi
; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -64(%rsp,%rax,8), %rdi
; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rcx, %rdi, %r8
; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %r9d
; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    notb %r9b
; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -56(%rsp,%rax,8), %rax
; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    addq %rax, %rax
; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxq %r9, %rax, %rax
; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    orq %r8, %rax
; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    # kill: def $cl killed $cl killed $rcx
; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rdi, %rsi
; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rsi, (%rdx)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rax, 8(%rdx)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    retq
;
; X86-NO-BMI2-NO-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca:
; X86-NO-BMI2-NO-SHLD:       # %bb.0:
; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %ebp
; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %ebx
; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
; X86-NO-BMI2-NO-SHLD-NEXT:    subl $92, %esp
; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NO-BMI2-NO-SHLD-NEXT:    movups (%ecx), %xmm0
; X86-NO-BMI2-NO-SHLD-NEXT:    movups 16(%ecx), %xmm1
; X86-NO-BMI2-NO-SHLD-NEXT:    shll $3, %eax
; X86-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm2, %xmm2
; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
; X86-NO-BMI2-NO-SHLD-NEXT:    shrb $5, %cl
; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl %cl, %edi
; X86-NO-BMI2-NO-SHLD-NEXT:    movl 16(%esp,%edi,4), %edx
; X86-NO-BMI2-NO-SHLD-NEXT:    movl 20(%esp,%edi,4), %ebp
; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
; X86-NO-BMI2-NO-SHLD-NEXT:    notb %cl
; X86-NO-BMI2-NO-SHLD-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ebp,%ebp), %esi
; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %esi
; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
; X86-NO-BMI2-NO-SHLD-NEXT:    movb %al, %ch
; X86-NO-BMI2-NO-SHLD-NEXT:    andb $24, %ch
; X86-NO-BMI2-NO-SHLD-NEXT:    xorb $31, %ch
; X86-NO-BMI2-NO-SHLD-NEXT:    movl 24(%esp,%edi,4), %edx
; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%edx,%edx), %ebx
; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebp, %ebx
; X86-NO-BMI2-NO-SHLD-NEXT:    movb %al, %cl
; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
; X86-NO-BMI2-NO-SHLD-NEXT:    movl 28(%esp,%edi,4), %esi
; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %ebp
; X86-NO-BMI2-NO-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %ebp
; X86-NO-BMI2-NO-SHLD-NEXT:    movb %al, %cl
; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
; X86-NO-BMI2-NO-SHLD-NEXT:    movl 32(%esp,%edi,4), %eax
; X86-NO-BMI2-NO-SHLD-NEXT:    addl %eax, %eax
; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 12(%ecx)
; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, 8(%ecx)
; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, 4(%ecx)
; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, (%ecx)
; X86-NO-BMI2-NO-SHLD-NEXT:    addl $92, %esp
; X86-NO-BMI2-NO-SHLD-NEXT:    popl %esi
; X86-NO-BMI2-NO-SHLD-NEXT:    popl %edi
; X86-NO-BMI2-NO-SHLD-NEXT:    popl %ebx
; X86-NO-BMI2-NO-SHLD-NEXT:    popl %ebp
; X86-NO-BMI2-NO-SHLD-NEXT:    retl
;
; X86-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca:
; X86-SHLD:       # %bb.0:
; X86-SHLD-NEXT:    pushl %ebp
; X86-SHLD-NEXT:    pushl %ebx
; X86-SHLD-NEXT:    pushl %edi
; X86-SHLD-NEXT:    pushl %esi
; X86-SHLD-NEXT:    subl $92, %esp
; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SHLD-NEXT:    movups (%eax), %xmm0
; X86-SHLD-NEXT:    movups 16(%eax), %xmm1
; X86-SHLD-NEXT:    shll $3, %ecx
; X86-SHLD-NEXT:    xorps %xmm2, %xmm2
; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
; X86-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
; X86-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
; X86-SHLD-NEXT:    movl %ecx, %eax
; X86-SHLD-NEXT:    shrb $5, %al
; X86-SHLD-NEXT:    movzbl %al, %ebx
; X86-SHLD-NEXT:    movl 24(%esp,%ebx,4), %esi
; X86-SHLD-NEXT:    movl 16(%esp,%ebx,4), %eax
; X86-SHLD-NEXT:    movl 20(%esp,%ebx,4), %edi
; X86-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-SHLD-NEXT:    shrdl %cl, %esi, %edi
; X86-SHLD-NEXT:    movl 28(%esp,%ebx,4), %ebp
; X86-SHLD-NEXT:    shrdl %cl, %ebp, %esi
; X86-SHLD-NEXT:    movl 32(%esp,%ebx,4), %ebx
; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-SHLD-NEXT:    shrdl %cl, %ebx, %ebp
; X86-SHLD-NEXT:    movl %ebp, 12(%edx)
; X86-SHLD-NEXT:    movl %esi, 8(%edx)
; X86-SHLD-NEXT:    movl %edi, 4(%edx)
; X86-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
; X86-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; X86-SHLD-NEXT:    shrdl %cl, %esi, %eax
; X86-SHLD-NEXT:    movl %eax, (%edx)
; X86-SHLD-NEXT:    addl $92, %esp
; X86-SHLD-NEXT:    popl %esi
; X86-SHLD-NEXT:    popl %edi
; X86-SHLD-NEXT:    popl %ebx
; X86-SHLD-NEXT:    popl %ebp
; X86-SHLD-NEXT:    retl
;
; X86-HAVE-BMI2-NO-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca:
; X86-HAVE-BMI2-NO-SHLD:       # %bb.0:
; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebp
; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $92, %esp
; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups (%ecx), %xmm0
; X86-HAVE-BMI2-NO-SHLD-NEXT:    movups 16(%ecx), %xmm1
; X86-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm2, %xmm2
; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrb $5, %cl
; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %cl, %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, 16(%esp,%ecx,4), %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 20(%esp,%ecx,4), %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %esi, %edx
; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %edx
; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %dl
; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %esi, %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %esi, %ebp
; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %ebp
; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 24(%esp,%ecx,4), %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT:    andb $24, %bl
; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorb $31, %bl
; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %edi, %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %esi, %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 28(%esp,%ecx,4), %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, %esi, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %esi, %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %esi, %edx
; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 32(%esp,%ecx,4), %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ecx, %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %ecx, %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 12(%eax)
; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, 8(%eax)
; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, 4(%eax)
; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, (%eax)
; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $92, %esp
; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebp
; X86-HAVE-BMI2-NO-SHLD-NEXT:    retl
  %init = load <32 x i8>, ptr %src, align 1
  %byteOff.numbits = shl nuw nsw i64 %byteOff, 3
  %intermediate.val.frozen = freeze <32 x i8> %init
  %intermediate.val.frozen.bits = bitcast <32 x i8> %intermediate.val.frozen to i256
  %byteOff.numbits.wide = zext i64 %byteOff.numbits to i256
  %intermediate.val.frozen.bits.positioned = lshr i256 %intermediate.val.frozen.bits, %byteOff.numbits.wide
  %intermediate.val.frozen.bits.positioned.extracted = trunc i256 %intermediate.val.frozen.bits.positioned to i128
  store i128 %intermediate.val.frozen.bits.positioned.extracted, ptr %dst, align 16
  ret void
}

; no @load_32byte_chunk_of_32byte_alloca
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; ALL: {{.*}}
; X64: {{.*}}
; X64-NO-SHLD: {{.*}}
; X86: {{.*}}
; X86-NO-SHLD: {{.*}}