llvm/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse2,-bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-NO-BMI2,X64-NO-SHLD,X64-NO-BMI2-NO-SHLD
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse2,-bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-NO-BMI2,X64-SHLD,X64-NO-BMI2-HAVE-SHLD
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse2,+bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-BMI2,X64-NO-SHLD,X64-HAVE-BMI2-NO-SHLD
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse2,+bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-BMI2,X64-SHLD,X64-HAVE-BMI2-HAVE-SHLD
; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+sse2,-bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-NO-BMI2,X86-NO-SHLD,X86-NO-BMI2-NO-SHLD
; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+sse2,-bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-NO-BMI2,X86-SHLD,X86-NO-BMI2-HAVE-SHLD
; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+sse2,+bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-BMI2,X86-NO-SHLD,X86-HAVE-BMI2-NO-SHLD
; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+sse2,+bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-BMI2,X86-SHLD,X86-HAVE-BMI2-HAVE-SHLD

define void @load_1byte_chunk_of_2byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
; X64-NO-BMI2-LABEL: load_1byte_chunk_of_2byte_alloca_with_zero_upper_half:
; X64-NO-BMI2:       # %bb.0:
; X64-NO-BMI2-NEXT:    movzbl (%rdi), %eax
; X64-NO-BMI2-NEXT:    leal (,%rsi,8), %ecx
; X64-NO-BMI2-NEXT:    # kill: def $cl killed $cl killed $ecx
; X64-NO-BMI2-NEXT:    shrl %cl, %eax
; X64-NO-BMI2-NEXT:    movb %al, (%rdx)
; X64-NO-BMI2-NEXT:    retq
;
; X64-BMI2-LABEL: load_1byte_chunk_of_2byte_alloca_with_zero_upper_half:
; X64-BMI2:       # %bb.0:
; X64-BMI2-NEXT:    movzbl (%rdi), %eax
; X64-BMI2-NEXT:    shll $3, %esi
; X64-BMI2-NEXT:    shrxl %esi, %eax, %eax
; X64-BMI2-NEXT:    movb %al, (%rdx)
; X64-BMI2-NEXT:    retq
;
; X86-NO-BMI2-LABEL: load_1byte_chunk_of_2byte_alloca_with_zero_upper_half:
; X86-NO-BMI2:       # %bb.0:
; X86-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NO-BMI2-NEXT:    movzbl (%eax), %eax
; X86-NO-BMI2-NEXT:    shll $3, %ecx
; X86-NO-BMI2-NEXT:    # kill: def $cl killed $cl killed $ecx
; X86-NO-BMI2-NEXT:    shrl %cl, %eax
; X86-NO-BMI2-NEXT:    movb %al, (%edx)
; X86-NO-BMI2-NEXT:    retl
;
; X86-BMI2-LABEL: load_1byte_chunk_of_2byte_alloca_with_zero_upper_half:
; X86-BMI2:       # %bb.0:
; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-BMI2-NEXT:    movzbl (%edx), %edx
; X86-BMI2-NEXT:    shll $3, %ecx
; X86-BMI2-NEXT:    shrxl %ecx, %edx, %ecx
; X86-BMI2-NEXT:    movb %cl, (%eax)
; X86-BMI2-NEXT:    retl
  %init1 = load i8, ptr %src, align 1
  %intermediate.sroa.0.0.vec.insert = insertelement <2 x i8> <i8 poison, i8 0>, i8 %init1, i64 0
  %intermediate.val.frozen = freeze <2 x i8> %intermediate.sroa.0.0.vec.insert
  %intermediate.val.frozen.bits = bitcast <2 x i8> %intermediate.val.frozen to i16
  %byteOff.tr = trunc i64 %byteOff to i16
  %byteOff.numbits.wide = shl i16 %byteOff.tr, 3
  %intermediate.val.frozen.bits.positioned = lshr i16 %intermediate.val.frozen.bits, %byteOff.numbits.wide
  %intermediate.val.frozen.bits.positioned.extracted = trunc i16 %intermediate.val.frozen.bits.positioned to i8
  %1 = insertelement <1 x i8> poison, i8 %intermediate.val.frozen.bits.positioned.extracted, i64 0
  store <1 x i8> %1, ptr %dst, align 1
  ret void
}

define void @load_1byte_chunk_of_4byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
; X64-NO-BMI2-LABEL: load_1byte_chunk_of_4byte_alloca_with_zero_upper_half:
; X64-NO-BMI2:       # %bb.0:
; X64-NO-BMI2-NEXT:    movzwl (%rdi), %eax
; X64-NO-BMI2-NEXT:    leal (,%rsi,8), %ecx
; X64-NO-BMI2-NEXT:    # kill: def $cl killed $cl killed $ecx
; X64-NO-BMI2-NEXT:    shrl %cl, %eax
; X64-NO-BMI2-NEXT:    movb %al, (%rdx)
; X64-NO-BMI2-NEXT:    retq
;
; X64-BMI2-LABEL: load_1byte_chunk_of_4byte_alloca_with_zero_upper_half:
; X64-BMI2:       # %bb.0:
; X64-BMI2-NEXT:    movzwl (%rdi), %eax
; X64-BMI2-NEXT:    shll $3, %esi
; X64-BMI2-NEXT:    shrxl %esi, %eax, %eax
; X64-BMI2-NEXT:    movb %al, (%rdx)
; X64-BMI2-NEXT:    retq
;
; X86-NO-BMI2-LABEL: load_1byte_chunk_of_4byte_alloca_with_zero_upper_half:
; X86-NO-BMI2:       # %bb.0:
; X86-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NO-BMI2-NEXT:    movzwl (%eax), %eax
; X86-NO-BMI2-NEXT:    shll $3, %ecx
; X86-NO-BMI2-NEXT:    # kill: def $cl killed $cl killed $ecx
; X86-NO-BMI2-NEXT:    shrl %cl, %eax
; X86-NO-BMI2-NEXT:    movb %al, (%edx)
; X86-NO-BMI2-NEXT:    retl
;
; X86-BMI2-LABEL: load_1byte_chunk_of_4byte_alloca_with_zero_upper_half:
; X86-BMI2:       # %bb.0:
; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-BMI2-NEXT:    movzwl (%edx), %edx
; X86-BMI2-NEXT:    shll $3, %ecx
; X86-BMI2-NEXT:    shrxl %ecx, %edx, %ecx
; X86-BMI2-NEXT:    movb %cl, (%eax)
; X86-BMI2-NEXT:    retl
  %init = load <2 x i8>, ptr %src, align 1
  %intermediate.sroa.0.0.vec.expand = shufflevector <2 x i8> %init, <2 x i8> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %intermediate.sroa.0.0.vecblend = shufflevector <4 x i8> %intermediate.sroa.0.0.vec.expand, <4 x i8> <i8 poison, i8 poison, i8 0, i8 0>, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
  %intermediate.val.frozen = freeze <4 x i8> %intermediate.sroa.0.0.vecblend
  %intermediate.val.frozen.bits = bitcast <4 x i8> %intermediate.val.frozen to i32
  %byteOff.tr = trunc i64 %byteOff to i32
  %byteOff.numbits.wide = shl i32 %byteOff.tr, 3
  %intermediate.val.frozen.bits.positioned = lshr i32 %intermediate.val.frozen.bits, %byteOff.numbits.wide
  %intermediate.val.frozen.bits.positioned.extracted = trunc i32 %intermediate.val.frozen.bits.positioned to i8
  %1 = insertelement <1 x i8> poison, i8 %intermediate.val.frozen.bits.positioned.extracted, i64 0
  store <1 x i8> %1, ptr %dst, align 1
  ret void
}

define void @load_2byte_chunk_of_4byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
; X64-NO-BMI2-LABEL: load_2byte_chunk_of_4byte_alloca_with_zero_upper_half:
; X64-NO-BMI2:       # %bb.0:
; X64-NO-BMI2-NEXT:    movzwl (%rdi), %eax
; X64-NO-BMI2-NEXT:    leal (,%rsi,8), %ecx
; X64-NO-BMI2-NEXT:    # kill: def $cl killed $cl killed $ecx
; X64-NO-BMI2-NEXT:    shrl %cl, %eax
; X64-NO-BMI2-NEXT:    movw %ax, (%rdx)
; X64-NO-BMI2-NEXT:    retq
;
; X64-BMI2-LABEL: load_2byte_chunk_of_4byte_alloca_with_zero_upper_half:
; X64-BMI2:       # %bb.0:
; X64-BMI2-NEXT:    movzwl (%rdi), %eax
; X64-BMI2-NEXT:    shll $3, %esi
; X64-BMI2-NEXT:    shrxl %esi, %eax, %eax
; X64-BMI2-NEXT:    movw %ax, (%rdx)
; X64-BMI2-NEXT:    retq
;
; X86-NO-BMI2-LABEL: load_2byte_chunk_of_4byte_alloca_with_zero_upper_half:
; X86-NO-BMI2:       # %bb.0:
; X86-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NO-BMI2-NEXT:    movzwl (%edx), %edx
; X86-NO-BMI2-NEXT:    shll $3, %ecx
; X86-NO-BMI2-NEXT:    # kill: def $cl killed $cl killed $ecx
; X86-NO-BMI2-NEXT:    shrl %cl, %edx
; X86-NO-BMI2-NEXT:    movw %dx, (%eax)
; X86-NO-BMI2-NEXT:    retl
;
; X86-BMI2-LABEL: load_2byte_chunk_of_4byte_alloca_with_zero_upper_half:
; X86-BMI2:       # %bb.0:
; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-BMI2-NEXT:    movzwl (%edx), %edx
; X86-BMI2-NEXT:    shll $3, %ecx
; X86-BMI2-NEXT:    shrxl %ecx, %edx, %ecx
; X86-BMI2-NEXT:    movw %cx, (%eax)
; X86-BMI2-NEXT:    retl
  %init = load <2 x i8>, ptr %src, align 1
  %intermediate.sroa.0.0.vec.expand = shufflevector <2 x i8> %init, <2 x i8> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %intermediate.sroa.0.0.vecblend = shufflevector <4 x i8> %intermediate.sroa.0.0.vec.expand, <4 x i8> <i8 poison, i8 poison, i8 0, i8 0>, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
  %intermediate.val.frozen = freeze <4 x i8> %intermediate.sroa.0.0.vecblend
  %intermediate.val.frozen.bits = bitcast <4 x i8> %intermediate.val.frozen to i32
  %byteOff.tr = trunc i64 %byteOff to i32
  %byteOff.numbits.wide = shl i32 %byteOff.tr, 3
  %intermediate.val.frozen.bits.positioned = lshr i32 %intermediate.val.frozen.bits, %byteOff.numbits.wide
  %intermediate.val.frozen.bits.positioned.extracted = trunc i32 %intermediate.val.frozen.bits.positioned to i16
  store i16 %intermediate.val.frozen.bits.positioned.extracted, ptr %dst, align 2
  ret void
}

define void @load_1byte_chunk_of_8byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
; X64-NO-BMI2-LABEL: load_1byte_chunk_of_8byte_alloca_with_zero_upper_half:
; X64-NO-BMI2:       # %bb.0:
; X64-NO-BMI2-NEXT:    leal (,%rsi,8), %ecx
; X64-NO-BMI2-NEXT:    movl (%rdi), %eax
; X64-NO-BMI2-NEXT:    # kill: def $cl killed $cl killed $ecx
; X64-NO-BMI2-NEXT:    shrq %cl, %rax
; X64-NO-BMI2-NEXT:    movb %al, (%rdx)
; X64-NO-BMI2-NEXT:    retq
;
; X64-BMI2-LABEL: load_1byte_chunk_of_8byte_alloca_with_zero_upper_half:
; X64-BMI2:       # %bb.0:
; X64-BMI2-NEXT:    shll $3, %esi
; X64-BMI2-NEXT:    movl (%rdi), %eax
; X64-BMI2-NEXT:    shrxq %rsi, %rax, %rax
; X64-BMI2-NEXT:    movb %al, (%rdx)
; X64-BMI2-NEXT:    retq
;
; X86-NO-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_8byte_alloca_with_zero_upper_half:
; X86-NO-BMI2-NO-SHLD:       # %bb.0:
; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %ebx
; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%edx), %edx
; X86-NO-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
; X86-NO-BMI2-NO-SHLD-NEXT:    xorl %ebx, %ebx
; X86-NO-BMI2-NO-SHLD-NEXT:    testb $32, %cl
; X86-NO-BMI2-NO-SHLD-NEXT:    cmovel %edx, %ebx
; X86-NO-BMI2-NO-SHLD-NEXT:    movb %bl, (%eax)
; X86-NO-BMI2-NO-SHLD-NEXT:    popl %ebx
; X86-NO-BMI2-NO-SHLD-NEXT:    retl
;
; X86-SHLD-LABEL: load_1byte_chunk_of_8byte_alloca_with_zero_upper_half:
; X86-SHLD:       # %bb.0:
; X86-SHLD-NEXT:    pushl %esi
; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-SHLD-NEXT:    movl (%edx), %edx
; X86-SHLD-NEXT:    shll $3, %ecx
; X86-SHLD-NEXT:    xorl %esi, %esi
; X86-SHLD-NEXT:    shrdl %cl, %esi, %edx
; X86-SHLD-NEXT:    testb $32, %cl
; X86-SHLD-NEXT:    cmovnel %esi, %edx
; X86-SHLD-NEXT:    movb %dl, (%eax)
; X86-SHLD-NEXT:    popl %esi
; X86-SHLD-NEXT:    retl
;
; X86-HAVE-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_8byte_alloca_with_zero_upper_half:
; X86-HAVE-BMI2-NO-SHLD:       # %bb.0:
; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, (%edx), %edx
; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorl %ebx, %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %cl
; X86-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %edx, %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT:    movb %bl, (%eax)
; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT:    retl
  %init = load <4 x i8>, ptr %src, align 1
  %intermediate.sroa.0.0.vec.expand = shufflevector <4 x i8> %init, <4 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
  %intermediate.sroa.0.0.vecblend = shufflevector <8 x i8> %intermediate.sroa.0.0.vec.expand, <8 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 0, i8 0, i8 0>, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
  %byteOff.numbits = shl nuw nsw i64 %byteOff, 3
  %intermediate.val.frozen = freeze <8 x i8> %intermediate.sroa.0.0.vecblend
  %intermediate.val.frozen.bits = bitcast <8 x i8> %intermediate.val.frozen to i64
  %intermediate.val.frozen.bits.positioned = lshr i64 %intermediate.val.frozen.bits, %byteOff.numbits
  %intermediate.val.frozen.bits.positioned.extracted = trunc i64 %intermediate.val.frozen.bits.positioned to i8
  %1 = insertelement <1 x i8> poison, i8 %intermediate.val.frozen.bits.positioned.extracted, i64 0
  store <1 x i8> %1, ptr %dst, align 1
  ret void
}

define void @load_2byte_chunk_of_8byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
; X64-NO-BMI2-LABEL: load_2byte_chunk_of_8byte_alloca_with_zero_upper_half:
; X64-NO-BMI2:       # %bb.0:
; X64-NO-BMI2-NEXT:    leal (,%rsi,8), %ecx
; X64-NO-BMI2-NEXT:    movl (%rdi), %eax
; X64-NO-BMI2-NEXT:    # kill: def $cl killed $cl killed $ecx
; X64-NO-BMI2-NEXT:    shrq %cl, %rax
; X64-NO-BMI2-NEXT:    movw %ax, (%rdx)
; X64-NO-BMI2-NEXT:    retq
;
; X64-BMI2-LABEL: load_2byte_chunk_of_8byte_alloca_with_zero_upper_half:
; X64-BMI2:       # %bb.0:
; X64-BMI2-NEXT:    shll $3, %esi
; X64-BMI2-NEXT:    movl (%rdi), %eax
; X64-BMI2-NEXT:    shrxq %rsi, %rax, %rax
; X64-BMI2-NEXT:    movw %ax, (%rdx)
; X64-BMI2-NEXT:    retq
;
; X86-NO-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_8byte_alloca_with_zero_upper_half:
; X86-NO-BMI2-NO-SHLD:       # %bb.0:
; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%edx), %edx
; X86-NO-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
; X86-NO-BMI2-NO-SHLD-NEXT:    xorl %esi, %esi
; X86-NO-BMI2-NO-SHLD-NEXT:    testb $32, %cl
; X86-NO-BMI2-NO-SHLD-NEXT:    cmovel %edx, %esi
; X86-NO-BMI2-NO-SHLD-NEXT:    movw %si, (%eax)
; X86-NO-BMI2-NO-SHLD-NEXT:    popl %esi
; X86-NO-BMI2-NO-SHLD-NEXT:    retl
;
; X86-SHLD-LABEL: load_2byte_chunk_of_8byte_alloca_with_zero_upper_half:
; X86-SHLD:       # %bb.0:
; X86-SHLD-NEXT:    pushl %esi
; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-SHLD-NEXT:    movl (%edx), %edx
; X86-SHLD-NEXT:    shll $3, %ecx
; X86-SHLD-NEXT:    xorl %esi, %esi
; X86-SHLD-NEXT:    shrdl %cl, %esi, %edx
; X86-SHLD-NEXT:    testb $32, %cl
; X86-SHLD-NEXT:    cmovnel %esi, %edx
; X86-SHLD-NEXT:    movw %dx, (%eax)
; X86-SHLD-NEXT:    popl %esi
; X86-SHLD-NEXT:    retl
;
; X86-HAVE-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_8byte_alloca_with_zero_upper_half:
; X86-HAVE-BMI2-NO-SHLD:       # %bb.0:
; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, (%edx), %edx
; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorl %esi, %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %al
; X86-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %edx, %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT:    movw %si, (%ecx)
; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT:    retl
  %init = load <4 x i8>, ptr %src, align 1
  %intermediate.sroa.0.0.vec.expand = shufflevector <4 x i8> %init, <4 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
  %intermediate.sroa.0.0.vecblend = shufflevector <8 x i8> %intermediate.sroa.0.0.vec.expand, <8 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 0, i8 0, i8 0>, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
  %byteOff.numbits = shl nuw nsw i64 %byteOff, 3
  %intermediate.val.frozen = freeze <8 x i8> %intermediate.sroa.0.0.vecblend
  %intermediate.val.frozen.bits = bitcast <8 x i8> %intermediate.val.frozen to i64
  %intermediate.val.frozen.bits.positioned = lshr i64 %intermediate.val.frozen.bits, %byteOff.numbits
  %intermediate.val.frozen.bits.positioned.extracted = trunc i64 %intermediate.val.frozen.bits.positioned to i16
  store i16 %intermediate.val.frozen.bits.positioned.extracted, ptr %dst, align 2
  ret void
}

define void @load_4byte_chunk_of_8byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
; X64-NO-BMI2-LABEL: load_4byte_chunk_of_8byte_alloca_with_zero_upper_half:
; X64-NO-BMI2:       # %bb.0:
; X64-NO-BMI2-NEXT:    leal (,%rsi,8), %ecx
; X64-NO-BMI2-NEXT:    movl (%rdi), %eax
; X64-NO-BMI2-NEXT:    # kill: def $cl killed $cl killed $ecx
; X64-NO-BMI2-NEXT:    shrq %cl, %rax
; X64-NO-BMI2-NEXT:    movl %eax, (%rdx)
; X64-NO-BMI2-NEXT:    retq
;
; X64-BMI2-LABEL: load_4byte_chunk_of_8byte_alloca_with_zero_upper_half:
; X64-BMI2:       # %bb.0:
; X64-BMI2-NEXT:    shll $3, %esi
; X64-BMI2-NEXT:    movl (%rdi), %eax
; X64-BMI2-NEXT:    shrxq %rsi, %rax, %rax
; X64-BMI2-NEXT:    movl %eax, (%rdx)
; X64-BMI2-NEXT:    retq
;
; X86-NO-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_8byte_alloca_with_zero_upper_half:
; X86-NO-BMI2-NO-SHLD:       # %bb.0:
; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%edx), %edx
; X86-NO-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
; X86-NO-BMI2-NO-SHLD-NEXT:    xorl %esi, %esi
; X86-NO-BMI2-NO-SHLD-NEXT:    testb $32, %cl
; X86-NO-BMI2-NO-SHLD-NEXT:    cmovel %edx, %esi
; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, (%eax)
; X86-NO-BMI2-NO-SHLD-NEXT:    popl %esi
; X86-NO-BMI2-NO-SHLD-NEXT:    retl
;
; X86-SHLD-LABEL: load_4byte_chunk_of_8byte_alloca_with_zero_upper_half:
; X86-SHLD:       # %bb.0:
; X86-SHLD-NEXT:    pushl %esi
; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-SHLD-NEXT:    movl (%edx), %edx
; X86-SHLD-NEXT:    shll $3, %ecx
; X86-SHLD-NEXT:    xorl %esi, %esi
; X86-SHLD-NEXT:    shrdl %cl, %esi, %edx
; X86-SHLD-NEXT:    testb $32, %cl
; X86-SHLD-NEXT:    cmovnel %esi, %edx
; X86-SHLD-NEXT:    movl %edx, (%eax)
; X86-SHLD-NEXT:    popl %esi
; X86-SHLD-NEXT:    retl
;
; X86-HAVE-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_8byte_alloca_with_zero_upper_half:
; X86-HAVE-BMI2-NO-SHLD:       # %bb.0:
; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %eax, (%edx), %edx
; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorl %esi, %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT:    testb $32, %al
; X86-HAVE-BMI2-NO-SHLD-NEXT:    cmovel %edx, %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, (%ecx)
; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT:    retl
  %init = load <4 x i8>, ptr %src, align 1
  %intermediate.sroa.0.0.vec.expand = shufflevector <4 x i8> %init, <4 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
  %intermediate.sroa.0.0.vecblend = shufflevector <8 x i8> %intermediate.sroa.0.0.vec.expand, <8 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 0, i8 0, i8 0>, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
  %byteOff.numbits = shl nuw nsw i64 %byteOff, 3
  %intermediate.val.frozen = freeze <8 x i8> %intermediate.sroa.0.0.vecblend
  %intermediate.val.frozen.bits = bitcast <8 x i8> %intermediate.val.frozen to i64
  %intermediate.val.frozen.bits.positioned = lshr i64 %intermediate.val.frozen.bits, %byteOff.numbits
  %intermediate.val.frozen.bits.positioned.extracted = trunc i64 %intermediate.val.frozen.bits.positioned to i32
  store i32 %intermediate.val.frozen.bits.positioned.extracted, ptr %dst, align 4
  ret void
}

define void @load_1byte_chunk_of_16byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
; X64-NO-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_16byte_alloca_with_zero_upper_half:
; X64-NO-BMI2-NO-SHLD:       # %bb.0:
; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rsi, %rcx
; X64-NO-BMI2-NO-SHLD-NEXT:    movq (%rdi), %rax
; X64-NO-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rax
; X64-NO-BMI2-NO-SHLD-NEXT:    xorl %esi, %esi
; X64-NO-BMI2-NO-SHLD-NEXT:    testb $64, %cl
; X64-NO-BMI2-NO-SHLD-NEXT:    cmoveq %rax, %rsi
; X64-NO-BMI2-NO-SHLD-NEXT:    movb %sil, (%rdx)
; X64-NO-BMI2-NO-SHLD-NEXT:    retq
;
; X64-SHLD-LABEL: load_1byte_chunk_of_16byte_alloca_with_zero_upper_half:
; X64-SHLD:       # %bb.0:
; X64-SHLD-NEXT:    movq %rsi, %rcx
; X64-SHLD-NEXT:    movq (%rdi), %rax
; X64-SHLD-NEXT:    shll $3, %ecx
; X64-SHLD-NEXT:    xorl %esi, %esi
; X64-SHLD-NEXT:    shrdq %cl, %rsi, %rax
; X64-SHLD-NEXT:    testb $64, %cl
; X64-SHLD-NEXT:    cmovneq %rsi, %rax
; X64-SHLD-NEXT:    movb %al, (%rdx)
; X64-SHLD-NEXT:    retq
;
; X64-HAVE-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_16byte_alloca_with_zero_upper_half:
; X64-HAVE-BMI2-NO-SHLD:       # %bb.0:
; X64-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %esi
; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rsi, (%rdi), %rax
; X64-HAVE-BMI2-NO-SHLD-NEXT:    xorl %ecx, %ecx
; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb $64, %sil
; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmoveq %rax, %rcx
; X64-HAVE-BMI2-NO-SHLD-NEXT:    movb %cl, (%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT:    retq
;
; X86-LABEL: load_1byte_chunk_of_16byte_alloca_with_zero_upper_half:
; X86:       # %bb.0:
; X86-NEXT:    subl $32, %esp
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; X86-NEXT:    shll $3, %ecx
; X86-NEXT:    movss %xmm0, (%esp)
; X86-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; X86-NEXT:    movss %xmm0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    shrb $3, %cl
; X86-NEXT:    andb $15, %cl
; X86-NEXT:    movzbl %cl, %ecx
; X86-NEXT:    movzbl (%esp,%ecx), %ecx
; X86-NEXT:    movb %cl, (%eax)
; X86-NEXT:    addl $32, %esp
; X86-NEXT:    retl
  %init = load <8 x i8>, ptr %src, align 1
  %intermediate.sroa.0.0.vec.expand = shufflevector <8 x i8> %init, <8 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %intermediate.sroa.0.0.vecblend = shufflevector <16 x i8> %intermediate.sroa.0.0.vec.expand, <16 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  %byteOff.numbits = shl nuw nsw i64 %byteOff, 3
  %intermediate.val.frozen = freeze <16 x i8> %intermediate.sroa.0.0.vecblend
  %intermediate.val.frozen.bits = bitcast <16 x i8> %intermediate.val.frozen to i128
  %byteOff.numbits.wide = zext i64 %byteOff.numbits to i128
  %intermediate.val.frozen.bits.positioned = lshr i128 %intermediate.val.frozen.bits, %byteOff.numbits.wide
  %intermediate.val.frozen.bits.positioned.extracted = trunc i128 %intermediate.val.frozen.bits.positioned to i8
  %1 = insertelement <1 x i8> poison, i8 %intermediate.val.frozen.bits.positioned.extracted, i64 0
  store <1 x i8> %1, ptr %dst, align 1
  ret void
}

define void @load_2byte_chunk_of_16byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
; X64-NO-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_16byte_alloca_with_zero_upper_half:
; X64-NO-BMI2-NO-SHLD:       # %bb.0:
; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rsi, %rcx
; X64-NO-BMI2-NO-SHLD-NEXT:    movq (%rdi), %rax
; X64-NO-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rax
; X64-NO-BMI2-NO-SHLD-NEXT:    xorl %esi, %esi
; X64-NO-BMI2-NO-SHLD-NEXT:    testb $64, %cl
; X64-NO-BMI2-NO-SHLD-NEXT:    cmoveq %rax, %rsi
; X64-NO-BMI2-NO-SHLD-NEXT:    movw %si, (%rdx)
; X64-NO-BMI2-NO-SHLD-NEXT:    retq
;
; X64-SHLD-LABEL: load_2byte_chunk_of_16byte_alloca_with_zero_upper_half:
; X64-SHLD:       # %bb.0:
; X64-SHLD-NEXT:    movq %rsi, %rcx
; X64-SHLD-NEXT:    movq (%rdi), %rax
; X64-SHLD-NEXT:    shll $3, %ecx
; X64-SHLD-NEXT:    xorl %esi, %esi
; X64-SHLD-NEXT:    shrdq %cl, %rsi, %rax
; X64-SHLD-NEXT:    testb $64, %cl
; X64-SHLD-NEXT:    cmovneq %rsi, %rax
; X64-SHLD-NEXT:    movw %ax, (%rdx)
; X64-SHLD-NEXT:    retq
;
; X64-HAVE-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_16byte_alloca_with_zero_upper_half:
; X64-HAVE-BMI2-NO-SHLD:       # %bb.0:
; X64-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %esi
; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rsi, (%rdi), %rax
; X64-HAVE-BMI2-NO-SHLD-NEXT:    xorl %ecx, %ecx
; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb $64, %sil
; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmoveq %rax, %rcx
; X64-HAVE-BMI2-NO-SHLD-NEXT:    movw %cx, (%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT:    retq
;
; X86-LABEL: load_2byte_chunk_of_16byte_alloca_with_zero_upper_half:
; X86:       # %bb.0:
; X86-NEXT:    subl $32, %esp
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; X86-NEXT:    shll $3, %ecx
; X86-NEXT:    movss %xmm0, (%esp)
; X86-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; X86-NEXT:    movss %xmm0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    shrb $3, %cl
; X86-NEXT:    andb $15, %cl
; X86-NEXT:    movzbl %cl, %ecx
; X86-NEXT:    movl (%esp,%ecx), %ecx
; X86-NEXT:    movw %cx, (%eax)
; X86-NEXT:    addl $32, %esp
; X86-NEXT:    retl
  %init = load <8 x i8>, ptr %src, align 1
  %intermediate.sroa.0.0.vec.expand = shufflevector <8 x i8> %init, <8 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %intermediate.sroa.0.0.vecblend = shufflevector <16 x i8> %intermediate.sroa.0.0.vec.expand, <16 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  %byteOff.numbits = shl nuw nsw i64 %byteOff, 3
  %intermediate.val.frozen = freeze <16 x i8> %intermediate.sroa.0.0.vecblend
  %intermediate.val.frozen.bits = bitcast <16 x i8> %intermediate.val.frozen to i128
  %byteOff.numbits.wide = zext i64 %byteOff.numbits to i128
  %intermediate.val.frozen.bits.positioned = lshr i128 %intermediate.val.frozen.bits, %byteOff.numbits.wide
  %intermediate.val.frozen.bits.positioned.extracted = trunc i128 %intermediate.val.frozen.bits.positioned to i16
  store i16 %intermediate.val.frozen.bits.positioned.extracted, ptr %dst, align 2
  ret void
}

define void @load_4byte_chunk_of_16byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
; X64-NO-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_16byte_alloca_with_zero_upper_half:
; X64-NO-BMI2-NO-SHLD:       # %bb.0:
; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rsi, %rcx
; X64-NO-BMI2-NO-SHLD-NEXT:    movq (%rdi), %rax
; X64-NO-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rax
; X64-NO-BMI2-NO-SHLD-NEXT:    xorl %esi, %esi
; X64-NO-BMI2-NO-SHLD-NEXT:    testb $64, %cl
; X64-NO-BMI2-NO-SHLD-NEXT:    cmoveq %rax, %rsi
; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, (%rdx)
; X64-NO-BMI2-NO-SHLD-NEXT:    retq
;
; X64-SHLD-LABEL: load_4byte_chunk_of_16byte_alloca_with_zero_upper_half:
; X64-SHLD:       # %bb.0:
; X64-SHLD-NEXT:    movq %rsi, %rcx
; X64-SHLD-NEXT:    movq (%rdi), %rax
; X64-SHLD-NEXT:    shll $3, %ecx
; X64-SHLD-NEXT:    xorl %esi, %esi
; X64-SHLD-NEXT:    shrdq %cl, %rsi, %rax
; X64-SHLD-NEXT:    testb $64, %cl
; X64-SHLD-NEXT:    cmovneq %rsi, %rax
; X64-SHLD-NEXT:    movl %eax, (%rdx)
; X64-SHLD-NEXT:    retq
;
; X64-HAVE-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_16byte_alloca_with_zero_upper_half:
; X64-HAVE-BMI2-NO-SHLD:       # %bb.0:
; X64-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %esi
; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rsi, (%rdi), %rax
; X64-HAVE-BMI2-NO-SHLD-NEXT:    xorl %ecx, %ecx
; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb $64, %sil
; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmoveq %rax, %rcx
; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, (%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT:    retq
;
; X86-LABEL: load_4byte_chunk_of_16byte_alloca_with_zero_upper_half:
; X86:       # %bb.0:
; X86-NEXT:    subl $32, %esp
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; X86-NEXT:    shll $3, %ecx
; X86-NEXT:    movss %xmm0, (%esp)
; X86-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; X86-NEXT:    movss %xmm0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    shrb $3, %cl
; X86-NEXT:    andb $15, %cl
; X86-NEXT:    movzbl %cl, %ecx
; X86-NEXT:    movl (%esp,%ecx), %ecx
; X86-NEXT:    movl %ecx, (%eax)
; X86-NEXT:    addl $32, %esp
; X86-NEXT:    retl
  %init = load <8 x i8>, ptr %src, align 1
  %intermediate.sroa.0.0.vec.expand = shufflevector <8 x i8> %init, <8 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %intermediate.sroa.0.0.vecblend = shufflevector <16 x i8> %intermediate.sroa.0.0.vec.expand, <16 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  %byteOff.numbits = shl nuw nsw i64 %byteOff, 3
  %intermediate.val.frozen = freeze <16 x i8> %intermediate.sroa.0.0.vecblend
  %intermediate.val.frozen.bits = bitcast <16 x i8> %intermediate.val.frozen to i128
  %byteOff.numbits.wide = zext i64 %byteOff.numbits to i128
  %intermediate.val.frozen.bits.positioned = lshr i128 %intermediate.val.frozen.bits, %byteOff.numbits.wide
  %intermediate.val.frozen.bits.positioned.extracted = trunc i128 %intermediate.val.frozen.bits.positioned to i32
  store i32 %intermediate.val.frozen.bits.positioned.extracted, ptr %dst, align 4
  ret void
}

define void @load_8byte_chunk_of_16byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
; X64-NO-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_16byte_alloca_with_zero_upper_half:
; X64-NO-BMI2-NO-SHLD:       # %bb.0:
; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rsi, %rcx
; X64-NO-BMI2-NO-SHLD-NEXT:    movq (%rdi), %rax
; X64-NO-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rax
; X64-NO-BMI2-NO-SHLD-NEXT:    xorl %esi, %esi
; X64-NO-BMI2-NO-SHLD-NEXT:    testb $64, %cl
; X64-NO-BMI2-NO-SHLD-NEXT:    cmoveq %rax, %rsi
; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rsi, (%rdx)
; X64-NO-BMI2-NO-SHLD-NEXT:    retq
;
; X64-SHLD-LABEL: load_8byte_chunk_of_16byte_alloca_with_zero_upper_half:
; X64-SHLD:       # %bb.0:
; X64-SHLD-NEXT:    movq %rsi, %rcx
; X64-SHLD-NEXT:    movq (%rdi), %rax
; X64-SHLD-NEXT:    shll $3, %ecx
; X64-SHLD-NEXT:    xorl %esi, %esi
; X64-SHLD-NEXT:    shrdq %cl, %rsi, %rax
; X64-SHLD-NEXT:    testb $64, %cl
; X64-SHLD-NEXT:    cmovneq %rsi, %rax
; X64-SHLD-NEXT:    movq %rax, (%rdx)
; X64-SHLD-NEXT:    retq
;
; X64-HAVE-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_16byte_alloca_with_zero_upper_half:
; X64-HAVE-BMI2-NO-SHLD:       # %bb.0:
; X64-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %esi
; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rsi, (%rdi), %rax
; X64-HAVE-BMI2-NO-SHLD-NEXT:    xorl %ecx, %ecx
; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb $64, %sil
; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmoveq %rax, %rcx
; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rcx, (%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT:    retq
;
; X86-LABEL: load_8byte_chunk_of_16byte_alloca_with_zero_upper_half:
; X86:       # %bb.0:
; X86-NEXT:    subl $32, %esp
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; X86-NEXT:    shll $3, %ecx
; X86-NEXT:    movss %xmm0, (%esp)
; X86-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; X86-NEXT:    movss %xmm0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    shrb $3, %cl
; X86-NEXT:    andb $15, %cl
; X86-NEXT:    movzbl %cl, %ecx
; X86-NEXT:    movl (%esp,%ecx), %edx
; X86-NEXT:    movl 4(%esp,%ecx), %ecx
; X86-NEXT:    movl %ecx, 4(%eax)
; X86-NEXT:    movl %edx, (%eax)
; X86-NEXT:    addl $32, %esp
; X86-NEXT:    retl
  %init = load <8 x i8>, ptr %src, align 1
  %intermediate.sroa.0.0.vec.expand = shufflevector <8 x i8> %init, <8 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %intermediate.sroa.0.0.vecblend = shufflevector <16 x i8> %intermediate.sroa.0.0.vec.expand, <16 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  %byteOff.numbits = shl nuw nsw i64 %byteOff, 3
  %intermediate.val.frozen = freeze <16 x i8> %intermediate.sroa.0.0.vecblend
  %intermediate.val.frozen.bits = bitcast <16 x i8> %intermediate.val.frozen to i128
  %byteOff.numbits.wide = zext i64 %byteOff.numbits to i128
  %intermediate.val.frozen.bits.positioned = lshr i128 %intermediate.val.frozen.bits, %byteOff.numbits.wide
  %intermediate.val.frozen.bits.positioned.extracted = trunc i128 %intermediate.val.frozen.bits.positioned to i64
  store i64 %intermediate.val.frozen.bits.positioned.extracted, ptr %dst, align 8
  ret void
}

define void @load_1byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
; X64-LABEL: load_1byte_chunk_of_32byte_alloca_with_zero_upper_half:
; X64:       # %bb.0:
; X64-NEXT:    movdqu (%rdi), %xmm0
; X64-NEXT:    shll $3, %esi
; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-NEXT:    movq %xmm0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq %xmm1, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    shrb $3, %sil
; X64-NEXT:    movzbl %sil, %eax
; X64-NEXT:    movzbl -64(%rsp,%rax), %eax
; X64-NEXT:    movb %al, (%rdx)
; X64-NEXT:    retq
;
; X86-LABEL: load_1byte_chunk_of_32byte_alloca_with_zero_upper_half:
; X86:       # %bb.0:
; X86-NEXT:    subl $64, %esp
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    movdqu (%edx), %xmm0
; X86-NEXT:    shll $3, %ecx
; X86-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; X86-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
; X86-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
; X86-NEXT:    movd %xmm0, (%esp)
; X86-NEXT:    movd %xmm3, {{[0-9]+}}(%esp)
; X86-NEXT:    movd %xmm2, {{[0-9]+}}(%esp)
; X86-NEXT:    movd %xmm1, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    shrb $3, %cl
; X86-NEXT:    movzbl %cl, %ecx
; X86-NEXT:    movzbl (%esp,%ecx), %ecx
; X86-NEXT:    movb %cl, (%eax)
; X86-NEXT:    addl $64, %esp
; X86-NEXT:    retl
  %init = load <16 x i8>, ptr %src, align 1
  %intermediate.sroa.0.0.vec.expand = shufflevector <16 x i8> %init, <16 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %intermediate.sroa.0.0.vecblend = shufflevector <32 x i8> %intermediate.sroa.0.0.vec.expand, <32 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
  %byteOff.numbits = shl nuw nsw i64 %byteOff, 3
  %intermediate.val.frozen = freeze <32 x i8> %intermediate.sroa.0.0.vecblend
  %intermediate.val.frozen.bits = bitcast <32 x i8> %intermediate.val.frozen to i256
  %byteOff.numbits.wide = zext i64 %byteOff.numbits to i256
  %intermediate.val.frozen.bits.positioned = lshr i256 %intermediate.val.frozen.bits, %byteOff.numbits.wide
  %intermediate.val.frozen.bits.positioned.extracted = trunc i256 %intermediate.val.frozen.bits.positioned to i8
  %1 = insertelement <1 x i8> poison, i8 %intermediate.val.frozen.bits.positioned.extracted, i64 0
  store <1 x i8> %1, ptr %dst, align 1
  ret void
}

define void @load_2byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
; X64-LABEL: load_2byte_chunk_of_32byte_alloca_with_zero_upper_half:
; X64:       # %bb.0:
; X64-NEXT:    movdqu (%rdi), %xmm0
; X64-NEXT:    shll $3, %esi
; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-NEXT:    movq %xmm0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq %xmm1, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    shrb $3, %sil
; X64-NEXT:    movzbl %sil, %eax
; X64-NEXT:    movq -64(%rsp,%rax), %rax
; X64-NEXT:    movw %ax, (%rdx)
; X64-NEXT:    retq
;
; X86-LABEL: load_2byte_chunk_of_32byte_alloca_with_zero_upper_half:
; X86:       # %bb.0:
; X86-NEXT:    subl $64, %esp
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    movdqu (%edx), %xmm0
; X86-NEXT:    shll $3, %ecx
; X86-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; X86-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
; X86-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
; X86-NEXT:    movd %xmm0, (%esp)
; X86-NEXT:    movd %xmm3, {{[0-9]+}}(%esp)
; X86-NEXT:    movd %xmm2, {{[0-9]+}}(%esp)
; X86-NEXT:    movd %xmm1, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    shrb $3, %cl
; X86-NEXT:    movzbl %cl, %ecx
; X86-NEXT:    movl (%esp,%ecx), %ecx
; X86-NEXT:    movw %cx, (%eax)
; X86-NEXT:    addl $64, %esp
; X86-NEXT:    retl
  %init = load <16 x i8>, ptr %src, align 1
  %intermediate.sroa.0.0.vec.expand = shufflevector <16 x i8> %init, <16 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %intermediate.sroa.0.0.vecblend = shufflevector <32 x i8> %intermediate.sroa.0.0.vec.expand, <32 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
  %byteOff.numbits = shl nuw nsw i64 %byteOff, 3
  %intermediate.val.frozen = freeze <32 x i8> %intermediate.sroa.0.0.vecblend
  %intermediate.val.frozen.bits = bitcast <32 x i8> %intermediate.val.frozen to i256
  %byteOff.numbits.wide = zext i64 %byteOff.numbits to i256
  %intermediate.val.frozen.bits.positioned = lshr i256 %intermediate.val.frozen.bits, %byteOff.numbits.wide
  %intermediate.val.frozen.bits.positioned.extracted = trunc i256 %intermediate.val.frozen.bits.positioned to i16
  store i16 %intermediate.val.frozen.bits.positioned.extracted, ptr %dst, align 2
  ret void
}

define void @load_4byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
; X64-LABEL: load_4byte_chunk_of_32byte_alloca_with_zero_upper_half:
; X64:       # %bb.0:
; X64-NEXT:    movdqu (%rdi), %xmm0
; X64-NEXT:    shll $3, %esi
; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-NEXT:    movq %xmm0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq %xmm1, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    shrb $3, %sil
; X64-NEXT:    movzbl %sil, %eax
; X64-NEXT:    movl -64(%rsp,%rax), %eax
; X64-NEXT:    movl %eax, (%rdx)
; X64-NEXT:    retq
;
; X86-LABEL: load_4byte_chunk_of_32byte_alloca_with_zero_upper_half:
; X86:       # %bb.0:
; X86-NEXT:    subl $64, %esp
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    movdqu (%edx), %xmm0
; X86-NEXT:    shll $3, %ecx
; X86-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; X86-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
; X86-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
; X86-NEXT:    movd %xmm0, (%esp)
; X86-NEXT:    movd %xmm3, {{[0-9]+}}(%esp)
; X86-NEXT:    movd %xmm2, {{[0-9]+}}(%esp)
; X86-NEXT:    movd %xmm1, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    shrb $3, %cl
; X86-NEXT:    movzbl %cl, %ecx
; X86-NEXT:    movl (%esp,%ecx), %ecx
; X86-NEXT:    movl %ecx, (%eax)
; X86-NEXT:    addl $64, %esp
; X86-NEXT:    retl
  %init = load <16 x i8>, ptr %src, align 1
  %intermediate.sroa.0.0.vec.expand = shufflevector <16 x i8> %init, <16 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %intermediate.sroa.0.0.vecblend = shufflevector <32 x i8> %intermediate.sroa.0.0.vec.expand, <32 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
  %byteOff.numbits = shl nuw nsw i64 %byteOff, 3
  %intermediate.val.frozen = freeze <32 x i8> %intermediate.sroa.0.0.vecblend
  %intermediate.val.frozen.bits = bitcast <32 x i8> %intermediate.val.frozen to i256
  %byteOff.numbits.wide = zext i64 %byteOff.numbits to i256
  %intermediate.val.frozen.bits.positioned = lshr i256 %intermediate.val.frozen.bits, %byteOff.numbits.wide
  %intermediate.val.frozen.bits.positioned.extracted = trunc i256 %intermediate.val.frozen.bits.positioned to i32
  store i32 %intermediate.val.frozen.bits.positioned.extracted, ptr %dst, align 4
  ret void
}

define void @load_8byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
; X64-LABEL: load_8byte_chunk_of_32byte_alloca_with_zero_upper_half:
; X64:       # %bb.0:
; X64-NEXT:    movdqu (%rdi), %xmm0
; X64-NEXT:    shll $3, %esi
; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-NEXT:    movq %xmm0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq %xmm1, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    shrb $3, %sil
; X64-NEXT:    movzbl %sil, %eax
; X64-NEXT:    movq -64(%rsp,%rax), %rax
; X64-NEXT:    movq %rax, (%rdx)
; X64-NEXT:    retq
;
; X86-LABEL: load_8byte_chunk_of_32byte_alloca_with_zero_upper_half:
; X86:       # %bb.0:
; X86-NEXT:    subl $64, %esp
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    movdqu (%edx), %xmm0
; X86-NEXT:    shll $3, %ecx
; X86-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; X86-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
; X86-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
; X86-NEXT:    movd %xmm0, (%esp)
; X86-NEXT:    movd %xmm3, {{[0-9]+}}(%esp)
; X86-NEXT:    movd %xmm2, {{[0-9]+}}(%esp)
; X86-NEXT:    movd %xmm1, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    shrb $3, %cl
; X86-NEXT:    movzbl %cl, %ecx
; X86-NEXT:    movl (%esp,%ecx), %edx
; X86-NEXT:    movl 4(%esp,%ecx), %ecx
; X86-NEXT:    movl %ecx, 4(%eax)
; X86-NEXT:    movl %edx, (%eax)
; X86-NEXT:    addl $64, %esp
; X86-NEXT:    retl
  %init = load <16 x i8>, ptr %src, align 1
  %intermediate.sroa.0.0.vec.expand = shufflevector <16 x i8> %init, <16 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %intermediate.sroa.0.0.vecblend = shufflevector <32 x i8> %intermediate.sroa.0.0.vec.expand, <32 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
  %byteOff.numbits = shl nuw nsw i64 %byteOff, 3
  %intermediate.val.frozen = freeze <32 x i8> %intermediate.sroa.0.0.vecblend
  %intermediate.val.frozen.bits = bitcast <32 x i8> %intermediate.val.frozen to i256
  %byteOff.numbits.wide = zext i64 %byteOff.numbits to i256
  %intermediate.val.frozen.bits.positioned = lshr i256 %intermediate.val.frozen.bits, %byteOff.numbits.wide
  %intermediate.val.frozen.bits.positioned.extracted = trunc i256 %intermediate.val.frozen.bits.positioned to i64
  store i64 %intermediate.val.frozen.bits.positioned.extracted, ptr %dst, align 8
  ret void
}

define void @load_16byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
; X64-LABEL: load_16byte_chunk_of_32byte_alloca_with_zero_upper_half:
; X64:       # %bb.0:
; X64-NEXT:    movdqu (%rdi), %xmm0
; X64-NEXT:    shll $3, %esi
; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-NEXT:    movq %xmm0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq %xmm1, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    shrb $3, %sil
; X64-NEXT:    movzbl %sil, %eax
; X64-NEXT:    movq -64(%rsp,%rax), %rcx
; X64-NEXT:    movq -56(%rsp,%rax), %rax
; X64-NEXT:    movq %rax, 8(%rdx)
; X64-NEXT:    movq %rcx, (%rdx)
; X64-NEXT:    retq
;
; X86-LABEL: load_16byte_chunk_of_32byte_alloca_with_zero_upper_half:
; X86:       # %bb.0:
; X86-NEXT:    pushl %edi
; X86-NEXT:    pushl %esi
; X86-NEXT:    subl $64, %esp
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    movdqu (%edx), %xmm0
; X86-NEXT:    shll $3, %ecx
; X86-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; X86-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
; X86-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
; X86-NEXT:    movd %xmm0, (%esp)
; X86-NEXT:    movd %xmm3, {{[0-9]+}}(%esp)
; X86-NEXT:    movd %xmm2, {{[0-9]+}}(%esp)
; X86-NEXT:    movd %xmm1, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    shrb $3, %cl
; X86-NEXT:    movzbl %cl, %ecx
; X86-NEXT:    movl (%esp,%ecx), %edx
; X86-NEXT:    movl 4(%esp,%ecx), %esi
; X86-NEXT:    movl 8(%esp,%ecx), %edi
; X86-NEXT:    movl 12(%esp,%ecx), %ecx
; X86-NEXT:    movl %ecx, 12(%eax)
; X86-NEXT:    movl %edi, 8(%eax)
; X86-NEXT:    movl %esi, 4(%eax)
; X86-NEXT:    movl %edx, (%eax)
; X86-NEXT:    addl $64, %esp
; X86-NEXT:    popl %esi
; X86-NEXT:    popl %edi
; X86-NEXT:    retl
  %init = load <16 x i8>, ptr %src, align 1
  %intermediate.sroa.0.0.vec.expand = shufflevector <16 x i8> %init, <16 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %intermediate.sroa.0.0.vecblend = shufflevector <32 x i8> %intermediate.sroa.0.0.vec.expand, <32 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
  %byteOff.numbits = shl nuw nsw i64 %byteOff, 3
  %intermediate.val.frozen = freeze <32 x i8> %intermediate.sroa.0.0.vecblend
  %intermediate.val.frozen.bits = bitcast <32 x i8> %intermediate.val.frozen to i256
  %byteOff.numbits.wide = zext i64 %byteOff.numbits to i256
  %intermediate.val.frozen.bits.positioned = lshr i256 %intermediate.val.frozen.bits, %byteOff.numbits.wide
  %intermediate.val.frozen.bits.positioned.extracted = trunc i256 %intermediate.val.frozen.bits.positioned to i128
  store i128 %intermediate.val.frozen.bits.positioned.extracted, ptr %dst, align 16
  ret void
}

define void @load_1byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
; X64-LABEL: load_1byte_chunk_of_64byte_alloca_with_zero_upper_half:
; X64:       # %bb.0:
; X64-NEXT:    movdqu (%rdi), %xmm0
; X64-NEXT:    movdqu 16(%rdi), %xmm1
; X64-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
; X64-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
; X64-NEXT:    movq %xmm1, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq %xmm0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq %xmm3, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq %xmm2, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    andl $63, %esi
; X64-NEXT:    movzbl -128(%rsp,%rsi), %eax
; X64-NEXT:    movb %al, (%rdx)
; X64-NEXT:    retq
;
; X86-LABEL: load_1byte_chunk_of_64byte_alloca_with_zero_upper_half:
; X86:       # %bb.0:
; X86-NEXT:    subl $128, %esp
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    movdqu (%edx), %xmm0
; X86-NEXT:    movdqu 16(%edx), %xmm1
; X86-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
; X86-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
; X86-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[3,3,3,3]
; X86-NEXT:    pshufd {{.*#+}} xmm5 = xmm1[1,1,1,1]
; X86-NEXT:    pshufd {{.*#+}} xmm6 = xmm1[2,3,2,3]
; X86-NEXT:    pshufd {{.*#+}} xmm7 = xmm1[3,3,3,3]
; X86-NEXT:    movd %xmm1, {{[0-9]+}}(%esp)
; X86-NEXT:    movd %xmm0, (%esp)
; X86-NEXT:    movd %xmm7, {{[0-9]+}}(%esp)
; X86-NEXT:    movd %xmm6, {{[0-9]+}}(%esp)
; X86-NEXT:    movd %xmm5, {{[0-9]+}}(%esp)
; X86-NEXT:    movd %xmm4, {{[0-9]+}}(%esp)
; X86-NEXT:    movd %xmm3, {{[0-9]+}}(%esp)
; X86-NEXT:    movd %xmm2, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    andl $63, %ecx
; X86-NEXT:    movzbl (%esp,%ecx), %ecx
; X86-NEXT:    movb %cl, (%eax)
; X86-NEXT:    addl $128, %esp
; X86-NEXT:    retl
  %init = load <32 x i8>, ptr %src, align 1
  %intermediate.sroa.0.0.vec.expand = shufflevector <32 x i8> %init, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %intermediate.sroa.0.0.vecblend = shufflevector <64 x i8> %intermediate.sroa.0.0.vec.expand, <64 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
  %byteOff.numbits = shl nuw nsw i64 %byteOff, 3
  %intermediate.val.frozen = freeze <64 x i8> %intermediate.sroa.0.0.vecblend
  %intermediate.val.frozen.bits = bitcast <64 x i8> %intermediate.val.frozen to i512
  %byteOff.numbits.wide = zext i64 %byteOff.numbits to i512
  %intermediate.val.frozen.bits.positioned = lshr i512 %intermediate.val.frozen.bits, %byteOff.numbits.wide
  %intermediate.val.frozen.bits.positioned.extracted = trunc i512 %intermediate.val.frozen.bits.positioned to i8
  %1 = insertelement <1 x i8> poison, i8 %intermediate.val.frozen.bits.positioned.extracted, i64 0
  store <1 x i8> %1, ptr %dst, align 1
  ret void
}

define void @load_2byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
; X64-LABEL: load_2byte_chunk_of_64byte_alloca_with_zero_upper_half:
; X64:       # %bb.0:
; X64-NEXT:    movdqu (%rdi), %xmm0
; X64-NEXT:    movdqu 16(%rdi), %xmm1
; X64-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
; X64-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
; X64-NEXT:    movq %xmm1, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq %xmm0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq %xmm3, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq %xmm2, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    andl $63, %esi
; X64-NEXT:    movq -128(%rsp,%rsi), %rax
; X64-NEXT:    movw %ax, (%rdx)
; X64-NEXT:    retq
;
; X86-LABEL: load_2byte_chunk_of_64byte_alloca_with_zero_upper_half:
; X86:       # %bb.0:
; X86-NEXT:    subl $128, %esp
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    movdqu (%edx), %xmm0
; X86-NEXT:    movdqu 16(%edx), %xmm1
; X86-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
; X86-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
; X86-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[3,3,3,3]
; X86-NEXT:    pshufd {{.*#+}} xmm5 = xmm1[1,1,1,1]
; X86-NEXT:    pshufd {{.*#+}} xmm6 = xmm1[2,3,2,3]
; X86-NEXT:    pshufd {{.*#+}} xmm7 = xmm1[3,3,3,3]
; X86-NEXT:    movd %xmm1, {{[0-9]+}}(%esp)
; X86-NEXT:    movd %xmm0, (%esp)
; X86-NEXT:    movd %xmm7, {{[0-9]+}}(%esp)
; X86-NEXT:    movd %xmm6, {{[0-9]+}}(%esp)
; X86-NEXT:    movd %xmm5, {{[0-9]+}}(%esp)
; X86-NEXT:    movd %xmm4, {{[0-9]+}}(%esp)
; X86-NEXT:    movd %xmm3, {{[0-9]+}}(%esp)
; X86-NEXT:    movd %xmm2, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    andl $63, %ecx
; X86-NEXT:    movl (%esp,%ecx), %ecx
; X86-NEXT:    movw %cx, (%eax)
; X86-NEXT:    addl $128, %esp
; X86-NEXT:    retl
  %init = load <32 x i8>, ptr %src, align 1
  %intermediate.sroa.0.0.vec.expand = shufflevector <32 x i8> %init, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %intermediate.sroa.0.0.vecblend = shufflevector <64 x i8> %intermediate.sroa.0.0.vec.expand, <64 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
  %byteOff.numbits = shl nuw nsw i64 %byteOff, 3
  %intermediate.val.frozen = freeze <64 x i8> %intermediate.sroa.0.0.vecblend
  %intermediate.val.frozen.bits = bitcast <64 x i8> %intermediate.val.frozen to i512
  %byteOff.numbits.wide = zext i64 %byteOff.numbits to i512
  %intermediate.val.frozen.bits.positioned = lshr i512 %intermediate.val.frozen.bits, %byteOff.numbits.wide
  %intermediate.val.frozen.bits.positioned.extracted = trunc i512 %intermediate.val.frozen.bits.positioned to i16
  store i16 %intermediate.val.frozen.bits.positioned.extracted, ptr %dst, align 2
  ret void
}

define void @load_4byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
; X64-LABEL: load_4byte_chunk_of_64byte_alloca_with_zero_upper_half:
; X64:       # %bb.0:
; X64-NEXT:    movdqu (%rdi), %xmm0
; X64-NEXT:    movdqu 16(%rdi), %xmm1
; X64-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
; X64-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
; X64-NEXT:    movq %xmm1, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq %xmm0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq %xmm3, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq %xmm2, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    andl $63, %esi
; X64-NEXT:    movl -128(%rsp,%rsi), %eax
; X64-NEXT:    movl %eax, (%rdx)
; X64-NEXT:    retq
;
; X86-LABEL: load_4byte_chunk_of_64byte_alloca_with_zero_upper_half:
; X86:       # %bb.0:
; X86-NEXT:    subl $128, %esp
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    movdqu (%edx), %xmm0
; X86-NEXT:    movdqu 16(%edx), %xmm1
; X86-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
; X86-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
; X86-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[3,3,3,3]
; X86-NEXT:    pshufd {{.*#+}} xmm5 = xmm1[1,1,1,1]
; X86-NEXT:    pshufd {{.*#+}} xmm6 = xmm1[2,3,2,3]
; X86-NEXT:    pshufd {{.*#+}} xmm7 = xmm1[3,3,3,3]
; X86-NEXT:    movd %xmm1, {{[0-9]+}}(%esp)
; X86-NEXT:    movd %xmm0, (%esp)
; X86-NEXT:    movd %xmm7, {{[0-9]+}}(%esp)
; X86-NEXT:    movd %xmm6, {{[0-9]+}}(%esp)
; X86-NEXT:    movd %xmm5, {{[0-9]+}}(%esp)
; X86-NEXT:    movd %xmm4, {{[0-9]+}}(%esp)
; X86-NEXT:    movd %xmm3, {{[0-9]+}}(%esp)
; X86-NEXT:    movd %xmm2, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    andl $63, %ecx
; X86-NEXT:    movl (%esp,%ecx), %ecx
; X86-NEXT:    movl %ecx, (%eax)
; X86-NEXT:    addl $128, %esp
; X86-NEXT:    retl
  %init = load <32 x i8>, ptr %src, align 1
  %intermediate.sroa.0.0.vec.expand = shufflevector <32 x i8> %init, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %intermediate.sroa.0.0.vecblend = shufflevector <64 x i8> %intermediate.sroa.0.0.vec.expand, <64 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
  %byteOff.numbits = shl nuw nsw i64 %byteOff, 3
  %intermediate.val.frozen = freeze <64 x i8> %intermediate.sroa.0.0.vecblend
  %intermediate.val.frozen.bits = bitcast <64 x i8> %intermediate.val.frozen to i512
  %byteOff.numbits.wide = zext i64 %byteOff.numbits to i512
  %intermediate.val.frozen.bits.positioned = lshr i512 %intermediate.val.frozen.bits, %byteOff.numbits.wide
  %intermediate.val.frozen.bits.positioned.extracted = trunc i512 %intermediate.val.frozen.bits.positioned to i32
  store i32 %intermediate.val.frozen.bits.positioned.extracted, ptr %dst, align 4
  ret void
}

define void @load_8byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
; X64-LABEL: load_8byte_chunk_of_64byte_alloca_with_zero_upper_half:
; X64:       # %bb.0:
; X64-NEXT:    movdqu (%rdi), %xmm0
; X64-NEXT:    movdqu 16(%rdi), %xmm1
; X64-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
; X64-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
; X64-NEXT:    movq %xmm1, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq %xmm0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq %xmm3, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq %xmm2, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    andl $63, %esi
; X64-NEXT:    movq -128(%rsp,%rsi), %rax
; X64-NEXT:    movq %rax, (%rdx)
; X64-NEXT:    retq
;
; X86-LABEL: load_8byte_chunk_of_64byte_alloca_with_zero_upper_half:
; X86:       # %bb.0:
; X86-NEXT:    subl $128, %esp
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    movdqu (%edx), %xmm0
; X86-NEXT:    movdqu 16(%edx), %xmm1
; X86-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
; X86-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
; X86-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[3,3,3,3]
; X86-NEXT:    pshufd {{.*#+}} xmm5 = xmm1[1,1,1,1]
; X86-NEXT:    pshufd {{.*#+}} xmm6 = xmm1[2,3,2,3]
; X86-NEXT:    pshufd {{.*#+}} xmm7 = xmm1[3,3,3,3]
; X86-NEXT:    movd %xmm1, {{[0-9]+}}(%esp)
; X86-NEXT:    movd %xmm0, (%esp)
; X86-NEXT:    movd %xmm7, {{[0-9]+}}(%esp)
; X86-NEXT:    movd %xmm6, {{[0-9]+}}(%esp)
; X86-NEXT:    movd %xmm5, {{[0-9]+}}(%esp)
; X86-NEXT:    movd %xmm4, {{[0-9]+}}(%esp)
; X86-NEXT:    movd %xmm3, {{[0-9]+}}(%esp)
; X86-NEXT:    movd %xmm2, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    andl $63, %ecx
; X86-NEXT:    movl (%esp,%ecx), %edx
; X86-NEXT:    movl 4(%esp,%ecx), %ecx
; X86-NEXT:    movl %ecx, 4(%eax)
; X86-NEXT:    movl %edx, (%eax)
; X86-NEXT:    addl $128, %esp
; X86-NEXT:    retl
  %init = load <32 x i8>, ptr %src, align 1
  %intermediate.sroa.0.0.vec.expand = shufflevector <32 x i8> %init, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %intermediate.sroa.0.0.vecblend = shufflevector <64 x i8> %intermediate.sroa.0.0.vec.expand, <64 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
  %byteOff.numbits = shl nuw nsw i64 %byteOff, 3
  %intermediate.val.frozen = freeze <64 x i8> %intermediate.sroa.0.0.vecblend
  %intermediate.val.frozen.bits = bitcast <64 x i8> %intermediate.val.frozen to i512
  %byteOff.numbits.wide = zext i64 %byteOff.numbits to i512
  %intermediate.val.frozen.bits.positioned = lshr i512 %intermediate.val.frozen.bits, %byteOff.numbits.wide
  %intermediate.val.frozen.bits.positioned.extracted = trunc i512 %intermediate.val.frozen.bits.positioned to i64
  store i64 %intermediate.val.frozen.bits.positioned.extracted, ptr %dst, align 8
  ret void
}

define void @load_16byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
; X64-LABEL: load_16byte_chunk_of_64byte_alloca_with_zero_upper_half:
; X64:       # %bb.0:
; X64-NEXT:    movdqu (%rdi), %xmm0
; X64-NEXT:    movdqu 16(%rdi), %xmm1
; X64-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
; X64-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
; X64-NEXT:    movq %xmm1, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq %xmm0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq %xmm3, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq %xmm2, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    andl $63, %esi
; X64-NEXT:    movq -128(%rsp,%rsi), %rax
; X64-NEXT:    movq -120(%rsp,%rsi), %rcx
; X64-NEXT:    movq %rcx, 8(%rdx)
; X64-NEXT:    movq %rax, (%rdx)
; X64-NEXT:    retq
;
; X86-LABEL: load_16byte_chunk_of_64byte_alloca_with_zero_upper_half:
; X86:       # %bb.0:
; X86-NEXT:    pushl %edi
; X86-NEXT:    pushl %esi
; X86-NEXT:    subl $128, %esp
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    movdqu (%edx), %xmm0
; X86-NEXT:    movdqu 16(%edx), %xmm1
; X86-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
; X86-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
; X86-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[3,3,3,3]
; X86-NEXT:    pshufd {{.*#+}} xmm5 = xmm1[1,1,1,1]
; X86-NEXT:    pshufd {{.*#+}} xmm6 = xmm1[2,3,2,3]
; X86-NEXT:    pshufd {{.*#+}} xmm7 = xmm1[3,3,3,3]
; X86-NEXT:    movd %xmm1, {{[0-9]+}}(%esp)
; X86-NEXT:    movd %xmm0, (%esp)
; X86-NEXT:    movd %xmm7, {{[0-9]+}}(%esp)
; X86-NEXT:    movd %xmm6, {{[0-9]+}}(%esp)
; X86-NEXT:    movd %xmm5, {{[0-9]+}}(%esp)
; X86-NEXT:    movd %xmm4, {{[0-9]+}}(%esp)
; X86-NEXT:    movd %xmm3, {{[0-9]+}}(%esp)
; X86-NEXT:    movd %xmm2, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    andl $63, %ecx
; X86-NEXT:    movl (%esp,%ecx), %edx
; X86-NEXT:    movl 4(%esp,%ecx), %esi
; X86-NEXT:    movl 8(%esp,%ecx), %edi
; X86-NEXT:    movl 12(%esp,%ecx), %ecx
; X86-NEXT:    movl %ecx, 12(%eax)
; X86-NEXT:    movl %edi, 8(%eax)
; X86-NEXT:    movl %esi, 4(%eax)
; X86-NEXT:    movl %edx, (%eax)
; X86-NEXT:    addl $128, %esp
; X86-NEXT:    popl %esi
; X86-NEXT:    popl %edi
; X86-NEXT:    retl
  %init = load <32 x i8>, ptr %src, align 1
  %intermediate.sroa.0.0.vec.expand = shufflevector <32 x i8> %init, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %intermediate.sroa.0.0.vecblend = shufflevector <64 x i8> %intermediate.sroa.0.0.vec.expand, <64 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
  %byteOff.numbits = shl nuw nsw i64 %byteOff, 3
  %intermediate.val.frozen = freeze <64 x i8> %intermediate.sroa.0.0.vecblend
  %intermediate.val.frozen.bits = bitcast <64 x i8> %intermediate.val.frozen to i512
  %byteOff.numbits.wide = zext i64 %byteOff.numbits to i512
  %intermediate.val.frozen.bits.positioned = lshr i512 %intermediate.val.frozen.bits, %byteOff.numbits.wide
  %intermediate.val.frozen.bits.positioned.extracted = trunc i512 %intermediate.val.frozen.bits.positioned to i128
  store i128 %intermediate.val.frozen.bits.positioned.extracted, ptr %dst, align 16
  ret void
}

define void @load_32byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
; X64-LABEL: load_32byte_chunk_of_64byte_alloca_with_zero_upper_half:
; X64:       # %bb.0:
; X64-NEXT:    movdqu (%rdi), %xmm0
; X64-NEXT:    movdqu 16(%rdi), %xmm1
; X64-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
; X64-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
; X64-NEXT:    movq %xmm1, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq %xmm0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq %xmm3, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq %xmm2, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    andl $63, %esi
; X64-NEXT:    movq -128(%rsp,%rsi), %rax
; X64-NEXT:    movq -120(%rsp,%rsi), %rcx
; X64-NEXT:    movq -112(%rsp,%rsi), %rdi
; X64-NEXT:    movq -104(%rsp,%rsi), %rsi
; X64-NEXT:    movq %rsi, 24(%rdx)
; X64-NEXT:    movq %rdi, 16(%rdx)
; X64-NEXT:    movq %rcx, 8(%rdx)
; X64-NEXT:    movq %rax, (%rdx)
; X64-NEXT:    retq
;
; X86-LABEL: load_32byte_chunk_of_64byte_alloca_with_zero_upper_half:
; X86:       # %bb.0:
; X86-NEXT:    pushl %ebp
; X86-NEXT:    pushl %ebx
; X86-NEXT:    pushl %edi
; X86-NEXT:    pushl %esi
; X86-NEXT:    subl $136, %esp
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movdqu (%ecx), %xmm0
; X86-NEXT:    movdqu 16(%ecx), %xmm1
; X86-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
; X86-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
; X86-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[3,3,3,3]
; X86-NEXT:    pshufd {{.*#+}} xmm5 = xmm1[1,1,1,1]
; X86-NEXT:    pshufd {{.*#+}} xmm6 = xmm1[2,3,2,3]
; X86-NEXT:    pshufd {{.*#+}} xmm7 = xmm1[3,3,3,3]
; X86-NEXT:    movd %xmm1, {{[0-9]+}}(%esp)
; X86-NEXT:    movd %xmm0, {{[0-9]+}}(%esp)
; X86-NEXT:    movd %xmm7, {{[0-9]+}}(%esp)
; X86-NEXT:    movd %xmm6, {{[0-9]+}}(%esp)
; X86-NEXT:    movd %xmm5, {{[0-9]+}}(%esp)
; X86-NEXT:    movd %xmm4, {{[0-9]+}}(%esp)
; X86-NEXT:    movd %xmm3, {{[0-9]+}}(%esp)
; X86-NEXT:    movd %xmm2, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    andl $63, %eax
; X86-NEXT:    movl 8(%esp,%eax), %ecx
; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT:    movl 12(%esp,%eax), %ecx
; X86-NEXT:    movl %ecx, (%esp) # 4-byte Spill
; X86-NEXT:    movl 16(%esp,%eax), %esi
; X86-NEXT:    movl 20(%esp,%eax), %edi
; X86-NEXT:    movl 24(%esp,%eax), %ebx
; X86-NEXT:    movl 28(%esp,%eax), %ebp
; X86-NEXT:    movl 32(%esp,%eax), %edx
; X86-NEXT:    movl 36(%esp,%eax), %ecx
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl %ecx, 28(%eax)
; X86-NEXT:    movl %edx, 24(%eax)
; X86-NEXT:    movl %ebp, 20(%eax)
; X86-NEXT:    movl %ebx, 16(%eax)
; X86-NEXT:    movl %edi, 12(%eax)
; X86-NEXT:    movl %esi, 8(%eax)
; X86-NEXT:    movl (%esp), %ecx # 4-byte Reload
; X86-NEXT:    movl %ecx, 4(%eax)
; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT:    movl %ecx, (%eax)
; X86-NEXT:    addl $136, %esp
; X86-NEXT:    popl %esi
; X86-NEXT:    popl %edi
; X86-NEXT:    popl %ebx
; X86-NEXT:    popl %ebp
; X86-NEXT:    retl
  %init = load <32 x i8>, ptr %src, align 1
  %intermediate.sroa.0.0.vec.expand = shufflevector <32 x i8> %init, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %intermediate.sroa.0.0.vecblend = shufflevector <64 x i8> %intermediate.sroa.0.0.vec.expand, <64 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
  %byteOff.numbits = shl nuw nsw i64 %byteOff, 3
  %intermediate.val.frozen = freeze <64 x i8> %intermediate.sroa.0.0.vecblend
  %intermediate.val.frozen.bits = bitcast <64 x i8> %intermediate.val.frozen to i512
  %byteOff.numbits.wide = zext i64 %byteOff.numbits to i512
  %intermediate.val.frozen.bits.positioned = lshr i512 %intermediate.val.frozen.bits, %byteOff.numbits.wide
  %intermediate.val.frozen.bits.positioned.extracted = trunc i512 %intermediate.val.frozen.bits.positioned to i256
  store i256 %intermediate.val.frozen.bits.positioned.extracted, ptr %dst, align 32
  ret void
}
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; ALL: {{.*}}
; X64-HAVE-BMI2-HAVE-SHLD: {{.*}}
; X64-NO-BMI2-HAVE-SHLD: {{.*}}
; X64-NO-SHLD: {{.*}}
; X86-HAVE-BMI2-HAVE-SHLD: {{.*}}
; X86-NO-BMI2-HAVE-SHLD: {{.*}}
; X86-NO-SHLD: {{.*}}