llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask.ll

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=-sse,-sse2 < %s | FileCheck %s --check-prefixes=CHECK,CHECK-BASELINE
; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+sse,-sse2 < %s | FileCheck %s --check-prefixes=CHECK,CHECK-SSE1
; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+sse,+sse2 < %s | FileCheck %s --check-prefixes=CHECK,CHECK-SSE2
; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+xop < %s | FileCheck %s --check-prefixes=CHECK,CHECK-XOP

; https://bugs.llvm.org/show_bug.cgi?id=37104

; All the advanced stuff (negative tests, commutativity) is handled in the
; scalar version of the test only.
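;
; Note (descriptive, inferred from the IR and checks below): every test here is
; the canonical masked-merge pattern with a variable (non-constant) mask,
;   %r = (%x & %mask) | (%y & ~%mask)
; Where the vector has to be scalarized (CHECK-BASELINE and, for integer
; vectors, CHECK-SSE1), the merge is expected to be unfolded per element into
; the two-xor form
;   %r = ((%x ^ %y) & %mask) ^ %y
; which is what the xor/and/xor sequences in those checks match, while
; CHECK-SSE2 keeps the and/andn/or form and CHECK-XOP selects a single vpcmov.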

; ============================================================================ ;
; 8-bit vector width
; ============================================================================ ;

define <1 x i8> @out_v1i8(<1 x i8> %x, <1 x i8> %y, <1 x i8> %mask) nounwind {
; CHECK-LABEL: out_v1i8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movl %edx, %eax
; CHECK-NEXT:    andl %edx, %edi
; CHECK-NEXT:    notb %al
; CHECK-NEXT:    andb %sil, %al
; CHECK-NEXT:    orb %dil, %al
; CHECK-NEXT:    # kill: def $al killed $al killed $eax
; CHECK-NEXT:    retq
  %mx = and <1 x i8> %x, %mask
  %notmask = xor <1 x i8> %mask, <i8 -1>
  %my = and <1 x i8> %y, %notmask
  %r = or <1 x i8> %mx, %my
  ret <1 x i8> %r
}

; ============================================================================ ;
; 16-bit vector width
; ============================================================================ ;

define <2 x i8> @out_v2i8(<2 x i8> %x, <2 x i8> %y, <2 x i8> %mask) nounwind {
; CHECK-BASELINE-LABEL: out_v2i8:
; CHECK-BASELINE:       # %bb.0:
; CHECK-BASELINE-NEXT:    movl %r8d, %eax
; CHECK-BASELINE-NEXT:    andl %r9d, %esi
; CHECK-BASELINE-NEXT:    andl %r8d, %edi
; CHECK-BASELINE-NEXT:    notb %al
; CHECK-BASELINE-NEXT:    notb %r9b
; CHECK-BASELINE-NEXT:    andb %cl, %r9b
; CHECK-BASELINE-NEXT:    andb %dl, %al
; CHECK-BASELINE-NEXT:    orb %dil, %al
; CHECK-BASELINE-NEXT:    orb %sil, %r9b
; CHECK-BASELINE-NEXT:    # kill: def $al killed $al killed $eax
; CHECK-BASELINE-NEXT:    movl %r9d, %edx
; CHECK-BASELINE-NEXT:    retq
;
; CHECK-SSE1-LABEL: out_v2i8:
; CHECK-SSE1:       # %bb.0:
; CHECK-SSE1-NEXT:    movl %r8d, %eax
; CHECK-SSE1-NEXT:    andl %r9d, %esi
; CHECK-SSE1-NEXT:    andl %r8d, %edi
; CHECK-SSE1-NEXT:    notb %al
; CHECK-SSE1-NEXT:    notb %r9b
; CHECK-SSE1-NEXT:    andb %cl, %r9b
; CHECK-SSE1-NEXT:    andb %dl, %al
; CHECK-SSE1-NEXT:    orb %dil, %al
; CHECK-SSE1-NEXT:    orb %sil, %r9b
; CHECK-SSE1-NEXT:    # kill: def $al killed $al killed $eax
; CHECK-SSE1-NEXT:    movl %r9d, %edx
; CHECK-SSE1-NEXT:    retq
;
; CHECK-SSE2-LABEL: out_v2i8:
; CHECK-SSE2:       # %bb.0:
; CHECK-SSE2-NEXT:    andps %xmm2, %xmm0
; CHECK-SSE2-NEXT:    andnps %xmm1, %xmm2
; CHECK-SSE2-NEXT:    orps %xmm2, %xmm0
; CHECK-SSE2-NEXT:    retq
;
; CHECK-XOP-LABEL: out_v2i8:
; CHECK-XOP:       # %bb.0:
; CHECK-XOP-NEXT:    vpcmov %xmm2, %xmm1, %xmm0, %xmm0
; CHECK-XOP-NEXT:    retq
  %mx = and <2 x i8> %x, %mask
  %notmask = xor <2 x i8> %mask, <i8 -1, i8 -1>
  %my = and <2 x i8> %y, %notmask
  %r = or <2 x i8> %mx, %my
  ret <2 x i8> %r
}

define <1 x i16> @out_v1i16(<1 x i16> %x, <1 x i16> %y, <1 x i16> %mask) nounwind {
; CHECK-LABEL: out_v1i16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movl %edx, %eax
; CHECK-NEXT:    andl %edx, %edi
; CHECK-NEXT:    notl %eax
; CHECK-NEXT:    andl %esi, %eax
; CHECK-NEXT:    orl %edi, %eax
; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
; CHECK-NEXT:    retq
  %mx = and <1 x i16> %x, %mask
  %notmask = xor <1 x i16> %mask, <i16 -1>
  %my = and <1 x i16> %y, %notmask
  %r = or <1 x i16> %mx, %my
  ret <1 x i16> %r
}

; ============================================================================ ;
; 32-bit vector width
; ============================================================================ ;

define <4 x i8> @out_v4i8(<4 x i8> %x, <4 x i8> %y, <4 x i8> %mask) nounwind {
; CHECK-BASELINE-LABEL: out_v4i8:
; CHECK-BASELINE:       # %bb.0:
; CHECK-BASELINE-NEXT:    movq %rdi, %rax
; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %edi
; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %r10d
; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %r11d
; CHECK-BASELINE-NEXT:    xorl %r9d, %esi
; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %sil
; CHECK-BASELINE-NEXT:    xorb %r9b, %sil
; CHECK-BASELINE-NEXT:    xorb %r11b, %dl
; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %dl
; CHECK-BASELINE-NEXT:    xorb %r11b, %dl
; CHECK-BASELINE-NEXT:    xorb %r10b, %cl
; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %cl
; CHECK-BASELINE-NEXT:    xorb %r10b, %cl
; CHECK-BASELINE-NEXT:    xorb %dil, %r8b
; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %r8b
; CHECK-BASELINE-NEXT:    xorb %dil, %r8b
; CHECK-BASELINE-NEXT:    movb %r8b, 3(%rax)
; CHECK-BASELINE-NEXT:    movb %cl, 2(%rax)
; CHECK-BASELINE-NEXT:    movb %dl, 1(%rax)
; CHECK-BASELINE-NEXT:    movb %sil, (%rax)
; CHECK-BASELINE-NEXT:    retq
;
; CHECK-SSE1-LABEL: out_v4i8:
; CHECK-SSE1:       # %bb.0:
; CHECK-SSE1-NEXT:    movq %rdi, %rax
; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %edi
; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %r10d
; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %r11d
; CHECK-SSE1-NEXT:    xorl %r9d, %esi
; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %sil
; CHECK-SSE1-NEXT:    xorb %r9b, %sil
; CHECK-SSE1-NEXT:    xorb %r11b, %dl
; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %dl
; CHECK-SSE1-NEXT:    xorb %r11b, %dl
; CHECK-SSE1-NEXT:    xorb %r10b, %cl
; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %cl
; CHECK-SSE1-NEXT:    xorb %r10b, %cl
; CHECK-SSE1-NEXT:    xorb %dil, %r8b
; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %r8b
; CHECK-SSE1-NEXT:    xorb %dil, %r8b
; CHECK-SSE1-NEXT:    movb %r8b, 3(%rax)
; CHECK-SSE1-NEXT:    movb %cl, 2(%rax)
; CHECK-SSE1-NEXT:    movb %dl, 1(%rax)
; CHECK-SSE1-NEXT:    movb %sil, (%rax)
; CHECK-SSE1-NEXT:    retq
;
; CHECK-SSE2-LABEL: out_v4i8:
; CHECK-SSE2:       # %bb.0:
; CHECK-SSE2-NEXT:    andps %xmm2, %xmm0
; CHECK-SSE2-NEXT:    andnps %xmm1, %xmm2
; CHECK-SSE2-NEXT:    orps %xmm2, %xmm0
; CHECK-SSE2-NEXT:    retq
;
; CHECK-XOP-LABEL: out_v4i8:
; CHECK-XOP:       # %bb.0:
; CHECK-XOP-NEXT:    vpcmov %xmm2, %xmm1, %xmm0, %xmm0
; CHECK-XOP-NEXT:    retq
  %mx = and <4 x i8> %x, %mask
  %notmask = xor <4 x i8> %mask, <i8 -1, i8 -1, i8 -1, i8 -1>
  %my = and <4 x i8> %y, %notmask
  %r = or <4 x i8> %mx, %my
  ret <4 x i8> %r
}

define <4 x i8> @out_v4i8_undef(<4 x i8> %x, <4 x i8> %y, <4 x i8> %mask) nounwind {
; CHECK-BASELINE-LABEL: out_v4i8_undef:
; CHECK-BASELINE:       # %bb.0:
; CHECK-BASELINE-NEXT:    movq %rdi, %rax
; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %edi
; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %r10d
; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %cl
; CHECK-BASELINE-NEXT:    xorl %r9d, %esi
; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %sil
; CHECK-BASELINE-NEXT:    xorb %r9b, %sil
; CHECK-BASELINE-NEXT:    xorb %r10b, %dl
; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %dl
; CHECK-BASELINE-NEXT:    xorb %r10b, %dl
; CHECK-BASELINE-NEXT:    xorb %dil, %r8b
; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %r8b
; CHECK-BASELINE-NEXT:    xorb %dil, %r8b
; CHECK-BASELINE-NEXT:    movb %cl, 2(%rax)
; CHECK-BASELINE-NEXT:    movb %r8b, 3(%rax)
; CHECK-BASELINE-NEXT:    movb %dl, 1(%rax)
; CHECK-BASELINE-NEXT:    movb %sil, (%rax)
; CHECK-BASELINE-NEXT:    retq
;
; CHECK-SSE1-LABEL: out_v4i8_undef:
; CHECK-SSE1:       # %bb.0:
; CHECK-SSE1-NEXT:    movq %rdi, %rax
; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %edi
; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %r10d
; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %cl
; CHECK-SSE1-NEXT:    xorl %r9d, %esi
; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %sil
; CHECK-SSE1-NEXT:    xorb %r9b, %sil
; CHECK-SSE1-NEXT:    xorb %r10b, %dl
; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %dl
; CHECK-SSE1-NEXT:    xorb %r10b, %dl
; CHECK-SSE1-NEXT:    xorb %dil, %r8b
; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %r8b
; CHECK-SSE1-NEXT:    xorb %dil, %r8b
; CHECK-SSE1-NEXT:    movb %cl, 2(%rax)
; CHECK-SSE1-NEXT:    movb %r8b, 3(%rax)
; CHECK-SSE1-NEXT:    movb %dl, 1(%rax)
; CHECK-SSE1-NEXT:    movb %sil, (%rax)
; CHECK-SSE1-NEXT:    retq
;
; CHECK-SSE2-LABEL: out_v4i8_undef:
; CHECK-SSE2:       # %bb.0:
; CHECK-SSE2-NEXT:    andps %xmm2, %xmm0
; CHECK-SSE2-NEXT:    andnps %xmm1, %xmm2
; CHECK-SSE2-NEXT:    orps %xmm2, %xmm0
; CHECK-SSE2-NEXT:    retq
;
; CHECK-XOP-LABEL: out_v4i8_undef:
; CHECK-XOP:       # %bb.0:
; CHECK-XOP-NEXT:    vpcmov %xmm2, %xmm1, %xmm0, %xmm0
; CHECK-XOP-NEXT:    retq
  %mx = and <4 x i8> %x, %mask
  %notmask = xor <4 x i8> %mask, <i8 -1, i8 -1, i8 undef, i8 -1>
  %my = and <4 x i8> %y, %notmask
  %r = or <4 x i8> %mx, %my
  ret <4 x i8> %r
}

define <2 x i16> @out_v2i16(<2 x i16> %x, <2 x i16> %y, <2 x i16> %mask) nounwind {
; CHECK-BASELINE-LABEL: out_v2i16:
; CHECK-BASELINE:       # %bb.0:
; CHECK-BASELINE-NEXT:    movl %r8d, %eax
; CHECK-BASELINE-NEXT:    andl %r9d, %esi
; CHECK-BASELINE-NEXT:    andl %r8d, %edi
; CHECK-BASELINE-NEXT:    notl %eax
; CHECK-BASELINE-NEXT:    notl %r9d
; CHECK-BASELINE-NEXT:    andl %ecx, %r9d
; CHECK-BASELINE-NEXT:    orl %esi, %r9d
; CHECK-BASELINE-NEXT:    andl %edx, %eax
; CHECK-BASELINE-NEXT:    orl %edi, %eax
; CHECK-BASELINE-NEXT:    # kill: def $ax killed $ax killed $eax
; CHECK-BASELINE-NEXT:    movl %r9d, %edx
; CHECK-BASELINE-NEXT:    retq
;
; CHECK-SSE1-LABEL: out_v2i16:
; CHECK-SSE1:       # %bb.0:
; CHECK-SSE1-NEXT:    movl %r8d, %eax
; CHECK-SSE1-NEXT:    andl %r9d, %esi
; CHECK-SSE1-NEXT:    andl %r8d, %edi
; CHECK-SSE1-NEXT:    notl %eax
; CHECK-SSE1-NEXT:    notl %r9d
; CHECK-SSE1-NEXT:    andl %ecx, %r9d
; CHECK-SSE1-NEXT:    orl %esi, %r9d
; CHECK-SSE1-NEXT:    andl %edx, %eax
; CHECK-SSE1-NEXT:    orl %edi, %eax
; CHECK-SSE1-NEXT:    # kill: def $ax killed $ax killed $eax
; CHECK-SSE1-NEXT:    movl %r9d, %edx
; CHECK-SSE1-NEXT:    retq
;
; CHECK-SSE2-LABEL: out_v2i16:
; CHECK-SSE2:       # %bb.0:
; CHECK-SSE2-NEXT:    andps %xmm2, %xmm0
; CHECK-SSE2-NEXT:    andnps %xmm1, %xmm2
; CHECK-SSE2-NEXT:    orps %xmm2, %xmm0
; CHECK-SSE2-NEXT:    retq
;
; CHECK-XOP-LABEL: out_v2i16:
; CHECK-XOP:       # %bb.0:
; CHECK-XOP-NEXT:    vpcmov %xmm2, %xmm1, %xmm0, %xmm0
; CHECK-XOP-NEXT:    retq
  %mx = and <2 x i16> %x, %mask
  %notmask = xor <2 x i16> %mask, <i16 -1, i16 -1>
  %my = and <2 x i16> %y, %notmask
  %r = or <2 x i16> %mx, %my
  ret <2 x i16> %r
}

define <1 x i32> @out_v1i32(<1 x i32> %x, <1 x i32> %y, <1 x i32> %mask) nounwind {
; CHECK-LABEL: out_v1i32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movl %edi, %eax
; CHECK-NEXT:    xorl %esi, %eax
; CHECK-NEXT:    andl %edx, %eax
; CHECK-NEXT:    xorl %esi, %eax
; CHECK-NEXT:    retq
  %mx = and <1 x i32> %x, %mask
  %notmask = xor <1 x i32> %mask, <i32 -1>
  %my = and <1 x i32> %y, %notmask
  %r = or <1 x i32> %mx, %my
  ret <1 x i32> %r
}

; ============================================================================ ;
; 64-bit vector width
; ============================================================================ ;

define <8 x i8> @out_v8i8(<8 x i8> %x, <8 x i8> %y, <8 x i8> %mask) nounwind {
; CHECK-BASELINE-LABEL: out_v8i8:
; CHECK-BASELINE:       # %bb.0:
; CHECK-BASELINE-NEXT:    pushq %rbp
; CHECK-BASELINE-NEXT:    pushq %r15
; CHECK-BASELINE-NEXT:    pushq %r14
; CHECK-BASELINE-NEXT:    pushq %r12
; CHECK-BASELINE-NEXT:    pushq %rbx
; CHECK-BASELINE-NEXT:    movq %rdi, %rax
; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %edi
; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %r10d
; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %r11d
; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %ebx
; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %ebp
; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %r14d
; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %r15d
; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %r12d
; CHECK-BASELINE-NEXT:    xorb %r12b, %sil
; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %sil
; CHECK-BASELINE-NEXT:    xorb %r12b, %sil
; CHECK-BASELINE-NEXT:    xorb %r15b, %dl
; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %dl
; CHECK-BASELINE-NEXT:    xorb %r15b, %dl
; CHECK-BASELINE-NEXT:    xorb %r14b, %cl
; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %cl
; CHECK-BASELINE-NEXT:    xorb %r14b, %cl
; CHECK-BASELINE-NEXT:    xorb %bpl, %r8b
; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %r8b
; CHECK-BASELINE-NEXT:    xorb %bpl, %r8b
; CHECK-BASELINE-NEXT:    xorb %bl, %r9b
; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %r9b
; CHECK-BASELINE-NEXT:    xorb %bl, %r9b
; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %ebx
; CHECK-BASELINE-NEXT:    xorb %r11b, %bl
; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %bl
; CHECK-BASELINE-NEXT:    xorb %r11b, %bl
; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %r11d
; CHECK-BASELINE-NEXT:    xorb %r10b, %r11b
; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %r11b
; CHECK-BASELINE-NEXT:    xorb %r10b, %r11b
; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %r10d
; CHECK-BASELINE-NEXT:    xorb %dil, %r10b
; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %r10b
; CHECK-BASELINE-NEXT:    xorb %dil, %r10b
; CHECK-BASELINE-NEXT:    movb %r10b, 7(%rax)
; CHECK-BASELINE-NEXT:    movb %r11b, 6(%rax)
; CHECK-BASELINE-NEXT:    movb %bl, 5(%rax)
; CHECK-BASELINE-NEXT:    movb %r9b, 4(%rax)
; CHECK-BASELINE-NEXT:    movb %r8b, 3(%rax)
; CHECK-BASELINE-NEXT:    movb %cl, 2(%rax)
; CHECK-BASELINE-NEXT:    movb %dl, 1(%rax)
; CHECK-BASELINE-NEXT:    movb %sil, (%rax)
; CHECK-BASELINE-NEXT:    popq %rbx
; CHECK-BASELINE-NEXT:    popq %r12
; CHECK-BASELINE-NEXT:    popq %r14
; CHECK-BASELINE-NEXT:    popq %r15
; CHECK-BASELINE-NEXT:    popq %rbp
; CHECK-BASELINE-NEXT:    retq
;
; CHECK-SSE1-LABEL: out_v8i8:
; CHECK-SSE1:       # %bb.0:
; CHECK-SSE1-NEXT:    pushq %rbp
; CHECK-SSE1-NEXT:    pushq %r15
; CHECK-SSE1-NEXT:    pushq %r14
; CHECK-SSE1-NEXT:    pushq %r12
; CHECK-SSE1-NEXT:    pushq %rbx
; CHECK-SSE1-NEXT:    movq %rdi, %rax
; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %edi
; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %r10d
; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %r11d
; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %ebx
; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %ebp
; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %r14d
; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %r15d
; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %r12d
; CHECK-SSE1-NEXT:    xorb %r12b, %sil
; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %sil
; CHECK-SSE1-NEXT:    xorb %r12b, %sil
; CHECK-SSE1-NEXT:    xorb %r15b, %dl
; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %dl
; CHECK-SSE1-NEXT:    xorb %r15b, %dl
; CHECK-SSE1-NEXT:    xorb %r14b, %cl
; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %cl
; CHECK-SSE1-NEXT:    xorb %r14b, %cl
; CHECK-SSE1-NEXT:    xorb %bpl, %r8b
; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %r8b
; CHECK-SSE1-NEXT:    xorb %bpl, %r8b
; CHECK-SSE1-NEXT:    xorb %bl, %r9b
; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %r9b
; CHECK-SSE1-NEXT:    xorb %bl, %r9b
; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %ebx
; CHECK-SSE1-NEXT:    xorb %r11b, %bl
; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %bl
; CHECK-SSE1-NEXT:    xorb %r11b, %bl
; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %r11d
; CHECK-SSE1-NEXT:    xorb %r10b, %r11b
; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %r11b
; CHECK-SSE1-NEXT:    xorb %r10b, %r11b
; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %r10d
; CHECK-SSE1-NEXT:    xorb %dil, %r10b
; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %r10b
; CHECK-SSE1-NEXT:    xorb %dil, %r10b
; CHECK-SSE1-NEXT:    movb %r10b, 7(%rax)
; CHECK-SSE1-NEXT:    movb %r11b, 6(%rax)
; CHECK-SSE1-NEXT:    movb %bl, 5(%rax)
; CHECK-SSE1-NEXT:    movb %r9b, 4(%rax)
; CHECK-SSE1-NEXT:    movb %r8b, 3(%rax)
; CHECK-SSE1-NEXT:    movb %cl, 2(%rax)
; CHECK-SSE1-NEXT:    movb %dl, 1(%rax)
; CHECK-SSE1-NEXT:    movb %sil, (%rax)
; CHECK-SSE1-NEXT:    popq %rbx
; CHECK-SSE1-NEXT:    popq %r12
; CHECK-SSE1-NEXT:    popq %r14
; CHECK-SSE1-NEXT:    popq %r15
; CHECK-SSE1-NEXT:    popq %rbp
; CHECK-SSE1-NEXT:    retq
;
; CHECK-SSE2-LABEL: out_v8i8:
; CHECK-SSE2:       # %bb.0:
; CHECK-SSE2-NEXT:    andps %xmm2, %xmm0
; CHECK-SSE2-NEXT:    andnps %xmm1, %xmm2
; CHECK-SSE2-NEXT:    orps %xmm2, %xmm0
; CHECK-SSE2-NEXT:    retq
;
; CHECK-XOP-LABEL: out_v8i8:
; CHECK-XOP:       # %bb.0:
; CHECK-XOP-NEXT:    vpcmov %xmm2, %xmm1, %xmm0, %xmm0
; CHECK-XOP-NEXT:    retq
  %mx = and <8 x i8> %x, %mask
  %notmask = xor <8 x i8> %mask, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
  %my = and <8 x i8> %y, %notmask
  %r = or <8 x i8> %mx, %my
  ret <8 x i8> %r
}

define <4 x i16> @out_v4i16(<4 x i16> %x, <4 x i16> %y, <4 x i16> %mask) nounwind {
; CHECK-BASELINE-LABEL: out_v4i16:
; CHECK-BASELINE:       # %bb.0:
; CHECK-BASELINE-NEXT:    movq %rdi, %rax
; CHECK-BASELINE-NEXT:    movzwl {{[0-9]+}}(%rsp), %edi
; CHECK-BASELINE-NEXT:    movzwl {{[0-9]+}}(%rsp), %r10d
; CHECK-BASELINE-NEXT:    movzwl {{[0-9]+}}(%rsp), %r11d
; CHECK-BASELINE-NEXT:    xorl %r11d, %edx
; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %dx
; CHECK-BASELINE-NEXT:    xorl %r11d, %edx
; CHECK-BASELINE-NEXT:    xorl %r10d, %ecx
; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %cx
; CHECK-BASELINE-NEXT:    xorl %r10d, %ecx
; CHECK-BASELINE-NEXT:    xorl %edi, %r8d
; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %r8w
; CHECK-BASELINE-NEXT:    xorl %edi, %r8d
; CHECK-BASELINE-NEXT:    xorl %r9d, %esi
; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %si
; CHECK-BASELINE-NEXT:    xorl %r9d, %esi
; CHECK-BASELINE-NEXT:    movw %si, (%rax)
; CHECK-BASELINE-NEXT:    movw %r8w, 6(%rax)
; CHECK-BASELINE-NEXT:    movw %cx, 4(%rax)
; CHECK-BASELINE-NEXT:    movw %dx, 2(%rax)
; CHECK-BASELINE-NEXT:    retq
;
; CHECK-SSE1-LABEL: out_v4i16:
; CHECK-SSE1:       # %bb.0:
; CHECK-SSE1-NEXT:    movq %rdi, %rax
; CHECK-SSE1-NEXT:    movzwl {{[0-9]+}}(%rsp), %edi
; CHECK-SSE1-NEXT:    movzwl {{[0-9]+}}(%rsp), %r10d
; CHECK-SSE1-NEXT:    movzwl {{[0-9]+}}(%rsp), %r11d
; CHECK-SSE1-NEXT:    xorl %r11d, %edx
; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %dx
; CHECK-SSE1-NEXT:    xorl %r11d, %edx
; CHECK-SSE1-NEXT:    xorl %r10d, %ecx
; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %cx
; CHECK-SSE1-NEXT:    xorl %r10d, %ecx
; CHECK-SSE1-NEXT:    xorl %edi, %r8d
; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %r8w
; CHECK-SSE1-NEXT:    xorl %edi, %r8d
; CHECK-SSE1-NEXT:    xorl %r9d, %esi
; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %si
; CHECK-SSE1-NEXT:    xorl %r9d, %esi
; CHECK-SSE1-NEXT:    movw %si, (%rax)
; CHECK-SSE1-NEXT:    movw %r8w, 6(%rax)
; CHECK-SSE1-NEXT:    movw %cx, 4(%rax)
; CHECK-SSE1-NEXT:    movw %dx, 2(%rax)
; CHECK-SSE1-NEXT:    retq
;
; CHECK-SSE2-LABEL: out_v4i16:
; CHECK-SSE2:       # %bb.0:
; CHECK-SSE2-NEXT:    andps %xmm2, %xmm0
; CHECK-SSE2-NEXT:    andnps %xmm1, %xmm2
; CHECK-SSE2-NEXT:    orps %xmm2, %xmm0
; CHECK-SSE2-NEXT:    retq
;
; CHECK-XOP-LABEL: out_v4i16:
; CHECK-XOP:       # %bb.0:
; CHECK-XOP-NEXT:    vpcmov %xmm2, %xmm1, %xmm0, %xmm0
; CHECK-XOP-NEXT:    retq
  %mx = and <4 x i16> %x, %mask
  %notmask = xor <4 x i16> %mask, <i16 -1, i16 -1, i16 -1, i16 -1>
  %my = and <4 x i16> %y, %notmask
  %r = or <4 x i16> %mx, %my
  ret <4 x i16> %r
}

define <4 x i16> @out_v4i16_undef(<4 x i16> %x, <4 x i16> %y, <4 x i16> %mask) nounwind {
; CHECK-BASELINE-LABEL: out_v4i16_undef:
; CHECK-BASELINE:       # %bb.0:
; CHECK-BASELINE-NEXT:    movq %rdi, %rax
; CHECK-BASELINE-NEXT:    movzwl {{[0-9]+}}(%rsp), %edi
; CHECK-BASELINE-NEXT:    movzwl {{[0-9]+}}(%rsp), %r10d
; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %cx
; CHECK-BASELINE-NEXT:    xorl %r10d, %edx
; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %dx
; CHECK-BASELINE-NEXT:    xorl %r10d, %edx
; CHECK-BASELINE-NEXT:    xorl %edi, %r8d
; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %r8w
; CHECK-BASELINE-NEXT:    xorl %edi, %r8d
; CHECK-BASELINE-NEXT:    xorl %r9d, %esi
; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %si
; CHECK-BASELINE-NEXT:    xorl %r9d, %esi
; CHECK-BASELINE-NEXT:    movw %cx, 4(%rax)
; CHECK-BASELINE-NEXT:    movw %si, (%rax)
; CHECK-BASELINE-NEXT:    movw %r8w, 6(%rax)
; CHECK-BASELINE-NEXT:    movw %dx, 2(%rax)
; CHECK-BASELINE-NEXT:    retq
;
; CHECK-SSE1-LABEL: out_v4i16_undef:
; CHECK-SSE1:       # %bb.0:
; CHECK-SSE1-NEXT:    movq %rdi, %rax
; CHECK-SSE1-NEXT:    movzwl {{[0-9]+}}(%rsp), %edi
; CHECK-SSE1-NEXT:    movzwl {{[0-9]+}}(%rsp), %r10d
; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %cx
; CHECK-SSE1-NEXT:    xorl %r10d, %edx
; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %dx
; CHECK-SSE1-NEXT:    xorl %r10d, %edx
; CHECK-SSE1-NEXT:    xorl %edi, %r8d
; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %r8w
; CHECK-SSE1-NEXT:    xorl %edi, %r8d
; CHECK-SSE1-NEXT:    xorl %r9d, %esi
; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %si
; CHECK-SSE1-NEXT:    xorl %r9d, %esi
; CHECK-SSE1-NEXT:    movw %cx, 4(%rax)
; CHECK-SSE1-NEXT:    movw %si, (%rax)
; CHECK-SSE1-NEXT:    movw %r8w, 6(%rax)
; CHECK-SSE1-NEXT:    movw %dx, 2(%rax)
; CHECK-SSE1-NEXT:    retq
;
; CHECK-SSE2-LABEL: out_v4i16_undef:
; CHECK-SSE2:       # %bb.0:
; CHECK-SSE2-NEXT:    andps %xmm2, %xmm0
; CHECK-SSE2-NEXT:    andnps %xmm1, %xmm2
; CHECK-SSE2-NEXT:    orps %xmm2, %xmm0
; CHECK-SSE2-NEXT:    retq
;
; CHECK-XOP-LABEL: out_v4i16_undef:
; CHECK-XOP:       # %bb.0:
; CHECK-XOP-NEXT:    vpcmov %xmm2, %xmm1, %xmm0, %xmm0
; CHECK-XOP-NEXT:    retq
  %mx = and <4 x i16> %x, %mask
  %notmask = xor <4 x i16> %mask, <i16 -1, i16 -1, i16 undef, i16 -1>
  %my = and <4 x i16> %y, %notmask
  %r = or <4 x i16> %mx, %my
  ret <4 x i16> %r
}

define <2 x i32> @out_v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %mask) nounwind {
; CHECK-BASELINE-LABEL: out_v2i32:
; CHECK-BASELINE:       # %bb.0:
; CHECK-BASELINE-NEXT:    movl %edi, %eax
; CHECK-BASELINE-NEXT:    xorl %edx, %eax
; CHECK-BASELINE-NEXT:    andl %r8d, %eax
; CHECK-BASELINE-NEXT:    xorl %edx, %eax
; CHECK-BASELINE-NEXT:    xorl %ecx, %esi
; CHECK-BASELINE-NEXT:    andl %r9d, %esi
; CHECK-BASELINE-NEXT:    xorl %ecx, %esi
; CHECK-BASELINE-NEXT:    movl %esi, %edx
; CHECK-BASELINE-NEXT:    retq
;
; CHECK-SSE1-LABEL: out_v2i32:
; CHECK-SSE1:       # %bb.0:
; CHECK-SSE1-NEXT:    movl %edi, %eax
; CHECK-SSE1-NEXT:    xorl %edx, %eax
; CHECK-SSE1-NEXT:    andl %r8d, %eax
; CHECK-SSE1-NEXT:    xorl %edx, %eax
; CHECK-SSE1-NEXT:    xorl %ecx, %esi
; CHECK-SSE1-NEXT:    andl %r9d, %esi
; CHECK-SSE1-NEXT:    xorl %ecx, %esi
; CHECK-SSE1-NEXT:    movl %esi, %edx
; CHECK-SSE1-NEXT:    retq
;
; CHECK-SSE2-LABEL: out_v2i32:
; CHECK-SSE2:       # %bb.0:
; CHECK-SSE2-NEXT:    andps %xmm2, %xmm0
; CHECK-SSE2-NEXT:    andnps %xmm1, %xmm2
; CHECK-SSE2-NEXT:    orps %xmm2, %xmm0
; CHECK-SSE2-NEXT:    retq
;
; CHECK-XOP-LABEL: out_v2i32:
; CHECK-XOP:       # %bb.0:
; CHECK-XOP-NEXT:    vpcmov %xmm2, %xmm1, %xmm0, %xmm0
; CHECK-XOP-NEXT:    retq
  %mx = and <2 x i32> %x, %mask
  %notmask = xor <2 x i32> %mask, <i32 -1, i32 -1>
  %my = and <2 x i32> %y, %notmask
  %r = or <2 x i32> %mx, %my
  ret <2 x i32> %r
}

define <1 x i64> @out_v1i64(<1 x i64> %x, <1 x i64> %y, <1 x i64> %mask) nounwind {
; CHECK-LABEL: out_v1i64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movq %rdi, %rax
; CHECK-NEXT:    xorq %rsi, %rax
; CHECK-NEXT:    andq %rdx, %rax
; CHECK-NEXT:    xorq %rsi, %rax
; CHECK-NEXT:    retq
  %mx = and <1 x i64> %x, %mask
  %notmask = xor <1 x i64> %mask, <i64 -1>
  %my = and <1 x i64> %y, %notmask
  %r = or <1 x i64> %mx, %my
  ret <1 x i64> %r
}

; ============================================================================ ;
; 128-bit vector width
; ============================================================================ ;

define <16 x i8> @out_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %mask) nounwind {
; CHECK-BASELINE-LABEL: out_v16i8:
; CHECK-BASELINE:       # %bb.0:
; CHECK-BASELINE-NEXT:    pushq %rbp
; CHECK-BASELINE-NEXT:    pushq %r15
; CHECK-BASELINE-NEXT:    pushq %r14
; CHECK-BASELINE-NEXT:    pushq %r13
; CHECK-BASELINE-NEXT:    pushq %r12
; CHECK-BASELINE-NEXT:    pushq %rbx
; CHECK-BASELINE-NEXT:    movl %edx, %r11d
; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %r13d
; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %r15d
; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %ebp
; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %r12d
; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %r14d
; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %ebx
; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %edx
; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %r10d
; CHECK-BASELINE-NEXT:    xorb %r10b, %sil
; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %sil
; CHECK-BASELINE-NEXT:    xorb %r10b, %sil
; CHECK-BASELINE-NEXT:    movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-BASELINE-NEXT:    xorb %dl, %r11b
; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %r11b
; CHECK-BASELINE-NEXT:    xorb %dl, %r11b
; CHECK-BASELINE-NEXT:    xorb %al, %cl
; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %cl
; CHECK-BASELINE-NEXT:    xorb %al, %cl
; CHECK-BASELINE-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-BASELINE-NEXT:    xorb %bl, %r8b
; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %r8b
; CHECK-BASELINE-NEXT:    xorb %bl, %r8b
; CHECK-BASELINE-NEXT:    movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-BASELINE-NEXT:    xorb %r14b, %r9b
; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %r9b
; CHECK-BASELINE-NEXT:    xorb %r14b, %r9b
; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %r14d
; CHECK-BASELINE-NEXT:    xorb %r12b, %r14b
; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %r14b
; CHECK-BASELINE-NEXT:    xorb %r12b, %r14b
; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %r12d
; CHECK-BASELINE-NEXT:    xorb %bpl, %r12b
; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %r12b
; CHECK-BASELINE-NEXT:    xorb %bpl, %r12b
; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %esi
; CHECK-BASELINE-NEXT:    xorb %r15b, %sil
; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %sil
; CHECK-BASELINE-NEXT:    xorb %r15b, %sil
; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %edx
; CHECK-BASELINE-NEXT:    xorb %r13b, %dl
; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %dl
; CHECK-BASELINE-NEXT:    xorb %r13b, %dl
; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %ecx
; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; CHECK-BASELINE-NEXT:    xorb %al, %cl
; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %cl
; CHECK-BASELINE-NEXT:    xorb %al, %cl
; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %r13d
; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; CHECK-BASELINE-NEXT:    xorb %al, %r13b
; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %r13b
; CHECK-BASELINE-NEXT:    xorb %al, %r13b
; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %r15d
; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; CHECK-BASELINE-NEXT:    xorb %al, %r15b
; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %r15b
; CHECK-BASELINE-NEXT:    xorb %al, %r15b
; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %ebp
; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; CHECK-BASELINE-NEXT:    xorb %al, %bpl
; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %bpl
; CHECK-BASELINE-NEXT:    xorb %al, %bpl
; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %ebx
; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; CHECK-BASELINE-NEXT:    xorb %al, %bl
; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %bl
; CHECK-BASELINE-NEXT:    xorb %al, %bl
; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %r8d
; CHECK-BASELINE-NEXT:    xorb %r8b, %al
; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %al
; CHECK-BASELINE-NEXT:    xorb %r8b, %al
; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %r10d
; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %r8d
; CHECK-BASELINE-NEXT:    xorb %r8b, %r10b
; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %r10b
; CHECK-BASELINE-NEXT:    xorb %r8b, %r10b
; CHECK-BASELINE-NEXT:    movb %r10b, 15(%rdi)
; CHECK-BASELINE-NEXT:    movb %al, 14(%rdi)
; CHECK-BASELINE-NEXT:    movb %bl, 13(%rdi)
; CHECK-BASELINE-NEXT:    movb %bpl, 12(%rdi)
; CHECK-BASELINE-NEXT:    movb %r15b, 11(%rdi)
; CHECK-BASELINE-NEXT:    movb %r13b, 10(%rdi)
; CHECK-BASELINE-NEXT:    movb %cl, 9(%rdi)
; CHECK-BASELINE-NEXT:    movb %dl, 8(%rdi)
; CHECK-BASELINE-NEXT:    movb %sil, 7(%rdi)
; CHECK-BASELINE-NEXT:    movb %r12b, 6(%rdi)
; CHECK-BASELINE-NEXT:    movb %r14b, 5(%rdi)
; CHECK-BASELINE-NEXT:    movb %r9b, 4(%rdi)
; CHECK-BASELINE-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
; CHECK-BASELINE-NEXT:    movb %al, 3(%rdi)
; CHECK-BASELINE-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
; CHECK-BASELINE-NEXT:    movb %al, 2(%rdi)
; CHECK-BASELINE-NEXT:    movb %r11b, 1(%rdi)
; CHECK-BASELINE-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
; CHECK-BASELINE-NEXT:    movb %al, (%rdi)
; CHECK-BASELINE-NEXT:    movq %rdi, %rax
; CHECK-BASELINE-NEXT:    popq %rbx
; CHECK-BASELINE-NEXT:    popq %r12
; CHECK-BASELINE-NEXT:    popq %r13
; CHECK-BASELINE-NEXT:    popq %r14
; CHECK-BASELINE-NEXT:    popq %r15
; CHECK-BASELINE-NEXT:    popq %rbp
; CHECK-BASELINE-NEXT:    retq
;
; CHECK-SSE1-LABEL: out_v16i8:
; CHECK-SSE1:       # %bb.0:
; CHECK-SSE1-NEXT:    pushq %rbp
; CHECK-SSE1-NEXT:    pushq %r15
; CHECK-SSE1-NEXT:    pushq %r14
; CHECK-SSE1-NEXT:    pushq %r13
; CHECK-SSE1-NEXT:    pushq %r12
; CHECK-SSE1-NEXT:    pushq %rbx
; CHECK-SSE1-NEXT:    movl %edx, %r11d
; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %r13d
; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %r15d
; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %ebp
; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %r12d
; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %r14d
; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %ebx
; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %edx
; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %r10d
; CHECK-SSE1-NEXT:    xorb %r10b, %sil
; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %sil
; CHECK-SSE1-NEXT:    xorb %r10b, %sil
; CHECK-SSE1-NEXT:    movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-SSE1-NEXT:    xorb %dl, %r11b
; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %r11b
; CHECK-SSE1-NEXT:    xorb %dl, %r11b
; CHECK-SSE1-NEXT:    xorb %al, %cl
; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %cl
; CHECK-SSE1-NEXT:    xorb %al, %cl
; CHECK-SSE1-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-SSE1-NEXT:    xorb %bl, %r8b
; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %r8b
; CHECK-SSE1-NEXT:    xorb %bl, %r8b
; CHECK-SSE1-NEXT:    movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-SSE1-NEXT:    xorb %r14b, %r9b
; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %r9b
; CHECK-SSE1-NEXT:    xorb %r14b, %r9b
; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %r14d
; CHECK-SSE1-NEXT:    xorb %r12b, %r14b
; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %r14b
; CHECK-SSE1-NEXT:    xorb %r12b, %r14b
; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %r12d
; CHECK-SSE1-NEXT:    xorb %bpl, %r12b
; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %r12b
; CHECK-SSE1-NEXT:    xorb %bpl, %r12b
; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %esi
; CHECK-SSE1-NEXT:    xorb %r15b, %sil
; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %sil
; CHECK-SSE1-NEXT:    xorb %r15b, %sil
; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %edx
; CHECK-SSE1-NEXT:    xorb %r13b, %dl
; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %dl
; CHECK-SSE1-NEXT:    xorb %r13b, %dl
; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %ecx
; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; CHECK-SSE1-NEXT:    xorb %al, %cl
; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %cl
; CHECK-SSE1-NEXT:    xorb %al, %cl
; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %r13d
; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; CHECK-SSE1-NEXT:    xorb %al, %r13b
; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %r13b
; CHECK-SSE1-NEXT:    xorb %al, %r13b
; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %r15d
; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; CHECK-SSE1-NEXT:    xorb %al, %r15b
; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %r15b
; CHECK-SSE1-NEXT:    xorb %al, %r15b
; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %ebp
; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; CHECK-SSE1-NEXT:    xorb %al, %bpl
; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %bpl
; CHECK-SSE1-NEXT:    xorb %al, %bpl
; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %ebx
; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; CHECK-SSE1-NEXT:    xorb %al, %bl
; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %bl
; CHECK-SSE1-NEXT:    xorb %al, %bl
; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %r8d
; CHECK-SSE1-NEXT:    xorb %r8b, %al
; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %al
; CHECK-SSE1-NEXT:    xorb %r8b, %al
; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %r10d
; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %r8d
; CHECK-SSE1-NEXT:    xorb %r8b, %r10b
; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %r10b
; CHECK-SSE1-NEXT:    xorb %r8b, %r10b
; CHECK-SSE1-NEXT:    movb %r10b, 15(%rdi)
; CHECK-SSE1-NEXT:    movb %al, 14(%rdi)
; CHECK-SSE1-NEXT:    movb %bl, 13(%rdi)
; CHECK-SSE1-NEXT:    movb %bpl, 12(%rdi)
; CHECK-SSE1-NEXT:    movb %r15b, 11(%rdi)
; CHECK-SSE1-NEXT:    movb %r13b, 10(%rdi)
; CHECK-SSE1-NEXT:    movb %cl, 9(%rdi)
; CHECK-SSE1-NEXT:    movb %dl, 8(%rdi)
; CHECK-SSE1-NEXT:    movb %sil, 7(%rdi)
; CHECK-SSE1-NEXT:    movb %r12b, 6(%rdi)
; CHECK-SSE1-NEXT:    movb %r14b, 5(%rdi)
; CHECK-SSE1-NEXT:    movb %r9b, 4(%rdi)
; CHECK-SSE1-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
; CHECK-SSE1-NEXT:    movb %al, 3(%rdi)
; CHECK-SSE1-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
; CHECK-SSE1-NEXT:    movb %al, 2(%rdi)
; CHECK-SSE1-NEXT:    movb %r11b, 1(%rdi)
; CHECK-SSE1-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
; CHECK-SSE1-NEXT:    movb %al, (%rdi)
; CHECK-SSE1-NEXT:    movq %rdi, %rax
; CHECK-SSE1-NEXT:    popq %rbx
; CHECK-SSE1-NEXT:    popq %r12
; CHECK-SSE1-NEXT:    popq %r13
; CHECK-SSE1-NEXT:    popq %r14
; CHECK-SSE1-NEXT:    popq %r15
; CHECK-SSE1-NEXT:    popq %rbp
; CHECK-SSE1-NEXT:    retq
;
; CHECK-SSE2-LABEL: out_v16i8:
; CHECK-SSE2:       # %bb.0:
; CHECK-SSE2-NEXT:    andps %xmm2, %xmm0
; CHECK-SSE2-NEXT:    andnps %xmm1, %xmm2
; CHECK-SSE2-NEXT:    orps %xmm2, %xmm0
; CHECK-SSE2-NEXT:    retq
;
; CHECK-XOP-LABEL: out_v16i8:
; CHECK-XOP:       # %bb.0:
; CHECK-XOP-NEXT:    vpcmov %xmm2, %xmm1, %xmm0, %xmm0
; CHECK-XOP-NEXT:    retq
  %mx = and <16 x i8> %x, %mask
  %notmask = xor <16 x i8> %mask, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
  %my = and <16 x i8> %y, %notmask
  %r = or <16 x i8> %mx, %my
  ret <16 x i8> %r
}

define <8 x i16> @out_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %mask) nounwind {
; CHECK-BASELINE-LABEL: out_v8i16:
; CHECK-BASELINE:       # %bb.0:
; CHECK-BASELINE-NEXT:    pushq %rbp
; CHECK-BASELINE-NEXT:    pushq %r15
; CHECK-BASELINE-NEXT:    pushq %r14
; CHECK-BASELINE-NEXT:    pushq %r12
; CHECK-BASELINE-NEXT:    pushq %rbx
; CHECK-BASELINE-NEXT:    movq %rdi, %rax
; CHECK-BASELINE-NEXT:    movzwl {{[0-9]+}}(%rsp), %edi
; CHECK-BASELINE-NEXT:    movzwl {{[0-9]+}}(%rsp), %r10d
; CHECK-BASELINE-NEXT:    movzwl {{[0-9]+}}(%rsp), %r11d
; CHECK-BASELINE-NEXT:    movzwl {{[0-9]+}}(%rsp), %ebx
; CHECK-BASELINE-NEXT:    movzwl {{[0-9]+}}(%rsp), %ebp
; CHECK-BASELINE-NEXT:    movzwl {{[0-9]+}}(%rsp), %r14d
; CHECK-BASELINE-NEXT:    movzwl {{[0-9]+}}(%rsp), %r15d
; CHECK-BASELINE-NEXT:    movzwl {{[0-9]+}}(%rsp), %r12d
; CHECK-BASELINE-NEXT:    xorl %r12d, %esi
; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %si
; CHECK-BASELINE-NEXT:    xorl %r12d, %esi
; CHECK-BASELINE-NEXT:    xorl %r15d, %edx
; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %dx
; CHECK-BASELINE-NEXT:    xorl %r15d, %edx
; CHECK-BASELINE-NEXT:    xorl %r14d, %ecx
; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %cx
; CHECK-BASELINE-NEXT:    xorl %r14d, %ecx
; CHECK-BASELINE-NEXT:    xorl %ebp, %r8d
; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %r8w
; CHECK-BASELINE-NEXT:    xorl %ebp, %r8d
; CHECK-BASELINE-NEXT:    xorl %ebx, %r9d
; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %r9w
; CHECK-BASELINE-NEXT:    xorl %ebx, %r9d
; CHECK-BASELINE-NEXT:    movl %r11d, %ebx
; CHECK-BASELINE-NEXT:    xorw {{[0-9]+}}(%rsp), %bx
; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %bx
; CHECK-BASELINE-NEXT:    xorl %r11d, %ebx
; CHECK-BASELINE-NEXT:    movl %r10d, %r11d
; CHECK-BASELINE-NEXT:    xorw {{[0-9]+}}(%rsp), %r11w
; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %r11w
; CHECK-BASELINE-NEXT:    xorl %r10d, %r11d
; CHECK-BASELINE-NEXT:    movl %edi, %r10d
; CHECK-BASELINE-NEXT:    xorw {{[0-9]+}}(%rsp), %r10w
; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %r10w
; CHECK-BASELINE-NEXT:    xorl %edi, %r10d
; CHECK-BASELINE-NEXT:    movw %r10w, 14(%rax)
; CHECK-BASELINE-NEXT:    movw %r11w, 12(%rax)
; CHECK-BASELINE-NEXT:    movw %bx, 10(%rax)
; CHECK-BASELINE-NEXT:    movw %r9w, 8(%rax)
; CHECK-BASELINE-NEXT:    movw %r8w, 6(%rax)
; CHECK-BASELINE-NEXT:    movw %cx, 4(%rax)
; CHECK-BASELINE-NEXT:    movw %dx, 2(%rax)
; CHECK-BASELINE-NEXT:    movw %si, (%rax)
; CHECK-BASELINE-NEXT:    popq %rbx
; CHECK-BASELINE-NEXT:    popq %r12
; CHECK-BASELINE-NEXT:    popq %r14
; CHECK-BASELINE-NEXT:    popq %r15
; CHECK-BASELINE-NEXT:    popq %rbp
; CHECK-BASELINE-NEXT:    retq
;
; CHECK-SSE1-LABEL: out_v8i16:
; CHECK-SSE1:       # %bb.0:
; CHECK-SSE1-NEXT:    pushq %rbp
; CHECK-SSE1-NEXT:    pushq %r15
; CHECK-SSE1-NEXT:    pushq %r14
; CHECK-SSE1-NEXT:    pushq %r12
; CHECK-SSE1-NEXT:    pushq %rbx
; CHECK-SSE1-NEXT:    movq %rdi, %rax
; CHECK-SSE1-NEXT:    movzwl {{[0-9]+}}(%rsp), %edi
; CHECK-SSE1-NEXT:    movzwl {{[0-9]+}}(%rsp), %r10d
; CHECK-SSE1-NEXT:    movzwl {{[0-9]+}}(%rsp), %r11d
; CHECK-SSE1-NEXT:    movzwl {{[0-9]+}}(%rsp), %ebx
; CHECK-SSE1-NEXT:    movzwl {{[0-9]+}}(%rsp), %ebp
; CHECK-SSE1-NEXT:    movzwl {{[0-9]+}}(%rsp), %r14d
; CHECK-SSE1-NEXT:    movzwl {{[0-9]+}}(%rsp), %r15d
; CHECK-SSE1-NEXT:    movzwl {{[0-9]+}}(%rsp), %r12d
; CHECK-SSE1-NEXT:    xorl %r12d, %esi
; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %si
; CHECK-SSE1-NEXT:    xorl %r12d, %esi
; CHECK-SSE1-NEXT:    xorl %r15d, %edx
; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %dx
; CHECK-SSE1-NEXT:    xorl %r15d, %edx
; CHECK-SSE1-NEXT:    xorl %r14d, %ecx
; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %cx
; CHECK-SSE1-NEXT:    xorl %r14d, %ecx
; CHECK-SSE1-NEXT:    xorl %ebp, %r8d
; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %r8w
; CHECK-SSE1-NEXT:    xorl %ebp, %r8d
; CHECK-SSE1-NEXT:    xorl %ebx, %r9d
; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %r9w
; CHECK-SSE1-NEXT:    xorl %ebx, %r9d
; CHECK-SSE1-NEXT:    movl %r11d, %ebx
; CHECK-SSE1-NEXT:    xorw {{[0-9]+}}(%rsp), %bx
; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %bx
; CHECK-SSE1-NEXT:    xorl %r11d, %ebx
; CHECK-SSE1-NEXT:    movl %r10d, %r11d
; CHECK-SSE1-NEXT:    xorw {{[0-9]+}}(%rsp), %r11w
; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %r11w
; CHECK-SSE1-NEXT:    xorl %r10d, %r11d
; CHECK-SSE1-NEXT:    movl %edi, %r10d
; CHECK-SSE1-NEXT:    xorw {{[0-9]+}}(%rsp), %r10w
; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %r10w
; CHECK-SSE1-NEXT:    xorl %edi, %r10d
; CHECK-SSE1-NEXT:    movw %r10w, 14(%rax)
; CHECK-SSE1-NEXT:    movw %r11w, 12(%rax)
; CHECK-SSE1-NEXT:    movw %bx, 10(%rax)
; CHECK-SSE1-NEXT:    movw %r9w, 8(%rax)
; CHECK-SSE1-NEXT:    movw %r8w, 6(%rax)
; CHECK-SSE1-NEXT:    movw %cx, 4(%rax)
; CHECK-SSE1-NEXT:    movw %dx, 2(%rax)
; CHECK-SSE1-NEXT:    movw %si, (%rax)
; CHECK-SSE1-NEXT:    popq %rbx
; CHECK-SSE1-NEXT:    popq %r12
; CHECK-SSE1-NEXT:    popq %r14
; CHECK-SSE1-NEXT:    popq %r15
; CHECK-SSE1-NEXT:    popq %rbp
; CHECK-SSE1-NEXT:    retq
;
; CHECK-SSE2-LABEL: out_v8i16:
; CHECK-SSE2:       # %bb.0:
; CHECK-SSE2-NEXT:    andps %xmm2, %xmm0
; CHECK-SSE2-NEXT:    andnps %xmm1, %xmm2
; CHECK-SSE2-NEXT:    orps %xmm2, %xmm0
; CHECK-SSE2-NEXT:    retq
;
; CHECK-XOP-LABEL: out_v8i16:
; CHECK-XOP:       # %bb.0:
; CHECK-XOP-NEXT:    vpcmov %xmm2, %xmm1, %xmm0, %xmm0
; CHECK-XOP-NEXT:    retq
  %mx = and <8 x i16> %x, %mask
  %notmask = xor <8 x i16> %mask, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
  %my = and <8 x i16> %y, %notmask
  %r = or <8 x i16> %mx, %my
  ret <8 x i16> %r
}

define <4 x i32> @out_v4i32(ptr%px, ptr%py, ptr%pmask) nounwind {
; CHECK-BASELINE-LABEL: out_v4i32:
; CHECK-BASELINE:       # %bb.0:
; CHECK-BASELINE-NEXT:    movq %rdi, %rax
; CHECK-BASELINE-NEXT:    movl 12(%rdx), %edi
; CHECK-BASELINE-NEXT:    movl 8(%rdx), %r8d
; CHECK-BASELINE-NEXT:    movl (%rdx), %r9d
; CHECK-BASELINE-NEXT:    movl 4(%rdx), %r10d
; CHECK-BASELINE-NEXT:    movl (%rsi), %edx
; CHECK-BASELINE-NEXT:    xorl %r9d, %edx
; CHECK-BASELINE-NEXT:    andl (%rcx), %edx
; CHECK-BASELINE-NEXT:    xorl %r9d, %edx
; CHECK-BASELINE-NEXT:    movl 4(%rsi), %r9d
; CHECK-BASELINE-NEXT:    xorl %r10d, %r9d
; CHECK-BASELINE-NEXT:    andl 4(%rcx), %r9d
; CHECK-BASELINE-NEXT:    xorl %r10d, %r9d
; CHECK-BASELINE-NEXT:    movl 8(%rsi), %r10d
; CHECK-BASELINE-NEXT:    xorl %r8d, %r10d
; CHECK-BASELINE-NEXT:    andl 8(%rcx), %r10d
; CHECK-BASELINE-NEXT:    xorl %r8d, %r10d
; CHECK-BASELINE-NEXT:    movl 12(%rsi), %esi
; CHECK-BASELINE-NEXT:    xorl %edi, %esi
; CHECK-BASELINE-NEXT:    andl 12(%rcx), %esi
; CHECK-BASELINE-NEXT:    xorl %edi, %esi
; CHECK-BASELINE-NEXT:    movl %esi, 12(%rax)
; CHECK-BASELINE-NEXT:    movl %r10d, 8(%rax)
; CHECK-BASELINE-NEXT:    movl %r9d, 4(%rax)
; CHECK-BASELINE-NEXT:    movl %edx, (%rax)
; CHECK-BASELINE-NEXT:    retq
;
; CHECK-SSE1-LABEL: out_v4i32:
; CHECK-SSE1:       # %bb.0:
; CHECK-SSE1-NEXT:    movq %rdi, %rax
; CHECK-SSE1-NEXT:    movaps (%rcx), %xmm0
; CHECK-SSE1-NEXT:    movaps (%rsi), %xmm1
; CHECK-SSE1-NEXT:    andps %xmm0, %xmm1
; CHECK-SSE1-NEXT:    andnps (%rdx), %xmm0
; CHECK-SSE1-NEXT:    orps %xmm1, %xmm0
; CHECK-SSE1-NEXT:    movaps %xmm0, (%rdi)
; CHECK-SSE1-NEXT:    retq
;
; CHECK-SSE2-LABEL: out_v4i32:
; CHECK-SSE2:       # %bb.0:
; CHECK-SSE2-NEXT:    movaps (%rdx), %xmm0
; CHECK-SSE2-NEXT:    movaps (%rdi), %xmm1
; CHECK-SSE2-NEXT:    andps %xmm0, %xmm1
; CHECK-SSE2-NEXT:    andnps (%rsi), %xmm0
; CHECK-SSE2-NEXT:    orps %xmm1, %xmm0
; CHECK-SSE2-NEXT:    retq
;
; CHECK-XOP-LABEL: out_v4i32:
; CHECK-XOP:       # %bb.0:
; CHECK-XOP-NEXT:    vmovdqa (%rdi), %xmm0
; CHECK-XOP-NEXT:    vmovdqa (%rdx), %xmm1
; CHECK-XOP-NEXT:    vpcmov %xmm1, (%rsi), %xmm0, %xmm0
; CHECK-XOP-NEXT:    retq
  %x = load <4 x i32>, ptr%px, align 16
  %y = load <4 x i32>, ptr%py, align 16
  %mask = load <4 x i32>, ptr%pmask, align 16
  %mx = and <4 x i32> %x, %mask
  %notmask = xor <4 x i32> %mask, <i32 -1, i32 -1, i32 -1, i32 -1>
  %my = and <4 x i32> %y, %notmask
  %r = or <4 x i32> %mx, %my
  ret <4 x i32> %r
}

define <4 x i32> @out_v4i32_undef(ptr%px, ptr%py, ptr%pmask) nounwind {
; CHECK-BASELINE-LABEL: out_v4i32_undef:
; CHECK-BASELINE:       # %bb.0:
; CHECK-BASELINE-NEXT:    movq %rdi, %rax
; CHECK-BASELINE-NEXT:    movl 8(%rsi), %edi
; CHECK-BASELINE-NEXT:    movl 12(%rdx), %r8d
; CHECK-BASELINE-NEXT:    movl (%rdx), %r9d
; CHECK-BASELINE-NEXT:    movl 4(%rdx), %edx
; CHECK-BASELINE-NEXT:    andl 8(%rcx), %edi
; CHECK-BASELINE-NEXT:    movl (%rsi), %r10d
; CHECK-BASELINE-NEXT:    xorl %r9d, %r10d
; CHECK-BASELINE-NEXT:    andl (%rcx), %r10d
; CHECK-BASELINE-NEXT:    xorl %r9d, %r10d
; CHECK-BASELINE-NEXT:    movl 4(%rsi), %r9d
; CHECK-BASELINE-NEXT:    xorl %edx, %r9d
; CHECK-BASELINE-NEXT:    andl 4(%rcx), %r9d
; CHECK-BASELINE-NEXT:    xorl %edx, %r9d
; CHECK-BASELINE-NEXT:    movl 12(%rsi), %edx
; CHECK-BASELINE-NEXT:    xorl %r8d, %edx
; CHECK-BASELINE-NEXT:    andl 12(%rcx), %edx
; CHECK-BASELINE-NEXT:    xorl %r8d, %edx
; CHECK-BASELINE-NEXT:    movl %edi, 8(%rax)
; CHECK-BASELINE-NEXT:    movl %edx, 12(%rax)
; CHECK-BASELINE-NEXT:    movl %r9d, 4(%rax)
; CHECK-BASELINE-NEXT:    movl %r10d, (%rax)
; CHECK-BASELINE-NEXT:    retq
;
; CHECK-SSE1-LABEL: out_v4i32_undef:
; CHECK-SSE1:       # %bb.0:
; CHECK-SSE1-NEXT:    movq %rdi, %rax
; CHECK-SSE1-NEXT:    movaps (%rcx), %xmm0
; CHECK-SSE1-NEXT:    movaps (%rsi), %xmm1
; CHECK-SSE1-NEXT:    andps %xmm0, %xmm1
; CHECK-SSE1-NEXT:    andnps (%rdx), %xmm0
; CHECK-SSE1-NEXT:    orps %xmm1, %xmm0
; CHECK-SSE1-NEXT:    movaps %xmm0, (%rdi)
; CHECK-SSE1-NEXT:    retq
;
; CHECK-SSE2-LABEL: out_v4i32_undef:
; CHECK-SSE2:       # %bb.0:
; CHECK-SSE2-NEXT:    movaps (%rdx), %xmm0
; CHECK-SSE2-NEXT:    movaps (%rdi), %xmm1
; CHECK-SSE2-NEXT:    andps %xmm0, %xmm1
; CHECK-SSE2-NEXT:    andnps (%rsi), %xmm0
; CHECK-SSE2-NEXT:    orps %xmm1, %xmm0
; CHECK-SSE2-NEXT:    retq
;
; CHECK-XOP-LABEL: out_v4i32_undef:
; CHECK-XOP:       # %bb.0:
; CHECK-XOP-NEXT:    vmovdqa (%rdi), %xmm0
; CHECK-XOP-NEXT:    vmovdqa (%rdx), %xmm1
; CHECK-XOP-NEXT:    vpcmov %xmm1, (%rsi), %xmm0, %xmm0
; CHECK-XOP-NEXT:    retq
  %x = load <4 x i32>, ptr%px, align 16
  %y = load <4 x i32>, ptr%py, align 16
  %mask = load <4 x i32>, ptr%pmask, align 16
  %mx = and <4 x i32> %x, %mask
  %notmask = xor <4 x i32> %mask, <i32 -1, i32 -1, i32 undef, i32 -1>
  %my = and <4 x i32> %y, %notmask
  %r = or <4 x i32> %mx, %my
  ret <4 x i32> %r
}

define <2 x i64> @out_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %mask) nounwind {
; CHECK-BASELINE-LABEL: out_v2i64:
; CHECK-BASELINE:       # %bb.0:
; CHECK-BASELINE-NEXT:    movq %rdi, %rax
; CHECK-BASELINE-NEXT:    xorq %rdx, %rax
; CHECK-BASELINE-NEXT:    andq %r8, %rax
; CHECK-BASELINE-NEXT:    xorq %rdx, %rax
; CHECK-BASELINE-NEXT:    xorq %rcx, %rsi
; CHECK-BASELINE-NEXT:    andq %r9, %rsi
; CHECK-BASELINE-NEXT:    xorq %rcx, %rsi
; CHECK-BASELINE-NEXT:    movq %rsi, %rdx
; CHECK-BASELINE-NEXT:    retq
;
; CHECK-SSE1-LABEL: out_v2i64:
; CHECK-SSE1:       # %bb.0:
; CHECK-SSE1-NEXT:    movq %rdi, %rax
; CHECK-SSE1-NEXT:    xorq %rdx, %rax
; CHECK-SSE1-NEXT:    andq %r8, %rax
; CHECK-SSE1-NEXT:    xorq %rdx, %rax
; CHECK-SSE1-NEXT:    xorq %rcx, %rsi
; CHECK-SSE1-NEXT:    andq %r9, %rsi
; CHECK-SSE1-NEXT:    xorq %rcx, %rsi
; CHECK-SSE1-NEXT:    movq %rsi, %rdx
; CHECK-SSE1-NEXT:    retq
;
; CHECK-SSE2-LABEL: out_v2i64:
; CHECK-SSE2:       # %bb.0:
; CHECK-SSE2-NEXT:    andps %xmm2, %xmm0
; CHECK-SSE2-NEXT:    andnps %xmm1, %xmm2
; CHECK-SSE2-NEXT:    orps %xmm2, %xmm0
; CHECK-SSE2-NEXT:    retq
;
; CHECK-XOP-LABEL: out_v2i64:
; CHECK-XOP:       # %bb.0:
; CHECK-XOP-NEXT:    vpcmov %xmm2, %xmm1, %xmm0, %xmm0
; CHECK-XOP-NEXT:    retq
  %mx = and <2 x i64> %x, %mask
  %notmask = xor <2 x i64> %mask, <i64 -1, i64 -1>
  %my = and <2 x i64> %y, %notmask
  %r = or <2 x i64> %mx, %my
  ret <2 x i64> %r
}

; ============================================================================ ;
; 256-bit vector width
; ============================================================================ ;

define <32 x i8> @out_v32i8(ptr%px, ptr%py, ptr%pmask) nounwind {
; CHECK-BASELINE-LABEL: out_v32i8:
; CHECK-BASELINE:       # %bb.0:
; CHECK-BASELINE-NEXT:    pushq %rbp
; CHECK-BASELINE-NEXT:    pushq %r15
; CHECK-BASELINE-NEXT:    pushq %r14
; CHECK-BASELINE-NEXT:    pushq %r13
; CHECK-BASELINE-NEXT:    pushq %r12
; CHECK-BASELINE-NEXT:    pushq %rbx
; CHECK-BASELINE-NEXT:    movq %rcx, %r10
; CHECK-BASELINE-NEXT:    movq %rdx, %r8
; CHECK-BASELINE-NEXT:    movq %rsi, %r9
; CHECK-BASELINE-NEXT:    movq %rdi, %r11
; CHECK-BASELINE-NEXT:    movzbl 15(%rdx), %eax
; CHECK-BASELINE-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-BASELINE-NEXT:    movzbl 14(%rdx), %eax
; CHECK-BASELINE-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-BASELINE-NEXT:    movzbl 13(%rdx), %eax
; CHECK-BASELINE-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-BASELINE-NEXT:    movzbl 12(%rdx), %eax
; CHECK-BASELINE-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-BASELINE-NEXT:    movzbl 11(%rdx), %eax
; CHECK-BASELINE-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-BASELINE-NEXT:    movzbl 10(%rdx), %eax
; CHECK-BASELINE-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-BASELINE-NEXT:    movzbl 9(%rdx), %ebp
; CHECK-BASELINE-NEXT:    movzbl 8(%rdx), %r14d
; CHECK-BASELINE-NEXT:    movzbl 7(%rdx), %r15d
; CHECK-BASELINE-NEXT:    movzbl 6(%rdx), %r12d
; CHECK-BASELINE-NEXT:    movzbl 5(%rdx), %r13d
; CHECK-BASELINE-NEXT:    movzbl 4(%rdx), %esi
; CHECK-BASELINE-NEXT:    movzbl 3(%rdx), %edx
; CHECK-BASELINE-NEXT:    movzbl 2(%r8), %edi
; CHECK-BASELINE-NEXT:    movzbl (%r8), %eax
; CHECK-BASELINE-NEXT:    movzbl 1(%r8), %ecx
; CHECK-BASELINE-NEXT:    movzbl (%r9), %ebx
; CHECK-BASELINE-NEXT:    xorb %al, %bl
; CHECK-BASELINE-NEXT:    andb (%r10), %bl
; CHECK-BASELINE-NEXT:    xorb %al, %bl
; CHECK-BASELINE-NEXT:    movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-BASELINE-NEXT:    movzbl 1(%r9), %eax
; CHECK-BASELINE-NEXT:    xorb %cl, %al
; CHECK-BASELINE-NEXT:    andb 1(%r10), %al
; CHECK-BASELINE-NEXT:    xorb %cl, %al
; CHECK-BASELINE-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-BASELINE-NEXT:    movzbl 2(%r9), %eax
; CHECK-BASELINE-NEXT:    xorb %dil, %al
; CHECK-BASELINE-NEXT:    andb 2(%r10), %al
; CHECK-BASELINE-NEXT:    xorb %dil, %al
; CHECK-BASELINE-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-BASELINE-NEXT:    movzbl 3(%r9), %eax
; CHECK-BASELINE-NEXT:    xorb %dl, %al
; CHECK-BASELINE-NEXT:    andb 3(%r10), %al
; CHECK-BASELINE-NEXT:    xorb %dl, %al
; CHECK-BASELINE-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-BASELINE-NEXT:    movzbl 4(%r9), %eax
; CHECK-BASELINE-NEXT:    xorb %sil, %al
; CHECK-BASELINE-NEXT:    andb 4(%r10), %al
; CHECK-BASELINE-NEXT:    xorb %sil, %al
; CHECK-BASELINE-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-BASELINE-NEXT:    movzbl 5(%r9), %eax
; CHECK-BASELINE-NEXT:    xorb %r13b, %al
; CHECK-BASELINE-NEXT:    andb 5(%r10), %al
; CHECK-BASELINE-NEXT:    xorb %r13b, %al
; CHECK-BASELINE-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-BASELINE-NEXT:    movzbl 6(%r9), %eax
; CHECK-BASELINE-NEXT:    xorb %r12b, %al
; CHECK-BASELINE-NEXT:    andb 6(%r10), %al
; CHECK-BASELINE-NEXT:    xorb %r12b, %al
; CHECK-BASELINE-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-BASELINE-NEXT:    movzbl 7(%r9), %eax
; CHECK-BASELINE-NEXT:    xorb %r15b, %al
; CHECK-BASELINE-NEXT:    andb 7(%r10), %al
; CHECK-BASELINE-NEXT:    xorb %r15b, %al
; CHECK-BASELINE-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-BASELINE-NEXT:    movzbl 8(%r9), %eax
; CHECK-BASELINE-NEXT:    xorb %r14b, %al
; CHECK-BASELINE-NEXT:    andb 8(%r10), %al
; CHECK-BASELINE-NEXT:    xorb %r14b, %al
; CHECK-BASELINE-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-BASELINE-NEXT:    movzbl 9(%r9), %eax
; CHECK-BASELINE-NEXT:    xorb %bpl, %al
; CHECK-BASELINE-NEXT:    andb 9(%r10), %al
; CHECK-BASELINE-NEXT:    xorb %bpl, %al
; CHECK-BASELINE-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-BASELINE-NEXT:    movzbl 10(%r9), %eax
; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
; CHECK-BASELINE-NEXT:    xorb %cl, %al
; CHECK-BASELINE-NEXT:    andb 10(%r10), %al
; CHECK-BASELINE-NEXT:    xorb %cl, %al
; CHECK-BASELINE-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-BASELINE-NEXT:    movzbl 11(%r9), %eax
; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
; CHECK-BASELINE-NEXT:    xorb %cl, %al
; CHECK-BASELINE-NEXT:    andb 11(%r10), %al
; CHECK-BASELINE-NEXT:    xorb %cl, %al
; CHECK-BASELINE-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-BASELINE-NEXT:    movzbl 12(%r9), %eax
; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
; CHECK-BASELINE-NEXT:    xorb %cl, %al
; CHECK-BASELINE-NEXT:    andb 12(%r10), %al
; CHECK-BASELINE-NEXT:    xorb %cl, %al
; CHECK-BASELINE-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-BASELINE-NEXT:    movzbl 13(%r9), %eax
; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
; CHECK-BASELINE-NEXT:    xorb %cl, %al
; CHECK-BASELINE-NEXT:    andb 13(%r10), %al
; CHECK-BASELINE-NEXT:    xorb %cl, %al
; CHECK-BASELINE-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-BASELINE-NEXT:    movzbl 14(%r9), %eax
; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
; CHECK-BASELINE-NEXT:    xorb %cl, %al
; CHECK-BASELINE-NEXT:    andb 14(%r10), %al
; CHECK-BASELINE-NEXT:    xorb %cl, %al
; CHECK-BASELINE-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-BASELINE-NEXT:    movzbl 15(%r9), %eax
; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
; CHECK-BASELINE-NEXT:    xorb %cl, %al
; CHECK-BASELINE-NEXT:    andb 15(%r10), %al
; CHECK-BASELINE-NEXT:    xorb %cl, %al
; CHECK-BASELINE-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-BASELINE-NEXT:    movzbl 16(%r8), %eax
; CHECK-BASELINE-NEXT:    movzbl 16(%r9), %ecx
; CHECK-BASELINE-NEXT:    xorb %al, %cl
; CHECK-BASELINE-NEXT:    andb 16(%r10), %cl
; CHECK-BASELINE-NEXT:    xorb %al, %cl
; CHECK-BASELINE-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-BASELINE-NEXT:    movzbl 17(%r8), %eax
; CHECK-BASELINE-NEXT:    movzbl 17(%r9), %ecx
; CHECK-BASELINE-NEXT:    xorb %al, %cl
; CHECK-BASELINE-NEXT:    andb 17(%r10), %cl
; CHECK-BASELINE-NEXT:    xorb %al, %cl
; CHECK-BASELINE-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-BASELINE-NEXT:    movzbl 18(%r8), %eax
; CHECK-BASELINE-NEXT:    movzbl 18(%r9), %ecx
; CHECK-BASELINE-NEXT:    xorb %al, %cl
; CHECK-BASELINE-NEXT:    andb 18(%r10), %cl
; CHECK-BASELINE-NEXT:    xorb %al, %cl
; CHECK-BASELINE-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-BASELINE-NEXT:    movzbl 19(%r8), %eax
; CHECK-BASELINE-NEXT:    movzbl 19(%r9), %ecx
; CHECK-BASELINE-NEXT:    xorb %al, %cl
; CHECK-BASELINE-NEXT:    andb 19(%r10), %cl
; CHECK-BASELINE-NEXT:    xorb %al, %cl
; CHECK-BASELINE-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-BASELINE-NEXT:    movzbl 20(%r8), %eax
; CHECK-BASELINE-NEXT:    movzbl 20(%r9), %ecx
; CHECK-BASELINE-NEXT:    xorb %al, %cl
; CHECK-BASELINE-NEXT:    andb 20(%r10), %cl
; CHECK-BASELINE-NEXT:    xorb %al, %cl
; CHECK-BASELINE-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-BASELINE-NEXT:    movzbl 21(%r8), %eax
; CHECK-BASELINE-NEXT:    movzbl 21(%r9), %r13d
; CHECK-BASELINE-NEXT:    xorb %al, %r13b
; CHECK-BASELINE-NEXT:    andb 21(%r10), %r13b
; CHECK-BASELINE-NEXT:    xorb %al, %r13b
; CHECK-BASELINE-NEXT:    movzbl 22(%r8), %eax
; CHECK-BASELINE-NEXT:    movzbl 22(%r9), %r12d
; CHECK-BASELINE-NEXT:    xorb %al, %r12b
; CHECK-BASELINE-NEXT:    andb 22(%r10), %r12b
; CHECK-BASELINE-NEXT:    xorb %al, %r12b
; CHECK-BASELINE-NEXT:    movzbl 23(%r8), %eax
; CHECK-BASELINE-NEXT:    movzbl 23(%r9), %r15d
; CHECK-BASELINE-NEXT:    xorb %al, %r15b
; CHECK-BASELINE-NEXT:    andb 23(%r10), %r15b
; CHECK-BASELINE-NEXT:    xorb %al, %r15b
; CHECK-BASELINE-NEXT:    movzbl 24(%r8), %eax
; CHECK-BASELINE-NEXT:    movzbl 24(%r9), %r14d
; CHECK-BASELINE-NEXT:    xorb %al, %r14b
; CHECK-BASELINE-NEXT:    andb 24(%r10), %r14b
; CHECK-BASELINE-NEXT:    xorb %al, %r14b
; CHECK-BASELINE-NEXT:    movzbl 25(%r8), %eax
; CHECK-BASELINE-NEXT:    movzbl 25(%r9), %ebp
; CHECK-BASELINE-NEXT:    xorb %al, %bpl
; CHECK-BASELINE-NEXT:    andb 25(%r10), %bpl
; CHECK-BASELINE-NEXT:    xorb %al, %bpl
; CHECK-BASELINE-NEXT:    movzbl 26(%r8), %eax
; CHECK-BASELINE-NEXT:    movzbl 26(%r9), %edi
; CHECK-BASELINE-NEXT:    xorb %al, %dil
; CHECK-BASELINE-NEXT:    andb 26(%r10), %dil
; CHECK-BASELINE-NEXT:    xorb %al, %dil
; CHECK-BASELINE-NEXT:    movzbl 27(%r8), %eax
; CHECK-BASELINE-NEXT:    movzbl 27(%r9), %esi
; CHECK-BASELINE-NEXT:    xorb %al, %sil
; CHECK-BASELINE-NEXT:    andb 27(%r10), %sil
; CHECK-BASELINE-NEXT:    xorb %al, %sil
; CHECK-BASELINE-NEXT:    movzbl 28(%r8), %eax
; CHECK-BASELINE-NEXT:    movzbl 28(%r9), %edx
; CHECK-BASELINE-NEXT:    xorb %al, %dl
; CHECK-BASELINE-NEXT:    andb 28(%r10), %dl
; CHECK-BASELINE-NEXT:    xorb %al, %dl
; CHECK-BASELINE-NEXT:    movzbl 29(%r8), %eax
; CHECK-BASELINE-NEXT:    movzbl 29(%r9), %ecx
; CHECK-BASELINE-NEXT:    xorb %al, %cl
; CHECK-BASELINE-NEXT:    andb 29(%r10), %cl
; CHECK-BASELINE-NEXT:    xorb %al, %cl
; CHECK-BASELINE-NEXT:    movzbl 30(%r8), %ebx
; CHECK-BASELINE-NEXT:    movzbl 30(%r9), %eax
; CHECK-BASELINE-NEXT:    xorb %bl, %al
; CHECK-BASELINE-NEXT:    andb 30(%r10), %al
; CHECK-BASELINE-NEXT:    xorb %bl, %al
; CHECK-BASELINE-NEXT:    movzbl 31(%r8), %r8d
; CHECK-BASELINE-NEXT:    movzbl 31(%r9), %r9d
; CHECK-BASELINE-NEXT:    xorb %r8b, %r9b
; CHECK-BASELINE-NEXT:    andb 31(%r10), %r9b
; CHECK-BASELINE-NEXT:    xorb %r8b, %r9b
; CHECK-BASELINE-NEXT:    movb %r9b, 31(%r11)
; CHECK-BASELINE-NEXT:    movb %al, 30(%r11)
; CHECK-BASELINE-NEXT:    movb %cl, 29(%r11)
; CHECK-BASELINE-NEXT:    movb %dl, 28(%r11)
; CHECK-BASELINE-NEXT:    movb %sil, 27(%r11)
; CHECK-BASELINE-NEXT:    movb %dil, 26(%r11)
; CHECK-BASELINE-NEXT:    movb %bpl, 25(%r11)
; CHECK-BASELINE-NEXT:    movb %r14b, 24(%r11)
; CHECK-BASELINE-NEXT:    movb %r15b, 23(%r11)
; CHECK-BASELINE-NEXT:    movb %r12b, 22(%r11)
; CHECK-BASELINE-NEXT:    movb %r13b, 21(%r11)
; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; CHECK-BASELINE-NEXT:    movb %al, 20(%r11)
; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; CHECK-BASELINE-NEXT:    movb %al, 19(%r11)
; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; CHECK-BASELINE-NEXT:    movb %al, 18(%r11)
; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; CHECK-BASELINE-NEXT:    movb %al, 17(%r11)
; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; CHECK-BASELINE-NEXT:    movb %al, 16(%r11)
; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; CHECK-BASELINE-NEXT:    movb %al, 15(%r11)
; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; CHECK-BASELINE-NEXT:    movb %al, 14(%r11)
; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; CHECK-BASELINE-NEXT:    movb %al, 13(%r11)
; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; CHECK-BASELINE-NEXT:    movb %al, 12(%r11)
; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; CHECK-BASELINE-NEXT:    movb %al, 11(%r11)
; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; CHECK-BASELINE-NEXT:    movb %al, 10(%r11)
; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; CHECK-BASELINE-NEXT:    movb %al, 9(%r11)
; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; CHECK-BASELINE-NEXT:    movb %al, 8(%r11)
; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; CHECK-BASELINE-NEXT:    movb %al, 7(%r11)
; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; CHECK-BASELINE-NEXT:    movb %al, 6(%r11)
; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; CHECK-BASELINE-NEXT:    movb %al, 5(%r11)
; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; CHECK-BASELINE-NEXT:    movb %al, 4(%r11)
; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; CHECK-BASELINE-NEXT:    movb %al, 3(%r11)
; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; CHECK-BASELINE-NEXT:    movb %al, 2(%r11)
; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; CHECK-BASELINE-NEXT:    movb %al, 1(%r11)
; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; CHECK-BASELINE-NEXT:    movb %al, (%r11)
; CHECK-BASELINE-NEXT:    movq %r11, %rax
; CHECK-BASELINE-NEXT:    popq %rbx
; CHECK-BASELINE-NEXT:    popq %r12
; CHECK-BASELINE-NEXT:    popq %r13
; CHECK-BASELINE-NEXT:    popq %r14
; CHECK-BASELINE-NEXT:    popq %r15
; CHECK-BASELINE-NEXT:    popq %rbp
; CHECK-BASELINE-NEXT:    retq
;
; CHECK-SSE1-LABEL: out_v32i8:
; CHECK-SSE1:       # %bb.0:
; CHECK-SSE1-NEXT:    pushq %rbp
; CHECK-SSE1-NEXT:    pushq %r15
; CHECK-SSE1-NEXT:    pushq %r14
; CHECK-SSE1-NEXT:    pushq %r13
; CHECK-SSE1-NEXT:    pushq %r12
; CHECK-SSE1-NEXT:    pushq %rbx
; CHECK-SSE1-NEXT:    movq %rcx, %r10
; CHECK-SSE1-NEXT:    movq %rdx, %r8
; CHECK-SSE1-NEXT:    movq %rsi, %r9
; CHECK-SSE1-NEXT:    movq %rdi, %r11
; CHECK-SSE1-NEXT:    movzbl 15(%rdx), %eax
; CHECK-SSE1-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-SSE1-NEXT:    movzbl 14(%rdx), %eax
; CHECK-SSE1-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-SSE1-NEXT:    movzbl 13(%rdx), %eax
; CHECK-SSE1-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-SSE1-NEXT:    movzbl 12(%rdx), %eax
; CHECK-SSE1-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-SSE1-NEXT:    movzbl 11(%rdx), %eax
; CHECK-SSE1-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-SSE1-NEXT:    movzbl 10(%rdx), %eax
; CHECK-SSE1-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-SSE1-NEXT:    movzbl 9(%rdx), %ebp
; CHECK-SSE1-NEXT:    movzbl 8(%rdx), %r14d
; CHECK-SSE1-NEXT:    movzbl 7(%rdx), %r15d
; CHECK-SSE1-NEXT:    movzbl 6(%rdx), %r12d
; CHECK-SSE1-NEXT:    movzbl 5(%rdx), %r13d
; CHECK-SSE1-NEXT:    movzbl 4(%rdx), %esi
; CHECK-SSE1-NEXT:    movzbl 3(%rdx), %edx
; CHECK-SSE1-NEXT:    movzbl 2(%r8), %edi
; CHECK-SSE1-NEXT:    movzbl (%r8), %eax
; CHECK-SSE1-NEXT:    movzbl 1(%r8), %ecx
; CHECK-SSE1-NEXT:    movzbl (%r9), %ebx
; CHECK-SSE1-NEXT:    xorb %al, %bl
; CHECK-SSE1-NEXT:    andb (%r10), %bl
; CHECK-SSE1-NEXT:    xorb %al, %bl
; CHECK-SSE1-NEXT:    movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-SSE1-NEXT:    movzbl 1(%r9), %eax
; CHECK-SSE1-NEXT:    xorb %cl, %al
; CHECK-SSE1-NEXT:    andb 1(%r10), %al
; CHECK-SSE1-NEXT:    xorb %cl, %al
; CHECK-SSE1-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-SSE1-NEXT:    movzbl 2(%r9), %eax
; CHECK-SSE1-NEXT:    xorb %dil, %al
; CHECK-SSE1-NEXT:    andb 2(%r10), %al
; CHECK-SSE1-NEXT:    xorb %dil, %al
; CHECK-SSE1-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-SSE1-NEXT:    movzbl 3(%r9), %eax
; CHECK-SSE1-NEXT:    xorb %dl, %al
; CHECK-SSE1-NEXT:    andb 3(%r10), %al
; CHECK-SSE1-NEXT:    xorb %dl, %al
; CHECK-SSE1-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-SSE1-NEXT:    movzbl 4(%r9), %eax
; CHECK-SSE1-NEXT:    xorb %sil, %al
; CHECK-SSE1-NEXT:    andb 4(%r10), %al
; CHECK-SSE1-NEXT:    xorb %sil, %al
; CHECK-SSE1-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-SSE1-NEXT:    movzbl 5(%r9), %eax
; CHECK-SSE1-NEXT:    xorb %r13b, %al
; CHECK-SSE1-NEXT:    andb 5(%r10), %al
; CHECK-SSE1-NEXT:    xorb %r13b, %al
; CHECK-SSE1-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-SSE1-NEXT:    movzbl 6(%r9), %eax
; CHECK-SSE1-NEXT:    xorb %r12b, %al
; CHECK-SSE1-NEXT:    andb 6(%r10), %al
; CHECK-SSE1-NEXT:    xorb %r12b, %al
; CHECK-SSE1-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-SSE1-NEXT:    movzbl 7(%r9), %eax
; CHECK-SSE1-NEXT:    xorb %r15b, %al
; CHECK-SSE1-NEXT:    andb 7(%r10), %al
; CHECK-SSE1-NEXT:    xorb %r15b, %al
; CHECK-SSE1-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-SSE1-NEXT:    movzbl 8(%r9), %eax
; CHECK-SSE1-NEXT:    xorb %r14b, %al
; CHECK-SSE1-NEXT:    andb 8(%r10), %al
; CHECK-SSE1-NEXT:    xorb %r14b, %al
; CHECK-SSE1-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-SSE1-NEXT:    movzbl 9(%r9), %eax
; CHECK-SSE1-NEXT:    xorb %bpl, %al
; CHECK-SSE1-NEXT:    andb 9(%r10), %al
; CHECK-SSE1-NEXT:    xorb %bpl, %al
; CHECK-SSE1-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-SSE1-NEXT:    movzbl 10(%r9), %eax
; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
; CHECK-SSE1-NEXT:    xorb %cl, %al
; CHECK-SSE1-NEXT:    andb 10(%r10), %al
; CHECK-SSE1-NEXT:    xorb %cl, %al
; CHECK-SSE1-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-SSE1-NEXT:    movzbl 11(%r9), %eax
; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
; CHECK-SSE1-NEXT:    xorb %cl, %al
; CHECK-SSE1-NEXT:    andb 11(%r10), %al
; CHECK-SSE1-NEXT:    xorb %cl, %al
; CHECK-SSE1-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-SSE1-NEXT:    movzbl 12(%r9), %eax
; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
; CHECK-SSE1-NEXT:    xorb %cl, %al
; CHECK-SSE1-NEXT:    andb 12(%r10), %al
; CHECK-SSE1-NEXT:    xorb %cl, %al
; CHECK-SSE1-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-SSE1-NEXT:    movzbl 13(%r9), %eax
; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
; CHECK-SSE1-NEXT:    xorb %cl, %al
; CHECK-SSE1-NEXT:    andb 13(%r10), %al
; CHECK-SSE1-NEXT:    xorb %cl, %al
; CHECK-SSE1-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-SSE1-NEXT:    movzbl 14(%r9), %eax
; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
; CHECK-SSE1-NEXT:    xorb %cl, %al
; CHECK-SSE1-NEXT:    andb 14(%r10), %al
; CHECK-SSE1-NEXT:    xorb %cl, %al
; CHECK-SSE1-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-SSE1-NEXT:    movzbl 15(%r9), %eax
; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
; CHECK-SSE1-NEXT:    xorb %cl, %al
; CHECK-SSE1-NEXT:    andb 15(%r10), %al
; CHECK-SSE1-NEXT:    xorb %cl, %al
; CHECK-SSE1-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-SSE1-NEXT:    movzbl 16(%r8), %eax
; CHECK-SSE1-NEXT:    movzbl 16(%r9), %ecx
; CHECK-SSE1-NEXT:    xorb %al, %cl
; CHECK-SSE1-NEXT:    andb 16(%r10), %cl
; CHECK-SSE1-NEXT:    xorb %al, %cl
; CHECK-SSE1-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-SSE1-NEXT:    movzbl 17(%r8), %eax
; CHECK-SSE1-NEXT:    movzbl 17(%r9), %ecx
; CHECK-SSE1-NEXT:    xorb %al, %cl
; CHECK-SSE1-NEXT:    andb 17(%r10), %cl
; CHECK-SSE1-NEXT:    xorb %al, %cl
; CHECK-SSE1-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-SSE1-NEXT:    movzbl 18(%r8), %eax
; CHECK-SSE1-NEXT:    movzbl 18(%r9), %ecx
; CHECK-SSE1-NEXT:    xorb %al, %cl
; CHECK-SSE1-NEXT:    andb 18(%r10), %cl
; CHECK-SSE1-NEXT:    xorb %al, %cl
; CHECK-SSE1-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-SSE1-NEXT:    movzbl 19(%r8), %eax
; CHECK-SSE1-NEXT:    movzbl 19(%r9), %ecx
; CHECK-SSE1-NEXT:    xorb %al, %cl
; CHECK-SSE1-NEXT:    andb 19(%r10), %cl
; CHECK-SSE1-NEXT:    xorb %al, %cl
; CHECK-SSE1-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-SSE1-NEXT:    movzbl 20(%r8), %eax
; CHECK-SSE1-NEXT:    movzbl 20(%r9), %ecx
; CHECK-SSE1-NEXT:    xorb %al, %cl
; CHECK-SSE1-NEXT:    andb 20(%r10), %cl
; CHECK-SSE1-NEXT:    xorb %al, %cl
; CHECK-SSE1-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-SSE1-NEXT:    movzbl 21(%r8), %eax
; CHECK-SSE1-NEXT:    movzbl 21(%r9), %r13d
; CHECK-SSE1-NEXT:    xorb %al, %r13b
; CHECK-SSE1-NEXT:    andb 21(%r10), %r13b
; CHECK-SSE1-NEXT:    xorb %al, %r13b
; CHECK-SSE1-NEXT:    movzbl 22(%r8), %eax
; CHECK-SSE1-NEXT:    movzbl 22(%r9), %r12d
; CHECK-SSE1-NEXT:    xorb %al, %r12b
; CHECK-SSE1-NEXT:    andb 22(%r10), %r12b
; CHECK-SSE1-NEXT:    xorb %al, %r12b
; CHECK-SSE1-NEXT:    movzbl 23(%r8), %eax
; CHECK-SSE1-NEXT:    movzbl 23(%r9), %r15d
; CHECK-SSE1-NEXT:    xorb %al, %r15b
; CHECK-SSE1-NEXT:    andb 23(%r10), %r15b
; CHECK-SSE1-NEXT:    xorb %al, %r15b
; CHECK-SSE1-NEXT:    movzbl 24(%r8), %eax
; CHECK-SSE1-NEXT:    movzbl 24(%r9), %r14d
; CHECK-SSE1-NEXT:    xorb %al, %r14b
; CHECK-SSE1-NEXT:    andb 24(%r10), %r14b
; CHECK-SSE1-NEXT:    xorb %al, %r14b
; CHECK-SSE1-NEXT:    movzbl 25(%r8), %eax
; CHECK-SSE1-NEXT:    movzbl 25(%r9), %ebp
; CHECK-SSE1-NEXT:    xorb %al, %bpl
; CHECK-SSE1-NEXT:    andb 25(%r10), %bpl
; CHECK-SSE1-NEXT:    xorb %al, %bpl
; CHECK-SSE1-NEXT:    movzbl 26(%r8), %eax
; CHECK-SSE1-NEXT:    movzbl 26(%r9), %edi
; CHECK-SSE1-NEXT:    xorb %al, %dil
; CHECK-SSE1-NEXT:    andb 26(%r10), %dil
; CHECK-SSE1-NEXT:    xorb %al, %dil
; CHECK-SSE1-NEXT:    movzbl 27(%r8), %eax
; CHECK-SSE1-NEXT:    movzbl 27(%r9), %esi
; CHECK-SSE1-NEXT:    xorb %al, %sil
; CHECK-SSE1-NEXT:    andb 27(%r10), %sil
; CHECK-SSE1-NEXT:    xorb %al, %sil
; CHECK-SSE1-NEXT:    movzbl 28(%r8), %eax
; CHECK-SSE1-NEXT:    movzbl 28(%r9), %edx
; CHECK-SSE1-NEXT:    xorb %al, %dl
; CHECK-SSE1-NEXT:    andb 28(%r10), %dl
; CHECK-SSE1-NEXT:    xorb %al, %dl
; CHECK-SSE1-NEXT:    movzbl 29(%r8), %eax
; CHECK-SSE1-NEXT:    movzbl 29(%r9), %ecx
; CHECK-SSE1-NEXT:    xorb %al, %cl
; CHECK-SSE1-NEXT:    andb 29(%r10), %cl
; CHECK-SSE1-NEXT:    xorb %al, %cl
; CHECK-SSE1-NEXT:    movzbl 30(%r8), %ebx
; CHECK-SSE1-NEXT:    movzbl 30(%r9), %eax
; CHECK-SSE1-NEXT:    xorb %bl, %al
; CHECK-SSE1-NEXT:    andb 30(%r10), %al
; CHECK-SSE1-NEXT:    xorb %bl, %al
; CHECK-SSE1-NEXT:    movzbl 31(%r8), %r8d
; CHECK-SSE1-NEXT:    movzbl 31(%r9), %r9d
; CHECK-SSE1-NEXT:    xorb %r8b, %r9b
; CHECK-SSE1-NEXT:    andb 31(%r10), %r9b
; CHECK-SSE1-NEXT:    xorb %r8b, %r9b
; CHECK-SSE1-NEXT:    movb %r9b, 31(%r11)
; CHECK-SSE1-NEXT:    movb %al, 30(%r11)
; CHECK-SSE1-NEXT:    movb %cl, 29(%r11)
; CHECK-SSE1-NEXT:    movb %dl, 28(%r11)
; CHECK-SSE1-NEXT:    movb %sil, 27(%r11)
; CHECK-SSE1-NEXT:    movb %dil, 26(%r11)
; CHECK-SSE1-NEXT:    movb %bpl, 25(%r11)
; CHECK-SSE1-NEXT:    movb %r14b, 24(%r11)
; CHECK-SSE1-NEXT:    movb %r15b, 23(%r11)
; CHECK-SSE1-NEXT:    movb %r12b, 22(%r11)
; CHECK-SSE1-NEXT:    movb %r13b, 21(%r11)
; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; CHECK-SSE1-NEXT:    movb %al, 20(%r11)
; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; CHECK-SSE1-NEXT:    movb %al, 19(%r11)
; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; CHECK-SSE1-NEXT:    movb %al, 18(%r11)
; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; CHECK-SSE1-NEXT:    movb %al, 17(%r11)
; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; CHECK-SSE1-NEXT:    movb %al, 16(%r11)
; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; CHECK-SSE1-NEXT:    movb %al, 15(%r11)
; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; CHECK-SSE1-NEXT:    movb %al, 14(%r11)
; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; CHECK-SSE1-NEXT:    movb %al, 13(%r11)
; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; CHECK-SSE1-NEXT:    movb %al, 12(%r11)
; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; CHECK-SSE1-NEXT:    movb %al, 11(%r11)
; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; CHECK-SSE1-NEXT:    movb %al, 10(%r11)
; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; CHECK-SSE1-NEXT:    movb %al, 9(%r11)
; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; CHECK-SSE1-NEXT:    movb %al, 8(%r11)
; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; CHECK-SSE1-NEXT:    movb %al, 7(%r11)
; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; CHECK-SSE1-NEXT:    movb %al, 6(%r11)
; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; CHECK-SSE1-NEXT:    movb %al, 5(%r11)
; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; CHECK-SSE1-NEXT:    movb %al, 4(%r11)
; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; CHECK-SSE1-NEXT:    movb %al, 3(%r11)
; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; CHECK-SSE1-NEXT:    movb %al, 2(%r11)
; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; CHECK-SSE1-NEXT:    movb %al, 1(%r11)
; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; CHECK-SSE1-NEXT:    movb %al, (%r11)
; CHECK-SSE1-NEXT:    movq %r11, %rax
; CHECK-SSE1-NEXT:    popq %rbx
; CHECK-SSE1-NEXT:    popq %r12
; CHECK-SSE1-NEXT:    popq %r13
; CHECK-SSE1-NEXT:    popq %r14
; CHECK-SSE1-NEXT:    popq %r15
; CHECK-SSE1-NEXT:    popq %rbp
; CHECK-SSE1-NEXT:    retq
;
; CHECK-SSE2-LABEL: out_v32i8:
; CHECK-SSE2:       # %bb.0:
; CHECK-SSE2-NEXT:    movaps (%rdx), %xmm0
; CHECK-SSE2-NEXT:    movaps 16(%rdx), %xmm1
; CHECK-SSE2-NEXT:    movaps 16(%rdi), %xmm2
; CHECK-SSE2-NEXT:    andps %xmm1, %xmm2
; CHECK-SSE2-NEXT:    movaps (%rdi), %xmm3
; CHECK-SSE2-NEXT:    andps %xmm0, %xmm3
; CHECK-SSE2-NEXT:    andnps 16(%rsi), %xmm1
; CHECK-SSE2-NEXT:    orps %xmm2, %xmm1
; CHECK-SSE2-NEXT:    andnps (%rsi), %xmm0
; CHECK-SSE2-NEXT:    orps %xmm3, %xmm0
; CHECK-SSE2-NEXT:    retq
;
; CHECK-XOP-LABEL: out_v32i8:
; CHECK-XOP:       # %bb.0:
; CHECK-XOP-NEXT:    vmovdqa (%rdi), %ymm0
; CHECK-XOP-NEXT:    vmovdqa (%rdx), %ymm1
; CHECK-XOP-NEXT:    vpcmov %ymm1, (%rsi), %ymm0, %ymm0
; CHECK-XOP-NEXT:    retq
  %x = load <32 x i8>, ptr %px, align 32
  %y = load <32 x i8>, ptr %py, align 32
  %mask = load <32 x i8>, ptr %pmask, align 32
  %mx = and <32 x i8> %x, %mask
  %notmask = xor <32 x i8> %mask, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
  %my = and <32 x i8> %y, %notmask
  %r = or <32 x i8> %mx, %my
  ret <32 x i8> %r
}

define <16 x i16> @out_v16i16(ptr %px, ptr %py, ptr %pmask) nounwind {
; CHECK-BASELINE-LABEL: out_v16i16:
; CHECK-BASELINE:       # %bb.0:
; CHECK-BASELINE-NEXT:    pushq %rbp
; CHECK-BASELINE-NEXT:    pushq %r15
; CHECK-BASELINE-NEXT:    pushq %r14
; CHECK-BASELINE-NEXT:    pushq %r13
; CHECK-BASELINE-NEXT:    pushq %r12
; CHECK-BASELINE-NEXT:    pushq %rbx
; CHECK-BASELINE-NEXT:    movzwl 18(%rdx), %r15d
; CHECK-BASELINE-NEXT:    movzwl 16(%rdx), %r14d
; CHECK-BASELINE-NEXT:    movzwl 14(%rdx), %ebp
; CHECK-BASELINE-NEXT:    movzwl 12(%rdx), %ebx
; CHECK-BASELINE-NEXT:    movzwl 10(%rdx), %r13d
; CHECK-BASELINE-NEXT:    movzwl 8(%rdx), %r11d
; CHECK-BASELINE-NEXT:    movzwl 6(%rdx), %r10d
; CHECK-BASELINE-NEXT:    movzwl 4(%rdx), %r9d
; CHECK-BASELINE-NEXT:    movzwl (%rdx), %r8d
; CHECK-BASELINE-NEXT:    movzwl 2(%rdx), %r12d
; CHECK-BASELINE-NEXT:    movzwl (%rsi), %eax
; CHECK-BASELINE-NEXT:    xorw %r8w, %ax
; CHECK-BASELINE-NEXT:    andw (%rcx), %ax
; CHECK-BASELINE-NEXT:    xorl %eax, %r8d
; CHECK-BASELINE-NEXT:    movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-BASELINE-NEXT:    movzwl 2(%rsi), %eax
; CHECK-BASELINE-NEXT:    xorw %r12w, %ax
; CHECK-BASELINE-NEXT:    andw 2(%rcx), %ax
; CHECK-BASELINE-NEXT:    xorl %eax, %r12d
; CHECK-BASELINE-NEXT:    movzwl 4(%rsi), %eax
; CHECK-BASELINE-NEXT:    xorw %r9w, %ax
; CHECK-BASELINE-NEXT:    andw 4(%rcx), %ax
; CHECK-BASELINE-NEXT:    xorl %eax, %r9d
; CHECK-BASELINE-NEXT:    movl %r9d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-BASELINE-NEXT:    movzwl 6(%rsi), %eax
; CHECK-BASELINE-NEXT:    xorw %r10w, %ax
; CHECK-BASELINE-NEXT:    andw 6(%rcx), %ax
; CHECK-BASELINE-NEXT:    xorl %eax, %r10d
; CHECK-BASELINE-NEXT:    movl %r10d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-BASELINE-NEXT:    movzwl 8(%rsi), %eax
; CHECK-BASELINE-NEXT:    xorw %r11w, %ax
; CHECK-BASELINE-NEXT:    andw 8(%rcx), %ax
; CHECK-BASELINE-NEXT:    xorl %eax, %r11d
; CHECK-BASELINE-NEXT:    movl %r11d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-BASELINE-NEXT:    movzwl 10(%rsi), %eax
; CHECK-BASELINE-NEXT:    xorw %r13w, %ax
; CHECK-BASELINE-NEXT:    andw 10(%rcx), %ax
; CHECK-BASELINE-NEXT:    xorl %eax, %r13d
; CHECK-BASELINE-NEXT:    movl %r13d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-BASELINE-NEXT:    movzwl 12(%rsi), %eax
; CHECK-BASELINE-NEXT:    xorw %bx, %ax
; CHECK-BASELINE-NEXT:    andw 12(%rcx), %ax
; CHECK-BASELINE-NEXT:    xorl %eax, %ebx
; CHECK-BASELINE-NEXT:    movzwl 14(%rsi), %eax
; CHECK-BASELINE-NEXT:    xorw %bp, %ax
; CHECK-BASELINE-NEXT:    andw 14(%rcx), %ax
; CHECK-BASELINE-NEXT:    xorl %eax, %ebp
; CHECK-BASELINE-NEXT:    movzwl 16(%rsi), %eax
; CHECK-BASELINE-NEXT:    xorw %r14w, %ax
; CHECK-BASELINE-NEXT:    andw 16(%rcx), %ax
; CHECK-BASELINE-NEXT:    xorl %eax, %r14d
; CHECK-BASELINE-NEXT:    movzwl 18(%rsi), %eax
; CHECK-BASELINE-NEXT:    xorw %r15w, %ax
; CHECK-BASELINE-NEXT:    andw 18(%rcx), %ax
; CHECK-BASELINE-NEXT:    xorl %eax, %r15d
; CHECK-BASELINE-NEXT:    movzwl 20(%rdx), %r13d
; CHECK-BASELINE-NEXT:    movzwl 20(%rsi), %eax
; CHECK-BASELINE-NEXT:    xorw %r13w, %ax
; CHECK-BASELINE-NEXT:    andw 20(%rcx), %ax
; CHECK-BASELINE-NEXT:    xorl %eax, %r13d
; CHECK-BASELINE-NEXT:    movzwl 22(%rdx), %r9d
; CHECK-BASELINE-NEXT:    movzwl 22(%rsi), %eax
; CHECK-BASELINE-NEXT:    xorw %r9w, %ax
; CHECK-BASELINE-NEXT:    andw 22(%rcx), %ax
; CHECK-BASELINE-NEXT:    xorl %eax, %r9d
; CHECK-BASELINE-NEXT:    movzwl 24(%rdx), %r8d
; CHECK-BASELINE-NEXT:    movzwl 24(%rsi), %eax
; CHECK-BASELINE-NEXT:    xorw %r8w, %ax
; CHECK-BASELINE-NEXT:    andw 24(%rcx), %ax
; CHECK-BASELINE-NEXT:    xorl %eax, %r8d
; CHECK-BASELINE-NEXT:    movzwl 26(%rdx), %eax
; CHECK-BASELINE-NEXT:    movzwl 26(%rsi), %r10d
; CHECK-BASELINE-NEXT:    xorw %ax, %r10w
; CHECK-BASELINE-NEXT:    andw 26(%rcx), %r10w
; CHECK-BASELINE-NEXT:    xorl %r10d, %eax
; CHECK-BASELINE-NEXT:    movzwl 28(%rdx), %r10d
; CHECK-BASELINE-NEXT:    movzwl 28(%rsi), %r11d
; CHECK-BASELINE-NEXT:    xorw %r10w, %r11w
; CHECK-BASELINE-NEXT:    andw 28(%rcx), %r11w
; CHECK-BASELINE-NEXT:    xorl %r11d, %r10d
; CHECK-BASELINE-NEXT:    movzwl 30(%rdx), %edx
; CHECK-BASELINE-NEXT:    movzwl 30(%rsi), %esi
; CHECK-BASELINE-NEXT:    xorw %dx, %si
; CHECK-BASELINE-NEXT:    andw 30(%rcx), %si
; CHECK-BASELINE-NEXT:    xorl %esi, %edx
; CHECK-BASELINE-NEXT:    movw %dx, 30(%rdi)
; CHECK-BASELINE-NEXT:    movw %r10w, 28(%rdi)
; CHECK-BASELINE-NEXT:    movw %ax, 26(%rdi)
; CHECK-BASELINE-NEXT:    movw %r8w, 24(%rdi)
; CHECK-BASELINE-NEXT:    movw %r9w, 22(%rdi)
; CHECK-BASELINE-NEXT:    movw %r13w, 20(%rdi)
; CHECK-BASELINE-NEXT:    movw %r15w, 18(%rdi)
; CHECK-BASELINE-NEXT:    movw %r14w, 16(%rdi)
; CHECK-BASELINE-NEXT:    movw %bp, 14(%rdi)
; CHECK-BASELINE-NEXT:    movw %bx, 12(%rdi)
; CHECK-BASELINE-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
; CHECK-BASELINE-NEXT:    movw %ax, 10(%rdi)
; CHECK-BASELINE-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
; CHECK-BASELINE-NEXT:    movw %ax, 8(%rdi)
; CHECK-BASELINE-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
; CHECK-BASELINE-NEXT:    movw %ax, 6(%rdi)
; CHECK-BASELINE-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
; CHECK-BASELINE-NEXT:    movw %ax, 4(%rdi)
; CHECK-BASELINE-NEXT:    movw %r12w, 2(%rdi)
; CHECK-BASELINE-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
; CHECK-BASELINE-NEXT:    movw %ax, (%rdi)
; CHECK-BASELINE-NEXT:    movq %rdi, %rax
; CHECK-BASELINE-NEXT:    popq %rbx
; CHECK-BASELINE-NEXT:    popq %r12
; CHECK-BASELINE-NEXT:    popq %r13
; CHECK-BASELINE-NEXT:    popq %r14
; CHECK-BASELINE-NEXT:    popq %r15
; CHECK-BASELINE-NEXT:    popq %rbp
; CHECK-BASELINE-NEXT:    retq
;
; CHECK-SSE1-LABEL: out_v16i16:
; CHECK-SSE1:       # %bb.0:
; CHECK-SSE1-NEXT:    pushq %rbp
; CHECK-SSE1-NEXT:    pushq %r15
; CHECK-SSE1-NEXT:    pushq %r14
; CHECK-SSE1-NEXT:    pushq %r13
; CHECK-SSE1-NEXT:    pushq %r12
; CHECK-SSE1-NEXT:    pushq %rbx
; CHECK-SSE1-NEXT:    movzwl 18(%rdx), %r15d
; CHECK-SSE1-NEXT:    movzwl 16(%rdx), %r14d
; CHECK-SSE1-NEXT:    movzwl 14(%rdx), %ebp
; CHECK-SSE1-NEXT:    movzwl 12(%rdx), %ebx
; CHECK-SSE1-NEXT:    movzwl 10(%rdx), %r13d
; CHECK-SSE1-NEXT:    movzwl 8(%rdx), %r11d
; CHECK-SSE1-NEXT:    movzwl 6(%rdx), %r10d
; CHECK-SSE1-NEXT:    movzwl 4(%rdx), %r9d
; CHECK-SSE1-NEXT:    movzwl (%rdx), %r8d
; CHECK-SSE1-NEXT:    movzwl 2(%rdx), %r12d
; CHECK-SSE1-NEXT:    movzwl (%rsi), %eax
; CHECK-SSE1-NEXT:    xorw %r8w, %ax
; CHECK-SSE1-NEXT:    andw (%rcx), %ax
; CHECK-SSE1-NEXT:    xorl %eax, %r8d
; CHECK-SSE1-NEXT:    movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-SSE1-NEXT:    movzwl 2(%rsi), %eax
; CHECK-SSE1-NEXT:    xorw %r12w, %ax
; CHECK-SSE1-NEXT:    andw 2(%rcx), %ax
; CHECK-SSE1-NEXT:    xorl %eax, %r12d
; CHECK-SSE1-NEXT:    movzwl 4(%rsi), %eax
; CHECK-SSE1-NEXT:    xorw %r9w, %ax
; CHECK-SSE1-NEXT:    andw 4(%rcx), %ax
; CHECK-SSE1-NEXT:    xorl %eax, %r9d
; CHECK-SSE1-NEXT:    movl %r9d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-SSE1-NEXT:    movzwl 6(%rsi), %eax
; CHECK-SSE1-NEXT:    xorw %r10w, %ax
; CHECK-SSE1-NEXT:    andw 6(%rcx), %ax
; CHECK-SSE1-NEXT:    xorl %eax, %r10d
; CHECK-SSE1-NEXT:    movl %r10d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-SSE1-NEXT:    movzwl 8(%rsi), %eax
; CHECK-SSE1-NEXT:    xorw %r11w, %ax
; CHECK-SSE1-NEXT:    andw 8(%rcx), %ax
; CHECK-SSE1-NEXT:    xorl %eax, %r11d
; CHECK-SSE1-NEXT:    movl %r11d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-SSE1-NEXT:    movzwl 10(%rsi), %eax
; CHECK-SSE1-NEXT:    xorw %r13w, %ax
; CHECK-SSE1-NEXT:    andw 10(%rcx), %ax
; CHECK-SSE1-NEXT:    xorl %eax, %r13d
; CHECK-SSE1-NEXT:    movl %r13d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-SSE1-NEXT:    movzwl 12(%rsi), %eax
; CHECK-SSE1-NEXT:    xorw %bx, %ax
; CHECK-SSE1-NEXT:    andw 12(%rcx), %ax
; CHECK-SSE1-NEXT:    xorl %eax, %ebx
; CHECK-SSE1-NEXT:    movzwl 14(%rsi), %eax
; CHECK-SSE1-NEXT:    xorw %bp, %ax
; CHECK-SSE1-NEXT:    andw 14(%rcx), %ax
; CHECK-SSE1-NEXT:    xorl %eax, %ebp
; CHECK-SSE1-NEXT:    movzwl 16(%rsi), %eax
; CHECK-SSE1-NEXT:    xorw %r14w, %ax
; CHECK-SSE1-NEXT:    andw 16(%rcx), %ax
; CHECK-SSE1-NEXT:    xorl %eax, %r14d
; CHECK-SSE1-NEXT:    movzwl 18(%rsi), %eax
; CHECK-SSE1-NEXT:    xorw %r15w, %ax
; CHECK-SSE1-NEXT:    andw 18(%rcx), %ax
; CHECK-SSE1-NEXT:    xorl %eax, %r15d
; CHECK-SSE1-NEXT:    movzwl 20(%rdx), %r13d
; CHECK-SSE1-NEXT:    movzwl 20(%rsi), %eax
; CHECK-SSE1-NEXT:    xorw %r13w, %ax
; CHECK-SSE1-NEXT:    andw 20(%rcx), %ax
; CHECK-SSE1-NEXT:    xorl %eax, %r13d
; CHECK-SSE1-NEXT:    movzwl 22(%rdx), %r9d
; CHECK-SSE1-NEXT:    movzwl 22(%rsi), %eax
; CHECK-SSE1-NEXT:    xorw %r9w, %ax
; CHECK-SSE1-NEXT:    andw 22(%rcx), %ax
; CHECK-SSE1-NEXT:    xorl %eax, %r9d
; CHECK-SSE1-NEXT:    movzwl 24(%rdx), %r8d
; CHECK-SSE1-NEXT:    movzwl 24(%rsi), %eax
; CHECK-SSE1-NEXT:    xorw %r8w, %ax
; CHECK-SSE1-NEXT:    andw 24(%rcx), %ax
; CHECK-SSE1-NEXT:    xorl %eax, %r8d
; CHECK-SSE1-NEXT:    movzwl 26(%rdx), %eax
; CHECK-SSE1-NEXT:    movzwl 26(%rsi), %r10d
; CHECK-SSE1-NEXT:    xorw %ax, %r10w
; CHECK-SSE1-NEXT:    andw 26(%rcx), %r10w
; CHECK-SSE1-NEXT:    xorl %r10d, %eax
; CHECK-SSE1-NEXT:    movzwl 28(%rdx), %r10d
; CHECK-SSE1-NEXT:    movzwl 28(%rsi), %r11d
; CHECK-SSE1-NEXT:    xorw %r10w, %r11w
; CHECK-SSE1-NEXT:    andw 28(%rcx), %r11w
; CHECK-SSE1-NEXT:    xorl %r11d, %r10d
; CHECK-SSE1-NEXT:    movzwl 30(%rdx), %edx
; CHECK-SSE1-NEXT:    movzwl 30(%rsi), %esi
; CHECK-SSE1-NEXT:    xorw %dx, %si
; CHECK-SSE1-NEXT:    andw 30(%rcx), %si
; CHECK-SSE1-NEXT:    xorl %esi, %edx
; CHECK-SSE1-NEXT:    movw %dx, 30(%rdi)
; CHECK-SSE1-NEXT:    movw %r10w, 28(%rdi)
; CHECK-SSE1-NEXT:    movw %ax, 26(%rdi)
; CHECK-SSE1-NEXT:    movw %r8w, 24(%rdi)
; CHECK-SSE1-NEXT:    movw %r9w, 22(%rdi)
; CHECK-SSE1-NEXT:    movw %r13w, 20(%rdi)
; CHECK-SSE1-NEXT:    movw %r15w, 18(%rdi)
; CHECK-SSE1-NEXT:    movw %r14w, 16(%rdi)
; CHECK-SSE1-NEXT:    movw %bp, 14(%rdi)
; CHECK-SSE1-NEXT:    movw %bx, 12(%rdi)
; CHECK-SSE1-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
; CHECK-SSE1-NEXT:    movw %ax, 10(%rdi)
; CHECK-SSE1-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
; CHECK-SSE1-NEXT:    movw %ax, 8(%rdi)
; CHECK-SSE1-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
; CHECK-SSE1-NEXT:    movw %ax, 6(%rdi)
; CHECK-SSE1-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
; CHECK-SSE1-NEXT:    movw %ax, 4(%rdi)
; CHECK-SSE1-NEXT:    movw %r12w, 2(%rdi)
; CHECK-SSE1-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
; CHECK-SSE1-NEXT:    movw %ax, (%rdi)
; CHECK-SSE1-NEXT:    movq %rdi, %rax
; CHECK-SSE1-NEXT:    popq %rbx
; CHECK-SSE1-NEXT:    popq %r12
; CHECK-SSE1-NEXT:    popq %r13
; CHECK-SSE1-NEXT:    popq %r14
; CHECK-SSE1-NEXT:    popq %r15
; CHECK-SSE1-NEXT:    popq %rbp
; CHECK-SSE1-NEXT:    retq
;
; CHECK-SSE2-LABEL: out_v16i16:
; CHECK-SSE2:       # %bb.0:
; CHECK-SSE2-NEXT:    movaps (%rdx), %xmm0
; CHECK-SSE2-NEXT:    movaps 16(%rdx), %xmm1
; CHECK-SSE2-NEXT:    movaps 16(%rdi), %xmm2
; CHECK-SSE2-NEXT:    andps %xmm1, %xmm2
; CHECK-SSE2-NEXT:    movaps (%rdi), %xmm3
; CHECK-SSE2-NEXT:    andps %xmm0, %xmm3
; CHECK-SSE2-NEXT:    andnps 16(%rsi), %xmm1
; CHECK-SSE2-NEXT:    orps %xmm2, %xmm1
; CHECK-SSE2-NEXT:    andnps (%rsi), %xmm0
; CHECK-SSE2-NEXT:    orps %xmm3, %xmm0
; CHECK-SSE2-NEXT:    retq
;
; CHECK-XOP-LABEL: out_v16i16:
; CHECK-XOP:       # %bb.0:
; CHECK-XOP-NEXT:    vmovdqa (%rdi), %ymm0
; CHECK-XOP-NEXT:    vmovdqa (%rdx), %ymm1
; CHECK-XOP-NEXT:    vpcmov %ymm1, (%rsi), %ymm0, %ymm0
; CHECK-XOP-NEXT:    retq
  %x = load <16 x i16>, ptr %px, align 32
  %y = load <16 x i16>, ptr %py, align 32
  %mask = load <16 x i16>, ptr %pmask, align 32
  %mx = and <16 x i16> %x, %mask
  %notmask = xor <16 x i16> %mask, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
  %my = and <16 x i16> %y, %notmask
  %r = or <16 x i16> %mx, %my
  ret <16 x i16> %r
}

define <8 x i32> @out_v8i32(ptr %px, ptr %py, ptr %pmask) nounwind {
; CHECK-BASELINE-LABEL: out_v8i32:
; CHECK-BASELINE:       # %bb.0:
; CHECK-BASELINE-NEXT:    pushq %rbp
; CHECK-BASELINE-NEXT:    pushq %r14
; CHECK-BASELINE-NEXT:    pushq %rbx
; CHECK-BASELINE-NEXT:    movq %rdi, %rax
; CHECK-BASELINE-NEXT:    movl 28(%rdx), %edi
; CHECK-BASELINE-NEXT:    movl 24(%rdx), %r8d
; CHECK-BASELINE-NEXT:    movl 20(%rdx), %r10d
; CHECK-BASELINE-NEXT:    movl 16(%rdx), %ebx
; CHECK-BASELINE-NEXT:    movl 12(%rdx), %r14d
; CHECK-BASELINE-NEXT:    movl 8(%rdx), %ebp
; CHECK-BASELINE-NEXT:    movl (%rdx), %r9d
; CHECK-BASELINE-NEXT:    movl 4(%rdx), %r11d
; CHECK-BASELINE-NEXT:    movl (%rsi), %edx
; CHECK-BASELINE-NEXT:    xorl %r9d, %edx
; CHECK-BASELINE-NEXT:    andl (%rcx), %edx
; CHECK-BASELINE-NEXT:    xorl %r9d, %edx
; CHECK-BASELINE-NEXT:    movl 4(%rsi), %r9d
; CHECK-BASELINE-NEXT:    xorl %r11d, %r9d
; CHECK-BASELINE-NEXT:    andl 4(%rcx), %r9d
; CHECK-BASELINE-NEXT:    xorl %r11d, %r9d
; CHECK-BASELINE-NEXT:    movl 8(%rsi), %r11d
; CHECK-BASELINE-NEXT:    xorl %ebp, %r11d
; CHECK-BASELINE-NEXT:    andl 8(%rcx), %r11d
; CHECK-BASELINE-NEXT:    xorl %ebp, %r11d
; CHECK-BASELINE-NEXT:    movl 12(%rsi), %ebp
; CHECK-BASELINE-NEXT:    xorl %r14d, %ebp
; CHECK-BASELINE-NEXT:    andl 12(%rcx), %ebp
; CHECK-BASELINE-NEXT:    xorl %r14d, %ebp
; CHECK-BASELINE-NEXT:    movl 16(%rsi), %r14d
; CHECK-BASELINE-NEXT:    xorl %ebx, %r14d
; CHECK-BASELINE-NEXT:    andl 16(%rcx), %r14d
; CHECK-BASELINE-NEXT:    xorl %ebx, %r14d
; CHECK-BASELINE-NEXT:    movl 20(%rsi), %ebx
; CHECK-BASELINE-NEXT:    xorl %r10d, %ebx
; CHECK-BASELINE-NEXT:    andl 20(%rcx), %ebx
; CHECK-BASELINE-NEXT:    xorl %r10d, %ebx
; CHECK-BASELINE-NEXT:    movl 24(%rsi), %r10d
; CHECK-BASELINE-NEXT:    xorl %r8d, %r10d
; CHECK-BASELINE-NEXT:    andl 24(%rcx), %r10d
; CHECK-BASELINE-NEXT:    xorl %r8d, %r10d
; CHECK-BASELINE-NEXT:    movl 28(%rsi), %esi
; CHECK-BASELINE-NEXT:    xorl %edi, %esi
; CHECK-BASELINE-NEXT:    andl 28(%rcx), %esi
; CHECK-BASELINE-NEXT:    xorl %edi, %esi
; CHECK-BASELINE-NEXT:    movl %esi, 28(%rax)
; CHECK-BASELINE-NEXT:    movl %r10d, 24(%rax)
; CHECK-BASELINE-NEXT:    movl %ebx, 20(%rax)
; CHECK-BASELINE-NEXT:    movl %r14d, 16(%rax)
; CHECK-BASELINE-NEXT:    movl %ebp, 12(%rax)
; CHECK-BASELINE-NEXT:    movl %r11d, 8(%rax)
; CHECK-BASELINE-NEXT:    movl %r9d, 4(%rax)
; CHECK-BASELINE-NEXT:    movl %edx, (%rax)
; CHECK-BASELINE-NEXT:    popq %rbx
; CHECK-BASELINE-NEXT:    popq %r14
; CHECK-BASELINE-NEXT:    popq %rbp
; CHECK-BASELINE-NEXT:    retq
;
; CHECK-SSE1-LABEL: out_v8i32:
; CHECK-SSE1:       # %bb.0:
; CHECK-SSE1-NEXT:    pushq %rbp
; CHECK-SSE1-NEXT:    pushq %r14
; CHECK-SSE1-NEXT:    pushq %rbx
; CHECK-SSE1-NEXT:    movq %rdi, %rax
; CHECK-SSE1-NEXT:    movl 28(%rdx), %edi
; CHECK-SSE1-NEXT:    movl 24(%rdx), %r8d
; CHECK-SSE1-NEXT:    movl 20(%rdx), %r10d
; CHECK-SSE1-NEXT:    movl 16(%rdx), %ebx
; CHECK-SSE1-NEXT:    movl 12(%rdx), %r14d
; CHECK-SSE1-NEXT:    movl 8(%rdx), %ebp
; CHECK-SSE1-NEXT:    movl (%rdx), %r9d
; CHECK-SSE1-NEXT:    movl 4(%rdx), %r11d
; CHECK-SSE1-NEXT:    movl (%rsi), %edx
; CHECK-SSE1-NEXT:    xorl %r9d, %edx
; CHECK-SSE1-NEXT:    andl (%rcx), %edx
; CHECK-SSE1-NEXT:    xorl %r9d, %edx
; CHECK-SSE1-NEXT:    movl 4(%rsi), %r9d
; CHECK-SSE1-NEXT:    xorl %r11d, %r9d
; CHECK-SSE1-NEXT:    andl 4(%rcx), %r9d
; CHECK-SSE1-NEXT:    xorl %r11d, %r9d
; CHECK-SSE1-NEXT:    movl 8(%rsi), %r11d
; CHECK-SSE1-NEXT:    xorl %ebp, %r11d
; CHECK-SSE1-NEXT:    andl 8(%rcx), %r11d
; CHECK-SSE1-NEXT:    xorl %ebp, %r11d
; CHECK-SSE1-NEXT:    movl 12(%rsi), %ebp
; CHECK-SSE1-NEXT:    xorl %r14d, %ebp
; CHECK-SSE1-NEXT:    andl 12(%rcx), %ebp
; CHECK-SSE1-NEXT:    xorl %r14d, %ebp
; CHECK-SSE1-NEXT:    movl 16(%rsi), %r14d
; CHECK-SSE1-NEXT:    xorl %ebx, %r14d
; CHECK-SSE1-NEXT:    andl 16(%rcx), %r14d
; CHECK-SSE1-NEXT:    xorl %ebx, %r14d
; CHECK-SSE1-NEXT:    movl 20(%rsi), %ebx
; CHECK-SSE1-NEXT:    xorl %r10d, %ebx
; CHECK-SSE1-NEXT:    andl 20(%rcx), %ebx
; CHECK-SSE1-NEXT:    xorl %r10d, %ebx
; CHECK-SSE1-NEXT:    movl 24(%rsi), %r10d
; CHECK-SSE1-NEXT:    xorl %r8d, %r10d
; CHECK-SSE1-NEXT:    andl 24(%rcx), %r10d
; CHECK-SSE1-NEXT:    xorl %r8d, %r10d
; CHECK-SSE1-NEXT:    movl 28(%rsi), %esi
; CHECK-SSE1-NEXT:    xorl %edi, %esi
; CHECK-SSE1-NEXT:    andl 28(%rcx), %esi
; CHECK-SSE1-NEXT:    xorl %edi, %esi
; CHECK-SSE1-NEXT:    movl %esi, 28(%rax)
; CHECK-SSE1-NEXT:    movl %r10d, 24(%rax)
; CHECK-SSE1-NEXT:    movl %ebx, 20(%rax)
; CHECK-SSE1-NEXT:    movl %r14d, 16(%rax)
; CHECK-SSE1-NEXT:    movl %ebp, 12(%rax)
; CHECK-SSE1-NEXT:    movl %r11d, 8(%rax)
; CHECK-SSE1-NEXT:    movl %r9d, 4(%rax)
; CHECK-SSE1-NEXT:    movl %edx, (%rax)
; CHECK-SSE1-NEXT:    popq %rbx
; CHECK-SSE1-NEXT:    popq %r14
; CHECK-SSE1-NEXT:    popq %rbp
; CHECK-SSE1-NEXT:    retq
;
; CHECK-SSE2-LABEL: out_v8i32:
; CHECK-SSE2:       # %bb.0:
; CHECK-SSE2-NEXT:    movaps (%rdx), %xmm0
; CHECK-SSE2-NEXT:    movaps 16(%rdx), %xmm1
; CHECK-SSE2-NEXT:    movaps 16(%rdi), %xmm2
; CHECK-SSE2-NEXT:    andps %xmm1, %xmm2
; CHECK-SSE2-NEXT:    movaps (%rdi), %xmm3
; CHECK-SSE2-NEXT:    andps %xmm0, %xmm3
; CHECK-SSE2-NEXT:    andnps 16(%rsi), %xmm1
; CHECK-SSE2-NEXT:    orps %xmm2, %xmm1
; CHECK-SSE2-NEXT:    andnps (%rsi), %xmm0
; CHECK-SSE2-NEXT:    orps %xmm3, %xmm0
; CHECK-SSE2-NEXT:    retq
;
; CHECK-XOP-LABEL: out_v8i32:
; CHECK-XOP:       # %bb.0:
; CHECK-XOP-NEXT:    vmovdqa (%rdi), %ymm0
; CHECK-XOP-NEXT:    vmovdqa (%rdx), %ymm1
; CHECK-XOP-NEXT:    vpcmov %ymm1, (%rsi), %ymm0, %ymm0
; CHECK-XOP-NEXT:    retq
  %x = load <8 x i32>, ptr %px, align 32
  %y = load <8 x i32>, ptr %py, align 32
  %mask = load <8 x i32>, ptr %pmask, align 32
  %mx = and <8 x i32> %x, %mask
  %notmask = xor <8 x i32> %mask, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
  %my = and <8 x i32> %y, %notmask
  %r = or <8 x i32> %mx, %my
  ret <8 x i32> %r
}

define <4 x i64> @out_v4i64(ptr %px, ptr %py, ptr %pmask) nounwind {
; CHECK-BASELINE-LABEL: out_v4i64:
; CHECK-BASELINE:       # %bb.0:
; CHECK-BASELINE-NEXT:    movq %rdi, %rax
; CHECK-BASELINE-NEXT:    movq 24(%rdx), %rdi
; CHECK-BASELINE-NEXT:    movq 16(%rdx), %r8
; CHECK-BASELINE-NEXT:    movq (%rdx), %r9
; CHECK-BASELINE-NEXT:    movq 8(%rdx), %r10
; CHECK-BASELINE-NEXT:    movq (%rsi), %rdx
; CHECK-BASELINE-NEXT:    xorq %r9, %rdx
; CHECK-BASELINE-NEXT:    andq (%rcx), %rdx
; CHECK-BASELINE-NEXT:    xorq %r9, %rdx
; CHECK-BASELINE-NEXT:    movq 8(%rsi), %r9
; CHECK-BASELINE-NEXT:    xorq %r10, %r9
; CHECK-BASELINE-NEXT:    andq 8(%rcx), %r9
; CHECK-BASELINE-NEXT:    xorq %r10, %r9
; CHECK-BASELINE-NEXT:    movq 16(%rsi), %r10
; CHECK-BASELINE-NEXT:    xorq %r8, %r10
; CHECK-BASELINE-NEXT:    andq 16(%rcx), %r10
; CHECK-BASELINE-NEXT:    xorq %r8, %r10
; CHECK-BASELINE-NEXT:    movq 24(%rsi), %rsi
; CHECK-BASELINE-NEXT:    xorq %rdi, %rsi
; CHECK-BASELINE-NEXT:    andq 24(%rcx), %rsi
; CHECK-BASELINE-NEXT:    xorq %rdi, %rsi
; CHECK-BASELINE-NEXT:    movq %rsi, 24(%rax)
; CHECK-BASELINE-NEXT:    movq %r10, 16(%rax)
; CHECK-BASELINE-NEXT:    movq %r9, 8(%rax)
; CHECK-BASELINE-NEXT:    movq %rdx, (%rax)
; CHECK-BASELINE-NEXT:    retq
;
; CHECK-SSE1-LABEL: out_v4i64:
; CHECK-SSE1:       # %bb.0:
; CHECK-SSE1-NEXT:    movq %rdi, %rax
; CHECK-SSE1-NEXT:    movq 24(%rdx), %rdi
; CHECK-SSE1-NEXT:    movq 16(%rdx), %r8
; CHECK-SSE1-NEXT:    movq (%rdx), %r9
; CHECK-SSE1-NEXT:    movq 8(%rdx), %r10
; CHECK-SSE1-NEXT:    movq (%rsi), %rdx
; CHECK-SSE1-NEXT:    xorq %r9, %rdx
; CHECK-SSE1-NEXT:    andq (%rcx), %rdx
; CHECK-SSE1-NEXT:    xorq %r9, %rdx
; CHECK-SSE1-NEXT:    movq 8(%rsi), %r9
; CHECK-SSE1-NEXT:    xorq %r10, %r9
; CHECK-SSE1-NEXT:    andq 8(%rcx), %r9
; CHECK-SSE1-NEXT:    xorq %r10, %r9
; CHECK-SSE1-NEXT:    movq 16(%rsi), %r10
; CHECK-SSE1-NEXT:    xorq %r8, %r10
; CHECK-SSE1-NEXT:    andq 16(%rcx), %r10
; CHECK-SSE1-NEXT:    xorq %r8, %r10
; CHECK-SSE1-NEXT:    movq 24(%rsi), %rsi
; CHECK-SSE1-NEXT:    xorq %rdi, %rsi
; CHECK-SSE1-NEXT:    andq 24(%rcx), %rsi
; CHECK-SSE1-NEXT:    xorq %rdi, %rsi
; CHECK-SSE1-NEXT:    movq %rsi, 24(%rax)
; CHECK-SSE1-NEXT:    movq %r10, 16(%rax)
; CHECK-SSE1-NEXT:    movq %r9, 8(%rax)
; CHECK-SSE1-NEXT:    movq %rdx, (%rax)
; CHECK-SSE1-NEXT:    retq
;
; CHECK-SSE2-LABEL: out_v4i64:
; CHECK-SSE2:       # %bb.0:
; CHECK-SSE2-NEXT:    movaps (%rdx), %xmm0
; CHECK-SSE2-NEXT:    movaps 16(%rdx), %xmm1
; CHECK-SSE2-NEXT:    movaps 16(%rdi), %xmm2
; CHECK-SSE2-NEXT:    andps %xmm1, %xmm2
; CHECK-SSE2-NEXT:    movaps (%rdi), %xmm3
; CHECK-SSE2-NEXT:    andps %xmm0, %xmm3
; CHECK-SSE2-NEXT:    andnps 16(%rsi), %xmm1
; CHECK-SSE2-NEXT:    orps %xmm2, %xmm1
; CHECK-SSE2-NEXT:    andnps (%rsi), %xmm0
; CHECK-SSE2-NEXT:    orps %xmm3, %xmm0
; CHECK-SSE2-NEXT:    retq
;
; CHECK-XOP-LABEL: out_v4i64:
; CHECK-XOP:       # %bb.0:
; CHECK-XOP-NEXT:    vmovdqa (%rdi), %ymm0
; CHECK-XOP-NEXT:    vmovdqa (%rdx), %ymm1
; CHECK-XOP-NEXT:    vpcmov %ymm1, (%rsi), %ymm0, %ymm0
; CHECK-XOP-NEXT:    retq
  %x = load <4 x i64>, ptr %px, align 32
  %y = load <4 x i64>, ptr %py, align 32
  %mask = load <4 x i64>, ptr %pmask, align 32
  %mx = and <4 x i64> %x, %mask
  %notmask = xor <4 x i64> %mask, <i64 -1, i64 -1, i64 -1, i64 -1>
  %my = and <4 x i64> %y, %notmask
  %r = or <4 x i64> %mx, %my
  ret <4 x i64> %r
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; The in_* tests below compute the same masked merge as the out_* tests above,
; so each one should produce the same code as its out_* counterpart.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
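; They just express the merge in the xor/and/xor form, which is equivalent:
;   ((x ^ y) & mask) ^ y  ==  (x & mask) | (y & ~mask)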

; ============================================================================ ;
; 8-bit vector width
; ============================================================================ ;

define <1 x i8> @in_v1i8(<1 x i8> %x, <1 x i8> %y, <1 x i8> %mask) nounwind {
; CHECK-LABEL: in_v1i8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movl %edi, %eax
; CHECK-NEXT:    xorl %esi, %eax
; CHECK-NEXT:    andl %edx, %eax
; CHECK-NEXT:    xorl %esi, %eax
; CHECK-NEXT:    # kill: def $al killed $al killed $eax
; CHECK-NEXT:    retq
  %n0 = xor <1 x i8> %x, %y
  %n1 = and <1 x i8> %n0, %mask
  %r = xor <1 x i8> %n1, %y
  ret <1 x i8> %r
}

; ============================================================================ ;
; 16-bit vector width
; ============================================================================ ;

define <2 x i8> @in_v2i8(<2 x i8> %x, <2 x i8> %y, <2 x i8> %mask) nounwind {
; CHECK-BASELINE-LABEL: in_v2i8:
; CHECK-BASELINE:       # %bb.0:
; CHECK-BASELINE-NEXT:    movl %edi, %eax
; CHECK-BASELINE-NEXT:    xorl %edx, %eax
; CHECK-BASELINE-NEXT:    xorl %ecx, %esi
; CHECK-BASELINE-NEXT:    andl %r9d, %esi
; CHECK-BASELINE-NEXT:    andl %r8d, %eax
; CHECK-BASELINE-NEXT:    xorl %edx, %eax
; CHECK-BASELINE-NEXT:    xorl %ecx, %esi
; CHECK-BASELINE-NEXT:    # kill: def $al killed $al killed $eax
; CHECK-BASELINE-NEXT:    movl %esi, %edx
; CHECK-BASELINE-NEXT:    retq
;
; CHECK-SSE1-LABEL: in_v2i8:
; CHECK-SSE1:       # %bb.0:
; CHECK-SSE1-NEXT:    movl %edi, %eax
; CHECK-SSE1-NEXT:    xorl %edx, %eax
; CHECK-SSE1-NEXT:    xorl %ecx, %esi
; CHECK-SSE1-NEXT:    andl %r9d, %esi
; CHECK-SSE1-NEXT:    andl %r8d, %eax
; CHECK-SSE1-NEXT:    xorl %edx, %eax
; CHECK-SSE1-NEXT:    xorl %ecx, %esi
; CHECK-SSE1-NEXT:    # kill: def $al killed $al killed $eax
; CHECK-SSE1-NEXT:    movl %esi, %edx
; CHECK-SSE1-NEXT:    retq
;
; CHECK-SSE2-LABEL: in_v2i8:
; CHECK-SSE2:       # %bb.0:
; CHECK-SSE2-NEXT:    andps %xmm2, %xmm0
; CHECK-SSE2-NEXT:    andnps %xmm1, %xmm2
; CHECK-SSE2-NEXT:    orps %xmm2, %xmm0
; CHECK-SSE2-NEXT:    retq
;
; CHECK-XOP-LABEL: in_v2i8:
; CHECK-XOP:       # %bb.0:
; CHECK-XOP-NEXT:    vpcmov %xmm2, %xmm1, %xmm0, %xmm0
; CHECK-XOP-NEXT:    retq
  %n0 = xor <2 x i8> %x, %y
  %n1 = and <2 x i8> %n0, %mask
  %r = xor <2 x i8> %n1, %y
  ret <2 x i8> %r
}

define <1 x i16> @in_v1i16(<1 x i16> %x, <1 x i16> %y, <1 x i16> %mask) nounwind {
; CHECK-LABEL: in_v1i16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movl %edi, %eax
; CHECK-NEXT:    xorl %esi, %eax
; CHECK-NEXT:    andl %edx, %eax
; CHECK-NEXT:    xorl %esi, %eax
; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
; CHECK-NEXT:    retq
  %n0 = xor <1 x i16> %x, %y
  %n1 = and <1 x i16> %n0, %mask
  %r = xor <1 x i16> %n1, %y
  ret <1 x i16> %r
}

; ============================================================================ ;
; 32-bit vector width
; ============================================================================ ;

define <4 x i8> @in_v4i8(<4 x i8> %x, <4 x i8> %y, <4 x i8> %mask) nounwind {
; CHECK-BASELINE-LABEL: in_v4i8:
; CHECK-BASELINE:       # %bb.0:
; CHECK-BASELINE-NEXT:    movq %rdi, %rax
; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %edi
; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %r10d
; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %r11d
; CHECK-BASELINE-NEXT:    xorl %r9d, %esi
; CHECK-BASELINE-NEXT:    xorb %r11b, %dl
; CHECK-BASELINE-NEXT:    xorb %r10b, %cl
; CHECK-BASELINE-NEXT:    xorb %dil, %r8b
; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %r8b
; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %cl
; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %dl
; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %sil
; CHECK-BASELINE-NEXT:    xorb %r9b, %sil
; CHECK-BASELINE-NEXT:    xorb %r11b, %dl
; CHECK-BASELINE-NEXT:    xorb %r10b, %cl
; CHECK-BASELINE-NEXT:    xorb %dil, %r8b
; CHECK-BASELINE-NEXT:    movb %r8b, 3(%rax)
; CHECK-BASELINE-NEXT:    movb %cl, 2(%rax)
; CHECK-BASELINE-NEXT:    movb %dl, 1(%rax)
; CHECK-BASELINE-NEXT:    movb %sil, (%rax)
; CHECK-BASELINE-NEXT:    retq
;
; CHECK-SSE1-LABEL: in_v4i8:
; CHECK-SSE1:       # %bb.0:
; CHECK-SSE1-NEXT:    movq %rdi, %rax
; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %edi
; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %r10d
; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %r11d
; CHECK-SSE1-NEXT:    xorl %r9d, %esi
; CHECK-SSE1-NEXT:    xorb %r11b, %dl
; CHECK-SSE1-NEXT:    xorb %r10b, %cl
; CHECK-SSE1-NEXT:    xorb %dil, %r8b
; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %r8b
; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %cl
; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %dl
; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %sil
; CHECK-SSE1-NEXT:    xorb %r9b, %sil
; CHECK-SSE1-NEXT:    xorb %r11b, %dl
; CHECK-SSE1-NEXT:    xorb %r10b, %cl
; CHECK-SSE1-NEXT:    xorb %dil, %r8b
; CHECK-SSE1-NEXT:    movb %r8b, 3(%rax)
; CHECK-SSE1-NEXT:    movb %cl, 2(%rax)
; CHECK-SSE1-NEXT:    movb %dl, 1(%rax)
; CHECK-SSE1-NEXT:    movb %sil, (%rax)
; CHECK-SSE1-NEXT:    retq
;
; CHECK-SSE2-LABEL: in_v4i8:
; CHECK-SSE2:       # %bb.0:
; CHECK-SSE2-NEXT:    andps %xmm2, %xmm0
; CHECK-SSE2-NEXT:    andnps %xmm1, %xmm2
; CHECK-SSE2-NEXT:    orps %xmm2, %xmm0
; CHECK-SSE2-NEXT:    retq
;
; CHECK-XOP-LABEL: in_v4i8:
; CHECK-XOP:       # %bb.0:
; CHECK-XOP-NEXT:    vpcmov %xmm2, %xmm1, %xmm0, %xmm0
; CHECK-XOP-NEXT:    retq
  %n0 = xor <4 x i8> %x, %y
  %n1 = and <4 x i8> %n0, %mask
  %r = xor <4 x i8> %n1, %y
  ret <4 x i8> %r
}

define <2 x i16> @in_v2i16(<2 x i16> %x, <2 x i16> %y, <2 x i16> %mask) nounwind {
; CHECK-BASELINE-LABEL: in_v2i16:
; CHECK-BASELINE:       # %bb.0:
; CHECK-BASELINE-NEXT:    movl %edi, %eax
; CHECK-BASELINE-NEXT:    xorl %edx, %eax
; CHECK-BASELINE-NEXT:    xorl %ecx, %esi
; CHECK-BASELINE-NEXT:    andl %r9d, %esi
; CHECK-BASELINE-NEXT:    andl %r8d, %eax
; CHECK-BASELINE-NEXT:    xorl %edx, %eax
; CHECK-BASELINE-NEXT:    xorl %ecx, %esi
; CHECK-BASELINE-NEXT:    # kill: def $ax killed $ax killed $eax
; CHECK-BASELINE-NEXT:    movl %esi, %edx
; CHECK-BASELINE-NEXT:    retq
;
; CHECK-SSE1-LABEL: in_v2i16:
; CHECK-SSE1:       # %bb.0:
; CHECK-SSE1-NEXT:    movl %edi, %eax
; CHECK-SSE1-NEXT:    xorl %edx, %eax
; CHECK-SSE1-NEXT:    xorl %ecx, %esi
; CHECK-SSE1-NEXT:    andl %r9d, %esi
; CHECK-SSE1-NEXT:    andl %r8d, %eax
; CHECK-SSE1-NEXT:    xorl %edx, %eax
; CHECK-SSE1-NEXT:    xorl %ecx, %esi
; CHECK-SSE1-NEXT:    # kill: def $ax killed $ax killed $eax
; CHECK-SSE1-NEXT:    movl %esi, %edx
; CHECK-SSE1-NEXT:    retq
;
; CHECK-SSE2-LABEL: in_v2i16:
; CHECK-SSE2:       # %bb.0:
; CHECK-SSE2-NEXT:    andps %xmm2, %xmm0
; CHECK-SSE2-NEXT:    andnps %xmm1, %xmm2
; CHECK-SSE2-NEXT:    orps %xmm2, %xmm0
; CHECK-SSE2-NEXT:    retq
;
; CHECK-XOP-LABEL: in_v2i16:
; CHECK-XOP:       # %bb.0:
; CHECK-XOP-NEXT:    vpcmov %xmm2, %xmm1, %xmm0, %xmm0
; CHECK-XOP-NEXT:    retq
  %n0 = xor <2 x i16> %x, %y
  %n1 = and <2 x i16> %n0, %mask
  %r = xor <2 x i16> %n1, %y
  ret <2 x i16> %r
}

define <1 x i32> @in_v1i32(<1 x i32> %x, <1 x i32> %y, <1 x i32> %mask) nounwind {
; CHECK-LABEL: in_v1i32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movl %edi, %eax
; CHECK-NEXT:    xorl %esi, %eax
; CHECK-NEXT:    andl %edx, %eax
; CHECK-NEXT:    xorl %esi, %eax
; CHECK-NEXT:    retq
  %n0 = xor <1 x i32> %x, %y
  %n1 = and <1 x i32> %n0, %mask
  %r = xor <1 x i32> %n1, %y
  ret <1 x i32> %r
}

; ============================================================================ ;
; 64-bit vector width
; ============================================================================ ;

define <8 x i8> @in_v8i8(<8 x i8> %x, <8 x i8> %y, <8 x i8> %mask) nounwind {
; CHECK-BASELINE-LABEL: in_v8i8:
; CHECK-BASELINE:       # %bb.0:
; CHECK-BASELINE-NEXT:    pushq %rbp
; CHECK-BASELINE-NEXT:    pushq %r15
; CHECK-BASELINE-NEXT:    pushq %r14
; CHECK-BASELINE-NEXT:    pushq %r13
; CHECK-BASELINE-NEXT:    pushq %r12
; CHECK-BASELINE-NEXT:    pushq %rbx
; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %r10d
; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %ebx
; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %ebp
; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %r14d
; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %r12d
; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %r11d
; CHECK-BASELINE-NEXT:    xorb %r11b, %sil
; CHECK-BASELINE-NEXT:    xorb %r12b, %dl
; CHECK-BASELINE-NEXT:    xorb %r14b, %cl
; CHECK-BASELINE-NEXT:    xorb %bpl, %r8b
; CHECK-BASELINE-NEXT:    xorb %bl, %r9b
; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %r15d
; CHECK-BASELINE-NEXT:    xorb {{[0-9]+}}(%rsp), %r15b
; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %r13d
; CHECK-BASELINE-NEXT:    xorb {{[0-9]+}}(%rsp), %r13b
; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; CHECK-BASELINE-NEXT:    xorb %r10b, %al
; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %r9b
; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %r8b
; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %cl
; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %dl
; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %sil
; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %al
; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %r13b
; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %r15b
; CHECK-BASELINE-NEXT:    xorb %r11b, %sil
; CHECK-BASELINE-NEXT:    xorb %r12b, %dl
; CHECK-BASELINE-NEXT:    xorb %r14b, %cl
; CHECK-BASELINE-NEXT:    xorb %bpl, %r8b
; CHECK-BASELINE-NEXT:    xorb %bl, %r9b
; CHECK-BASELINE-NEXT:    xorb {{[0-9]+}}(%rsp), %r15b
; CHECK-BASELINE-NEXT:    xorb {{[0-9]+}}(%rsp), %r13b
; CHECK-BASELINE-NEXT:    xorb %r10b, %al
; CHECK-BASELINE-NEXT:    movb %al, 7(%rdi)
; CHECK-BASELINE-NEXT:    movb %r13b, 6(%rdi)
; CHECK-BASELINE-NEXT:    movb %r15b, 5(%rdi)
; CHECK-BASELINE-NEXT:    movb %r9b, 4(%rdi)
; CHECK-BASELINE-NEXT:    movb %r8b, 3(%rdi)
; CHECK-BASELINE-NEXT:    movb %cl, 2(%rdi)
; CHECK-BASELINE-NEXT:    movb %dl, 1(%rdi)
; CHECK-BASELINE-NEXT:    movb %sil, (%rdi)
; CHECK-BASELINE-NEXT:    movq %rdi, %rax
; CHECK-BASELINE-NEXT:    popq %rbx
; CHECK-BASELINE-NEXT:    popq %r12
; CHECK-BASELINE-NEXT:    popq %r13
; CHECK-BASELINE-NEXT:    popq %r14
; CHECK-BASELINE-NEXT:    popq %r15
; CHECK-BASELINE-NEXT:    popq %rbp
; CHECK-BASELINE-NEXT:    retq
;
; CHECK-SSE1-LABEL: in_v8i8:
; CHECK-SSE1:       # %bb.0:
; CHECK-SSE1-NEXT:    pushq %rbp
; CHECK-SSE1-NEXT:    pushq %r15
; CHECK-SSE1-NEXT:    pushq %r14
; CHECK-SSE1-NEXT:    pushq %r13
; CHECK-SSE1-NEXT:    pushq %r12
; CHECK-SSE1-NEXT:    pushq %rbx
; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %r10d
; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %ebx
; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %ebp
; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %r14d
; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %r12d
; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %r11d
; CHECK-SSE1-NEXT:    xorb %r11b, %sil
; CHECK-SSE1-NEXT:    xorb %r12b, %dl
; CHECK-SSE1-NEXT:    xorb %r14b, %cl
; CHECK-SSE1-NEXT:    xorb %bpl, %r8b
; CHECK-SSE1-NEXT:    xorb %bl, %r9b
; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %r15d
; CHECK-SSE1-NEXT:    xorb {{[0-9]+}}(%rsp), %r15b
; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %r13d
; CHECK-SSE1-NEXT:    xorb {{[0-9]+}}(%rsp), %r13b
; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; CHECK-SSE1-NEXT:    xorb %r10b, %al
; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %r9b
; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %r8b
; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %cl
; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %dl
; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %sil
; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %al
; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %r13b
; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %r15b
; CHECK-SSE1-NEXT:    xorb %r11b, %sil
; CHECK-SSE1-NEXT:    xorb %r12b, %dl
; CHECK-SSE1-NEXT:    xorb %r14b, %cl
; CHECK-SSE1-NEXT:    xorb %bpl, %r8b
; CHECK-SSE1-NEXT:    xorb %bl, %r9b
; CHECK-SSE1-NEXT:    xorb {{[0-9]+}}(%rsp), %r15b
; CHECK-SSE1-NEXT:    xorb {{[0-9]+}}(%rsp), %r13b
; CHECK-SSE1-NEXT:    xorb %r10b, %al
; CHECK-SSE1-NEXT:    movb %al, 7(%rdi)
; CHECK-SSE1-NEXT:    movb %r13b, 6(%rdi)
; CHECK-SSE1-NEXT:    movb %r15b, 5(%rdi)
; CHECK-SSE1-NEXT:    movb %r9b, 4(%rdi)
; CHECK-SSE1-NEXT:    movb %r8b, 3(%rdi)
; CHECK-SSE1-NEXT:    movb %cl, 2(%rdi)
; CHECK-SSE1-NEXT:    movb %dl, 1(%rdi)
; CHECK-SSE1-NEXT:    movb %sil, (%rdi)
; CHECK-SSE1-NEXT:    movq %rdi, %rax
; CHECK-SSE1-NEXT:    popq %rbx
; CHECK-SSE1-NEXT:    popq %r12
; CHECK-SSE1-NEXT:    popq %r13
; CHECK-SSE1-NEXT:    popq %r14
; CHECK-SSE1-NEXT:    popq %r15
; CHECK-SSE1-NEXT:    popq %rbp
; CHECK-SSE1-NEXT:    retq
;
; CHECK-SSE2-LABEL: in_v8i8:
; CHECK-SSE2:       # %bb.0:
; CHECK-SSE2-NEXT:    andps %xmm2, %xmm0
; CHECK-SSE2-NEXT:    andnps %xmm1, %xmm2
; CHECK-SSE2-NEXT:    orps %xmm2, %xmm0
; CHECK-SSE2-NEXT:    retq
;
; CHECK-XOP-LABEL: in_v8i8:
; CHECK-XOP:       # %bb.0:
; CHECK-XOP-NEXT:    vpcmov %xmm2, %xmm1, %xmm0, %xmm0
; CHECK-XOP-NEXT:    retq
  %n0 = xor <8 x i8> %x, %y
  %n1 = and <8 x i8> %n0, %mask
  %r = xor <8 x i8> %n1, %y
  ret <8 x i8> %r
}

define <4 x i16> @in_v4i16(<4 x i16> %x, <4 x i16> %y, <4 x i16> %mask) nounwind {
; CHECK-BASELINE-LABEL: in_v4i16:
; CHECK-BASELINE:       # %bb.0:
; CHECK-BASELINE-NEXT:    movq %rdi, %rax
; CHECK-BASELINE-NEXT:    movl {{[0-9]+}}(%rsp), %edi
; CHECK-BASELINE-NEXT:    movl {{[0-9]+}}(%rsp), %r10d
; CHECK-BASELINE-NEXT:    movl {{[0-9]+}}(%rsp), %r11d
; CHECK-BASELINE-NEXT:    xorl %r9d, %esi
; CHECK-BASELINE-NEXT:    xorl %r11d, %edx
; CHECK-BASELINE-NEXT:    xorl %r10d, %ecx
; CHECK-BASELINE-NEXT:    xorl %edi, %r8d
; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %r8w
; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %cx
; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %dx
; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %si
; CHECK-BASELINE-NEXT:    xorl %r9d, %esi
; CHECK-BASELINE-NEXT:    xorl %r11d, %edx
; CHECK-BASELINE-NEXT:    xorl %r10d, %ecx
; CHECK-BASELINE-NEXT:    xorl %edi, %r8d
; CHECK-BASELINE-NEXT:    movw %r8w, 6(%rax)
; CHECK-BASELINE-NEXT:    movw %cx, 4(%rax)
; CHECK-BASELINE-NEXT:    movw %dx, 2(%rax)
; CHECK-BASELINE-NEXT:    movw %si, (%rax)
; CHECK-BASELINE-NEXT:    retq
;
; CHECK-SSE1-LABEL: in_v4i16:
; CHECK-SSE1:       # %bb.0:
; CHECK-SSE1-NEXT:    movq %rdi, %rax
; CHECK-SSE1-NEXT:    movl {{[0-9]+}}(%rsp), %edi
; CHECK-SSE1-NEXT:    movl {{[0-9]+}}(%rsp), %r10d
; CHECK-SSE1-NEXT:    movl {{[0-9]+}}(%rsp), %r11d
; CHECK-SSE1-NEXT:    xorl %r9d, %esi
; CHECK-SSE1-NEXT:    xorl %r11d, %edx
; CHECK-SSE1-NEXT:    xorl %r10d, %ecx
; CHECK-SSE1-NEXT:    xorl %edi, %r8d
; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %r8w
; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %cx
; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %dx
; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %si
; CHECK-SSE1-NEXT:    xorl %r9d, %esi
; CHECK-SSE1-NEXT:    xorl %r11d, %edx
; CHECK-SSE1-NEXT:    xorl %r10d, %ecx
; CHECK-SSE1-NEXT:    xorl %edi, %r8d
; CHECK-SSE1-NEXT:    movw %r8w, 6(%rax)
; CHECK-SSE1-NEXT:    movw %cx, 4(%rax)
; CHECK-SSE1-NEXT:    movw %dx, 2(%rax)
; CHECK-SSE1-NEXT:    movw %si, (%rax)
; CHECK-SSE1-NEXT:    retq
;
; CHECK-SSE2-LABEL: in_v4i16:
; CHECK-SSE2:       # %bb.0:
; CHECK-SSE2-NEXT:    andps %xmm2, %xmm0
; CHECK-SSE2-NEXT:    andnps %xmm1, %xmm2
; CHECK-SSE2-NEXT:    orps %xmm2, %xmm0
; CHECK-SSE2-NEXT:    retq
;
; CHECK-XOP-LABEL: in_v4i16:
; CHECK-XOP:       # %bb.0:
; CHECK-XOP-NEXT:    vpcmov %xmm2, %xmm1, %xmm0, %xmm0
; CHECK-XOP-NEXT:    retq
  %n0 = xor <4 x i16> %x, %y
  %n1 = and <4 x i16> %n0, %mask
  %r = xor <4 x i16> %n1, %y
  ret <4 x i16> %r
}

define <2 x i32> @in_v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %mask) nounwind {
; CHECK-BASELINE-LABEL: in_v2i32:
; CHECK-BASELINE:       # %bb.0:
; CHECK-BASELINE-NEXT:    movl %edi, %eax
; CHECK-BASELINE-NEXT:    xorl %edx, %eax
; CHECK-BASELINE-NEXT:    xorl %ecx, %esi
; CHECK-BASELINE-NEXT:    andl %r9d, %esi
; CHECK-BASELINE-NEXT:    andl %r8d, %eax
; CHECK-BASELINE-NEXT:    xorl %edx, %eax
; CHECK-BASELINE-NEXT:    xorl %ecx, %esi
; CHECK-BASELINE-NEXT:    movl %esi, %edx
; CHECK-BASELINE-NEXT:    retq
;
; CHECK-SSE1-LABEL: in_v2i32:
; CHECK-SSE1:       # %bb.0:
; CHECK-SSE1-NEXT:    movl %edi, %eax
; CHECK-SSE1-NEXT:    xorl %edx, %eax
; CHECK-SSE1-NEXT:    xorl %ecx, %esi
; CHECK-SSE1-NEXT:    andl %r9d, %esi
; CHECK-SSE1-NEXT:    andl %r8d, %eax
; CHECK-SSE1-NEXT:    xorl %edx, %eax
; CHECK-SSE1-NEXT:    xorl %ecx, %esi
; CHECK-SSE1-NEXT:    movl %esi, %edx
; CHECK-SSE1-NEXT:    retq
;
; CHECK-SSE2-LABEL: in_v2i32:
; CHECK-SSE2:       # %bb.0:
; CHECK-SSE2-NEXT:    andps %xmm2, %xmm0
; CHECK-SSE2-NEXT:    andnps %xmm1, %xmm2
; CHECK-SSE2-NEXT:    orps %xmm2, %xmm0
; CHECK-SSE2-NEXT:    retq
;
; CHECK-XOP-LABEL: in_v2i32:
; CHECK-XOP:       # %bb.0:
; CHECK-XOP-NEXT:    vpcmov %xmm2, %xmm1, %xmm0, %xmm0
; CHECK-XOP-NEXT:    retq
  %n0 = xor <2 x i32> %x, %y
  %n1 = and <2 x i32> %n0, %mask
  %r = xor <2 x i32> %n1, %y
  ret <2 x i32> %r
}

define <1 x i64> @in_v1i64(<1 x i64> %x, <1 x i64> %y, <1 x i64> %mask) nounwind {
; CHECK-LABEL: in_v1i64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movq %rdi, %rax
; CHECK-NEXT:    xorq %rsi, %rax
; CHECK-NEXT:    andq %rdx, %rax
; CHECK-NEXT:    xorq %rsi, %rax
; CHECK-NEXT:    retq
  %n0 = xor <1 x i64> %x, %y
  %n1 = and <1 x i64> %n0, %mask
  %r = xor <1 x i64> %n1, %y
  ret <1 x i64> %r
}

; ============================================================================ ;
; 128-bit vector width
; ============================================================================ ;

define <16 x i8> @in_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %mask) nounwind {
; CHECK-BASELINE-LABEL: in_v16i8:
; CHECK-BASELINE:       # %bb.0:
; CHECK-BASELINE-NEXT:    pushq %rbp
; CHECK-BASELINE-NEXT:    pushq %r15
; CHECK-BASELINE-NEXT:    pushq %r14
; CHECK-BASELINE-NEXT:    pushq %r13
; CHECK-BASELINE-NEXT:    pushq %r12
; CHECK-BASELINE-NEXT:    pushq %rbx
; CHECK-BASELINE-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-BASELINE-NEXT:    movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-BASELINE-NEXT:    movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-BASELINE-NEXT:    movq %rdi, %rdx
; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %esi
; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %ecx
; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %ebx
; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %ebp
; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %r14d
; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %r15d
; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %r12d
; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %r13d
; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %r11d
; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %r10d
; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %edi
; CHECK-BASELINE-NEXT:    xorb %dil, %r9b
; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %r9b
; CHECK-BASELINE-NEXT:    xorb %dil, %r9b
; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %edi
; CHECK-BASELINE-NEXT:    xorb %r10b, %dil
; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %dil
; CHECK-BASELINE-NEXT:    xorb %r10b, %dil
; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %r10d
; CHECK-BASELINE-NEXT:    xorb %r11b, %r10b
; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %r10b
; CHECK-BASELINE-NEXT:    xorb %r11b, %r10b
; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %r11d
; CHECK-BASELINE-NEXT:    xorb %r13b, %r11b
; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %r11b
; CHECK-BASELINE-NEXT:    xorb %r13b, %r11b
; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %r13d
; CHECK-BASELINE-NEXT:    xorb %r12b, %r13b
; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %r13b
; CHECK-BASELINE-NEXT:    xorb %r12b, %r13b
; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %r12d
; CHECK-BASELINE-NEXT:    xorb %r15b, %r12b
; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %r12b
; CHECK-BASELINE-NEXT:    xorb %r15b, %r12b
; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %r15d
; CHECK-BASELINE-NEXT:    xorb %r14b, %r15b
; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %r15b
; CHECK-BASELINE-NEXT:    xorb %r14b, %r15b
; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %r14d
; CHECK-BASELINE-NEXT:    xorb %bpl, %r14b
; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %r14b
; CHECK-BASELINE-NEXT:    xorb %bpl, %r14b
; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %ebp
; CHECK-BASELINE-NEXT:    xorb %bl, %bpl
; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %bpl
; CHECK-BASELINE-NEXT:    xorb %bl, %bpl
; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %ebx
; CHECK-BASELINE-NEXT:    xorb %al, %bl
; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %bl
; CHECK-BASELINE-NEXT:    xorb %al, %bl
; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; CHECK-BASELINE-NEXT:    xorb %cl, %al
; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %al
; CHECK-BASELINE-NEXT:    xorb %cl, %al
; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %ecx
; CHECK-BASELINE-NEXT:    xorb %sil, %cl
; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %cl
; CHECK-BASELINE-NEXT:    xorb %sil, %cl
; CHECK-BASELINE-NEXT:    movb %cl, 15(%rdx)
; CHECK-BASELINE-NEXT:    movb %al, 14(%rdx)
; CHECK-BASELINE-NEXT:    movb %bl, 13(%rdx)
; CHECK-BASELINE-NEXT:    movb %bpl, 12(%rdx)
; CHECK-BASELINE-NEXT:    movb %r14b, 11(%rdx)
; CHECK-BASELINE-NEXT:    movb %r15b, 10(%rdx)
; CHECK-BASELINE-NEXT:    movb %r12b, 9(%rdx)
; CHECK-BASELINE-NEXT:    movb %r13b, 8(%rdx)
; CHECK-BASELINE-NEXT:    movb %r11b, 7(%rdx)
; CHECK-BASELINE-NEXT:    movb %r10b, 6(%rdx)
; CHECK-BASELINE-NEXT:    movb %dil, 5(%rdx)
; CHECK-BASELINE-NEXT:    movb %r9b, 4(%rdx)
; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; CHECK-BASELINE-NEXT:    xorb %al, %r8b
; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %r8b
; CHECK-BASELINE-NEXT:    xorb %al, %r8b
; CHECK-BASELINE-NEXT:    movb %r8b, 3(%rdx)
; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; CHECK-BASELINE-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload
; CHECK-BASELINE-NEXT:    xorb %al, %cl
; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %cl
; CHECK-BASELINE-NEXT:    xorb %al, %cl
; CHECK-BASELINE-NEXT:    movb %cl, 2(%rdx)
; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; CHECK-BASELINE-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload
; CHECK-BASELINE-NEXT:    xorb %al, %cl
; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %cl
; CHECK-BASELINE-NEXT:    xorb %al, %cl
; CHECK-BASELINE-NEXT:    movb %cl, 1(%rdx)
; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; CHECK-BASELINE-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload
; CHECK-BASELINE-NEXT:    xorb %al, %cl
; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %cl
; CHECK-BASELINE-NEXT:    xorb %al, %cl
; CHECK-BASELINE-NEXT:    movb %cl, (%rdx)
; CHECK-BASELINE-NEXT:    movq %rdx, %rax
; CHECK-BASELINE-NEXT:    popq %rbx
; CHECK-BASELINE-NEXT:    popq %r12
; CHECK-BASELINE-NEXT:    popq %r13
; CHECK-BASELINE-NEXT:    popq %r14
; CHECK-BASELINE-NEXT:    popq %r15
; CHECK-BASELINE-NEXT:    popq %rbp
; CHECK-BASELINE-NEXT:    retq
;
; CHECK-SSE1-LABEL: in_v16i8:
; CHECK-SSE1:       # %bb.0:
; CHECK-SSE1-NEXT:    pushq %rbp
; CHECK-SSE1-NEXT:    pushq %r15
; CHECK-SSE1-NEXT:    pushq %r14
; CHECK-SSE1-NEXT:    pushq %r13
; CHECK-SSE1-NEXT:    pushq %r12
; CHECK-SSE1-NEXT:    pushq %rbx
; CHECK-SSE1-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-SSE1-NEXT:    movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-SSE1-NEXT:    movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-SSE1-NEXT:    movq %rdi, %rdx
; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %esi
; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %ecx
; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %ebx
; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %ebp
; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %r14d
; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %r15d
; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %r12d
; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %r13d
; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %r11d
; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %r10d
; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %edi
; CHECK-SSE1-NEXT:    xorb %dil, %r9b
; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %r9b
; CHECK-SSE1-NEXT:    xorb %dil, %r9b
; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %edi
; CHECK-SSE1-NEXT:    xorb %r10b, %dil
; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %dil
; CHECK-SSE1-NEXT:    xorb %r10b, %dil
; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %r10d
; CHECK-SSE1-NEXT:    xorb %r11b, %r10b
; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %r10b
; CHECK-SSE1-NEXT:    xorb %r11b, %r10b
; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %r11d
; CHECK-SSE1-NEXT:    xorb %r13b, %r11b
; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %r11b
; CHECK-SSE1-NEXT:    xorb %r13b, %r11b
; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %r13d
; CHECK-SSE1-NEXT:    xorb %r12b, %r13b
; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %r13b
; CHECK-SSE1-NEXT:    xorb %r12b, %r13b
; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %r12d
; CHECK-SSE1-NEXT:    xorb %r15b, %r12b
; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %r12b
; CHECK-SSE1-NEXT:    xorb %r15b, %r12b
; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %r15d
; CHECK-SSE1-NEXT:    xorb %r14b, %r15b
; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %r15b
; CHECK-SSE1-NEXT:    xorb %r14b, %r15b
; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %r14d
; CHECK-SSE1-NEXT:    xorb %bpl, %r14b
; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %r14b
; CHECK-SSE1-NEXT:    xorb %bpl, %r14b
; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %ebp
; CHECK-SSE1-NEXT:    xorb %bl, %bpl
; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %bpl
; CHECK-SSE1-NEXT:    xorb %bl, %bpl
; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %ebx
; CHECK-SSE1-NEXT:    xorb %al, %bl
; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %bl
; CHECK-SSE1-NEXT:    xorb %al, %bl
; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; CHECK-SSE1-NEXT:    xorb %cl, %al
; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %al
; CHECK-SSE1-NEXT:    xorb %cl, %al
; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %ecx
; CHECK-SSE1-NEXT:    xorb %sil, %cl
; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %cl
; CHECK-SSE1-NEXT:    xorb %sil, %cl
; CHECK-SSE1-NEXT:    movb %cl, 15(%rdx)
; CHECK-SSE1-NEXT:    movb %al, 14(%rdx)
; CHECK-SSE1-NEXT:    movb %bl, 13(%rdx)
; CHECK-SSE1-NEXT:    movb %bpl, 12(%rdx)
; CHECK-SSE1-NEXT:    movb %r14b, 11(%rdx)
; CHECK-SSE1-NEXT:    movb %r15b, 10(%rdx)
; CHECK-SSE1-NEXT:    movb %r12b, 9(%rdx)
; CHECK-SSE1-NEXT:    movb %r13b, 8(%rdx)
; CHECK-SSE1-NEXT:    movb %r11b, 7(%rdx)
; CHECK-SSE1-NEXT:    movb %r10b, 6(%rdx)
; CHECK-SSE1-NEXT:    movb %dil, 5(%rdx)
; CHECK-SSE1-NEXT:    movb %r9b, 4(%rdx)
; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; CHECK-SSE1-NEXT:    xorb %al, %r8b
; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %r8b
; CHECK-SSE1-NEXT:    xorb %al, %r8b
; CHECK-SSE1-NEXT:    movb %r8b, 3(%rdx)
; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; CHECK-SSE1-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload
; CHECK-SSE1-NEXT:    xorb %al, %cl
; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %cl
; CHECK-SSE1-NEXT:    xorb %al, %cl
; CHECK-SSE1-NEXT:    movb %cl, 2(%rdx)
; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; CHECK-SSE1-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload
; CHECK-SSE1-NEXT:    xorb %al, %cl
; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %cl
; CHECK-SSE1-NEXT:    xorb %al, %cl
; CHECK-SSE1-NEXT:    movb %cl, 1(%rdx)
; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; CHECK-SSE1-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload
; CHECK-SSE1-NEXT:    xorb %al, %cl
; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %cl
; CHECK-SSE1-NEXT:    xorb %al, %cl
; CHECK-SSE1-NEXT:    movb %cl, (%rdx)
; CHECK-SSE1-NEXT:    movq %rdx, %rax
; CHECK-SSE1-NEXT:    popq %rbx
; CHECK-SSE1-NEXT:    popq %r12
; CHECK-SSE1-NEXT:    popq %r13
; CHECK-SSE1-NEXT:    popq %r14
; CHECK-SSE1-NEXT:    popq %r15
; CHECK-SSE1-NEXT:    popq %rbp
; CHECK-SSE1-NEXT:    retq
;
; CHECK-SSE2-LABEL: in_v16i8:
; CHECK-SSE2:       # %bb.0:
; CHECK-SSE2-NEXT:    andps %xmm2, %xmm0
; CHECK-SSE2-NEXT:    andnps %xmm1, %xmm2
; CHECK-SSE2-NEXT:    orps %xmm2, %xmm0
; CHECK-SSE2-NEXT:    retq
;
; CHECK-XOP-LABEL: in_v16i8:
; CHECK-XOP:       # %bb.0:
; CHECK-XOP-NEXT:    vpcmov %xmm2, %xmm1, %xmm0, %xmm0
; CHECK-XOP-NEXT:    retq
  %n0 = xor <16 x i8> %x, %y
  %n1 = and <16 x i8> %n0, %mask
  %r = xor <16 x i8> %n1, %y
  ret <16 x i8> %r
}

define <8 x i16> @in_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %mask) nounwind {
; CHECK-BASELINE-LABEL: in_v8i16:
; CHECK-BASELINE:       # %bb.0:
; CHECK-BASELINE-NEXT:    pushq %rbx
; CHECK-BASELINE-NEXT:    movq %rdi, %rax
; CHECK-BASELINE-NEXT:    movl {{[0-9]+}}(%rsp), %edi
; CHECK-BASELINE-NEXT:    movl {{[0-9]+}}(%rsp), %r10d
; CHECK-BASELINE-NEXT:    movl {{[0-9]+}}(%rsp), %r11d
; CHECK-BASELINE-NEXT:    movl {{[0-9]+}}(%rsp), %ebx
; CHECK-BASELINE-NEXT:    xorl %ebx, %esi
; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %si
; CHECK-BASELINE-NEXT:    xorl %ebx, %esi
; CHECK-BASELINE-NEXT:    movl {{[0-9]+}}(%rsp), %ebx
; CHECK-BASELINE-NEXT:    xorl %ebx, %edx
; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %dx
; CHECK-BASELINE-NEXT:    xorl %ebx, %edx
; CHECK-BASELINE-NEXT:    movl {{[0-9]+}}(%rsp), %ebx
; CHECK-BASELINE-NEXT:    xorl %ebx, %ecx
; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %cx
; CHECK-BASELINE-NEXT:    xorl %ebx, %ecx
; CHECK-BASELINE-NEXT:    movl {{[0-9]+}}(%rsp), %ebx
; CHECK-BASELINE-NEXT:    xorl %ebx, %r8d
; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %r8w
; CHECK-BASELINE-NEXT:    xorl %ebx, %r8d
; CHECK-BASELINE-NEXT:    movl {{[0-9]+}}(%rsp), %ebx
; CHECK-BASELINE-NEXT:    xorl %ebx, %r9d
; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %r9w
; CHECK-BASELINE-NEXT:    xorl %ebx, %r9d
; CHECK-BASELINE-NEXT:    movzwl {{[0-9]+}}(%rsp), %ebx
; CHECK-BASELINE-NEXT:    xorw %r11w, %bx
; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %bx
; CHECK-BASELINE-NEXT:    xorl %r11d, %ebx
; CHECK-BASELINE-NEXT:    movzwl {{[0-9]+}}(%rsp), %r11d
; CHECK-BASELINE-NEXT:    xorw %r10w, %r11w
; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %r11w
; CHECK-BASELINE-NEXT:    xorl %r10d, %r11d
; CHECK-BASELINE-NEXT:    movzwl {{[0-9]+}}(%rsp), %r10d
; CHECK-BASELINE-NEXT:    xorw %di, %r10w
; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %r10w
; CHECK-BASELINE-NEXT:    xorl %edi, %r10d
; CHECK-BASELINE-NEXT:    movw %r10w, 14(%rax)
; CHECK-BASELINE-NEXT:    movw %r11w, 12(%rax)
; CHECK-BASELINE-NEXT:    movw %bx, 10(%rax)
; CHECK-BASELINE-NEXT:    movw %r9w, 8(%rax)
; CHECK-BASELINE-NEXT:    movw %r8w, 6(%rax)
; CHECK-BASELINE-NEXT:    movw %cx, 4(%rax)
; CHECK-BASELINE-NEXT:    movw %dx, 2(%rax)
; CHECK-BASELINE-NEXT:    movw %si, (%rax)
; CHECK-BASELINE-NEXT:    popq %rbx
; CHECK-BASELINE-NEXT:    retq
;
; CHECK-SSE1-LABEL: in_v8i16:
; CHECK-SSE1:       # %bb.0:
; CHECK-SSE1-NEXT:    pushq %rbx
; CHECK-SSE1-NEXT:    movq %rdi, %rax
; CHECK-SSE1-NEXT:    movl {{[0-9]+}}(%rsp), %edi
; CHECK-SSE1-NEXT:    movl {{[0-9]+}}(%rsp), %r10d
; CHECK-SSE1-NEXT:    movl {{[0-9]+}}(%rsp), %r11d
; CHECK-SSE1-NEXT:    movl {{[0-9]+}}(%rsp), %ebx
; CHECK-SSE1-NEXT:    xorl %ebx, %esi
; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %si
; CHECK-SSE1-NEXT:    xorl %ebx, %esi
; CHECK-SSE1-NEXT:    movl {{[0-9]+}}(%rsp), %ebx
; CHECK-SSE1-NEXT:    xorl %ebx, %edx
; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %dx
; CHECK-SSE1-NEXT:    xorl %ebx, %edx
; CHECK-SSE1-NEXT:    movl {{[0-9]+}}(%rsp), %ebx
; CHECK-SSE1-NEXT:    xorl %ebx, %ecx
; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %cx
; CHECK-SSE1-NEXT:    xorl %ebx, %ecx
; CHECK-SSE1-NEXT:    movl {{[0-9]+}}(%rsp), %ebx
; CHECK-SSE1-NEXT:    xorl %ebx, %r8d
; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %r8w
; CHECK-SSE1-NEXT:    xorl %ebx, %r8d
; CHECK-SSE1-NEXT:    movl {{[0-9]+}}(%rsp), %ebx
; CHECK-SSE1-NEXT:    xorl %ebx, %r9d
; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %r9w
; CHECK-SSE1-NEXT:    xorl %ebx, %r9d
; CHECK-SSE1-NEXT:    movzwl {{[0-9]+}}(%rsp), %ebx
; CHECK-SSE1-NEXT:    xorw %r11w, %bx
; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %bx
; CHECK-SSE1-NEXT:    xorl %r11d, %ebx
; CHECK-SSE1-NEXT:    movzwl {{[0-9]+}}(%rsp), %r11d
; CHECK-SSE1-NEXT:    xorw %r10w, %r11w
; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %r11w
; CHECK-SSE1-NEXT:    xorl %r10d, %r11d
; CHECK-SSE1-NEXT:    movzwl {{[0-9]+}}(%rsp), %r10d
; CHECK-SSE1-NEXT:    xorw %di, %r10w
; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %r10w
; CHECK-SSE1-NEXT:    xorl %edi, %r10d
; CHECK-SSE1-NEXT:    movw %r10w, 14(%rax)
; CHECK-SSE1-NEXT:    movw %r11w, 12(%rax)
; CHECK-SSE1-NEXT:    movw %bx, 10(%rax)
; CHECK-SSE1-NEXT:    movw %r9w, 8(%rax)
; CHECK-SSE1-NEXT:    movw %r8w, 6(%rax)
; CHECK-SSE1-NEXT:    movw %cx, 4(%rax)
; CHECK-SSE1-NEXT:    movw %dx, 2(%rax)
; CHECK-SSE1-NEXT:    movw %si, (%rax)
; CHECK-SSE1-NEXT:    popq %rbx
; CHECK-SSE1-NEXT:    retq
;
; CHECK-SSE2-LABEL: in_v8i16:
; CHECK-SSE2:       # %bb.0:
; CHECK-SSE2-NEXT:    andps %xmm2, %xmm0
; CHECK-SSE2-NEXT:    andnps %xmm1, %xmm2
; CHECK-SSE2-NEXT:    orps %xmm2, %xmm0
; CHECK-SSE2-NEXT:    retq
;
; CHECK-XOP-LABEL: in_v8i16:
; CHECK-XOP:       # %bb.0:
; CHECK-XOP-NEXT:    vpcmov %xmm2, %xmm1, %xmm0, %xmm0
; CHECK-XOP-NEXT:    retq
  %n0 = xor <8 x i16> %x, %y
  %n1 = and <8 x i16> %n0, %mask
  %r = xor <8 x i16> %n1, %y
  ret <8 x i16> %r
}

define <4 x i32> @in_v4i32(ptr%px, ptr%py, ptr%pmask) nounwind {
; CHECK-BASELINE-LABEL: in_v4i32:
; CHECK-BASELINE:       # %bb.0:
; CHECK-BASELINE-NEXT:    pushq %rbx
; CHECK-BASELINE-NEXT:    movq %rdi, %rax
; CHECK-BASELINE-NEXT:    movl 12(%rdx), %edi
; CHECK-BASELINE-NEXT:    movl 8(%rdx), %r8d
; CHECK-BASELINE-NEXT:    movl (%rdx), %r9d
; CHECK-BASELINE-NEXT:    movl 4(%rdx), %r10d
; CHECK-BASELINE-NEXT:    movl (%rsi), %edx
; CHECK-BASELINE-NEXT:    xorl %r9d, %edx
; CHECK-BASELINE-NEXT:    movl 4(%rsi), %r11d
; CHECK-BASELINE-NEXT:    xorl %r10d, %r11d
; CHECK-BASELINE-NEXT:    movl 8(%rsi), %ebx
; CHECK-BASELINE-NEXT:    xorl %r8d, %ebx
; CHECK-BASELINE-NEXT:    movl 12(%rsi), %esi
; CHECK-BASELINE-NEXT:    xorl %edi, %esi
; CHECK-BASELINE-NEXT:    andl 12(%rcx), %esi
; CHECK-BASELINE-NEXT:    andl 8(%rcx), %ebx
; CHECK-BASELINE-NEXT:    andl 4(%rcx), %r11d
; CHECK-BASELINE-NEXT:    andl (%rcx), %edx
; CHECK-BASELINE-NEXT:    xorl %r9d, %edx
; CHECK-BASELINE-NEXT:    xorl %r10d, %r11d
; CHECK-BASELINE-NEXT:    xorl %r8d, %ebx
; CHECK-BASELINE-NEXT:    xorl %edi, %esi
; CHECK-BASELINE-NEXT:    movl %esi, 12(%rax)
; CHECK-BASELINE-NEXT:    movl %ebx, 8(%rax)
; CHECK-BASELINE-NEXT:    movl %r11d, 4(%rax)
; CHECK-BASELINE-NEXT:    movl %edx, (%rax)
; CHECK-BASELINE-NEXT:    popq %rbx
; CHECK-BASELINE-NEXT:    retq
;
; CHECK-SSE1-LABEL: in_v4i32:
; CHECK-SSE1:       # %bb.0:
; CHECK-SSE1-NEXT:    movq %rdi, %rax
; CHECK-SSE1-NEXT:    movaps (%rcx), %xmm0
; CHECK-SSE1-NEXT:    movaps %xmm0, %xmm1
; CHECK-SSE1-NEXT:    andnps (%rdx), %xmm1
; CHECK-SSE1-NEXT:    andps (%rsi), %xmm0
; CHECK-SSE1-NEXT:    orps %xmm1, %xmm0
; CHECK-SSE1-NEXT:    movaps %xmm0, (%rdi)
; CHECK-SSE1-NEXT:    retq
;
; CHECK-SSE2-LABEL: in_v4i32:
; CHECK-SSE2:       # %bb.0:
; CHECK-SSE2-NEXT:    movaps (%rdx), %xmm0
; CHECK-SSE2-NEXT:    movaps %xmm0, %xmm1
; CHECK-SSE2-NEXT:    andnps (%rsi), %xmm1
; CHECK-SSE2-NEXT:    andps (%rdi), %xmm0
; CHECK-SSE2-NEXT:    orps %xmm1, %xmm0
; CHECK-SSE2-NEXT:    retq
;
; CHECK-XOP-LABEL: in_v4i32:
; CHECK-XOP:       # %bb.0:
; CHECK-XOP-NEXT:    vmovdqa (%rdi), %xmm0
; CHECK-XOP-NEXT:    vmovdqa (%rdx), %xmm1
; CHECK-XOP-NEXT:    vpcmov %xmm1, (%rsi), %xmm0, %xmm0
; CHECK-XOP-NEXT:    retq
  %x = load <4 x i32>, ptr%px, align 16
  %y = load <4 x i32>, ptr%py, align 16
  %mask = load <4 x i32>, ptr%pmask, align 16
  %n0 = xor <4 x i32> %x, %y
  %n1 = and <4 x i32> %n0, %mask
  %r = xor <4 x i32> %n1, %y
  ret <4 x i32> %r
}

define <2 x i64> @in_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %mask) nounwind {
; CHECK-BASELINE-LABEL: in_v2i64:
; CHECK-BASELINE:       # %bb.0:
; CHECK-BASELINE-NEXT:    movq %rdi, %rax
; CHECK-BASELINE-NEXT:    xorq %rdx, %rax
; CHECK-BASELINE-NEXT:    xorq %rcx, %rsi
; CHECK-BASELINE-NEXT:    andq %r9, %rsi
; CHECK-BASELINE-NEXT:    andq %r8, %rax
; CHECK-BASELINE-NEXT:    xorq %rdx, %rax
; CHECK-BASELINE-NEXT:    xorq %rcx, %rsi
; CHECK-BASELINE-NEXT:    movq %rsi, %rdx
; CHECK-BASELINE-NEXT:    retq
;
; CHECK-SSE1-LABEL: in_v2i64:
; CHECK-SSE1:       # %bb.0:
; CHECK-SSE1-NEXT:    movq %rdi, %rax
; CHECK-SSE1-NEXT:    xorq %rdx, %rax
; CHECK-SSE1-NEXT:    xorq %rcx, %rsi
; CHECK-SSE1-NEXT:    andq %r9, %rsi
; CHECK-SSE1-NEXT:    andq %r8, %rax
; CHECK-SSE1-NEXT:    xorq %rdx, %rax
; CHECK-SSE1-NEXT:    xorq %rcx, %rsi
; CHECK-SSE1-NEXT:    movq %rsi, %rdx
; CHECK-SSE1-NEXT:    retq
;
; CHECK-SSE2-LABEL: in_v2i64:
; CHECK-SSE2:       # %bb.0:
; CHECK-SSE2-NEXT:    andps %xmm2, %xmm0
; CHECK-SSE2-NEXT:    andnps %xmm1, %xmm2
; CHECK-SSE2-NEXT:    orps %xmm2, %xmm0
; CHECK-SSE2-NEXT:    retq
;
; CHECK-XOP-LABEL: in_v2i64:
; CHECK-XOP:       # %bb.0:
; CHECK-XOP-NEXT:    vpcmov %xmm2, %xmm1, %xmm0, %xmm0
; CHECK-XOP-NEXT:    retq
  %n0 = xor <2 x i64> %x, %y
  %n1 = and <2 x i64> %n0, %mask
  %r = xor <2 x i64> %n1, %y
  ret <2 x i64> %r
}

; ============================================================================ ;
; 256-bit vector width
; ============================================================================ ;

define <32 x i8> @in_v32i8(ptr%px, ptr%py, ptr%pmask) nounwind {
; CHECK-BASELINE-LABEL: in_v32i8:
; CHECK-BASELINE:       # %bb.0:
; CHECK-BASELINE-NEXT:    pushq %rbp
; CHECK-BASELINE-NEXT:    pushq %r15
; CHECK-BASELINE-NEXT:    pushq %r14
; CHECK-BASELINE-NEXT:    pushq %r13
; CHECK-BASELINE-NEXT:    pushq %r12
; CHECK-BASELINE-NEXT:    pushq %rbx
; CHECK-BASELINE-NEXT:    movq %rcx, %r12
; CHECK-BASELINE-NEXT:    movq %rdx, %r15
; CHECK-BASELINE-NEXT:    movq %rsi, %r14
; CHECK-BASELINE-NEXT:    movq %rdi, %r13
; CHECK-BASELINE-NEXT:    movzbl 15(%rdx), %eax
; CHECK-BASELINE-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-BASELINE-NEXT:    movzbl 14(%rdx), %eax
; CHECK-BASELINE-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-BASELINE-NEXT:    movzbl 13(%rdx), %eax
; CHECK-BASELINE-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-BASELINE-NEXT:    movzbl 12(%rdx), %eax
; CHECK-BASELINE-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-BASELINE-NEXT:    movzbl 11(%rdx), %eax
; CHECK-BASELINE-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-BASELINE-NEXT:    movzbl 10(%rdx), %eax
; CHECK-BASELINE-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-BASELINE-NEXT:    movzbl 9(%rdx), %r8d
; CHECK-BASELINE-NEXT:    movzbl 8(%rdx), %r9d
; CHECK-BASELINE-NEXT:    movzbl 7(%rdx), %r10d
; CHECK-BASELINE-NEXT:    movzbl 6(%rdx), %ebp
; CHECK-BASELINE-NEXT:    movzbl 5(%rdx), %edi
; CHECK-BASELINE-NEXT:    movzbl 4(%rdx), %esi
; CHECK-BASELINE-NEXT:    movzbl 3(%rdx), %eax
; CHECK-BASELINE-NEXT:    movzbl 2(%rdx), %ecx
; CHECK-BASELINE-NEXT:    movzbl (%rdx), %r11d
; CHECK-BASELINE-NEXT:    movzbl 1(%rdx), %edx
; CHECK-BASELINE-NEXT:    movzbl (%r14), %ebx
; CHECK-BASELINE-NEXT:    xorb %r11b, %bl
; CHECK-BASELINE-NEXT:    andb (%r12), %bl
; CHECK-BASELINE-NEXT:    xorb %r11b, %bl
; CHECK-BASELINE-NEXT:    movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-BASELINE-NEXT:    movzbl 1(%r14), %r11d
; CHECK-BASELINE-NEXT:    xorb %dl, %r11b
; CHECK-BASELINE-NEXT:    andb 1(%r12), %r11b
; CHECK-BASELINE-NEXT:    xorb %dl, %r11b
; CHECK-BASELINE-NEXT:    movb %r11b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-BASELINE-NEXT:    movzbl 2(%r14), %edx
; CHECK-BASELINE-NEXT:    xorb %cl, %dl
; CHECK-BASELINE-NEXT:    andb 2(%r12), %dl
; CHECK-BASELINE-NEXT:    xorb %cl, %dl
; CHECK-BASELINE-NEXT:    movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-BASELINE-NEXT:    movzbl 3(%r14), %ecx
; CHECK-BASELINE-NEXT:    xorb %al, %cl
; CHECK-BASELINE-NEXT:    andb 3(%r12), %cl
; CHECK-BASELINE-NEXT:    xorb %al, %cl
; CHECK-BASELINE-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-BASELINE-NEXT:    movzbl 4(%r14), %eax
; CHECK-BASELINE-NEXT:    xorb %sil, %al
; CHECK-BASELINE-NEXT:    andb 4(%r12), %al
; CHECK-BASELINE-NEXT:    xorb %sil, %al
; CHECK-BASELINE-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-BASELINE-NEXT:    movzbl 5(%r14), %eax
; CHECK-BASELINE-NEXT:    xorb %dil, %al
; CHECK-BASELINE-NEXT:    andb 5(%r12), %al
; CHECK-BASELINE-NEXT:    xorb %dil, %al
; CHECK-BASELINE-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-BASELINE-NEXT:    movzbl 6(%r14), %eax
; CHECK-BASELINE-NEXT:    xorb %bpl, %al
; CHECK-BASELINE-NEXT:    andb 6(%r12), %al
; CHECK-BASELINE-NEXT:    xorb %bpl, %al
; CHECK-BASELINE-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-BASELINE-NEXT:    movzbl 7(%r14), %eax
; CHECK-BASELINE-NEXT:    xorb %r10b, %al
; CHECK-BASELINE-NEXT:    andb 7(%r12), %al
; CHECK-BASELINE-NEXT:    xorb %r10b, %al
; CHECK-BASELINE-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-BASELINE-NEXT:    movzbl 8(%r14), %eax
; CHECK-BASELINE-NEXT:    xorb %r9b, %al
; CHECK-BASELINE-NEXT:    andb 8(%r12), %al
; CHECK-BASELINE-NEXT:    xorb %r9b, %al
; CHECK-BASELINE-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-BASELINE-NEXT:    movzbl 9(%r14), %eax
; CHECK-BASELINE-NEXT:    xorb %r8b, %al
; CHECK-BASELINE-NEXT:    andb 9(%r12), %al
; CHECK-BASELINE-NEXT:    xorb %r8b, %al
; CHECK-BASELINE-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-BASELINE-NEXT:    movzbl 10(%r14), %ecx
; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; CHECK-BASELINE-NEXT:    xorb %al, %cl
; CHECK-BASELINE-NEXT:    andb 10(%r12), %cl
; CHECK-BASELINE-NEXT:    xorb %al, %cl
; CHECK-BASELINE-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-BASELINE-NEXT:    movzbl 11(%r14), %ecx
; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; CHECK-BASELINE-NEXT:    xorb %al, %cl
; CHECK-BASELINE-NEXT:    andb 11(%r12), %cl
; CHECK-BASELINE-NEXT:    xorb %al, %cl
; CHECK-BASELINE-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-BASELINE-NEXT:    movzbl 12(%r14), %ecx
; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; CHECK-BASELINE-NEXT:    xorb %al, %cl
; CHECK-BASELINE-NEXT:    andb 12(%r12), %cl
; CHECK-BASELINE-NEXT:    xorb %al, %cl
; CHECK-BASELINE-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-BASELINE-NEXT:    movzbl 13(%r14), %ecx
; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; CHECK-BASELINE-NEXT:    xorb %al, %cl
; CHECK-BASELINE-NEXT:    andb 13(%r12), %cl
; CHECK-BASELINE-NEXT:    xorb %al, %cl
; CHECK-BASELINE-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-BASELINE-NEXT:    movzbl 14(%r14), %ecx
; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; CHECK-BASELINE-NEXT:    xorb %al, %cl
; CHECK-BASELINE-NEXT:    andb 14(%r12), %cl
; CHECK-BASELINE-NEXT:    xorb %al, %cl
; CHECK-BASELINE-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-BASELINE-NEXT:    movzbl 15(%r14), %ecx
; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; CHECK-BASELINE-NEXT:    xorb %al, %cl
; CHECK-BASELINE-NEXT:    andb 15(%r12), %cl
; CHECK-BASELINE-NEXT:    xorb %al, %cl
; CHECK-BASELINE-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-BASELINE-NEXT:    movzbl 16(%r15), %eax
; CHECK-BASELINE-NEXT:    movzbl 16(%r14), %ecx
; CHECK-BASELINE-NEXT:    xorb %al, %cl
; CHECK-BASELINE-NEXT:    andb 16(%r12), %cl
; CHECK-BASELINE-NEXT:    xorb %al, %cl
; CHECK-BASELINE-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-BASELINE-NEXT:    movzbl 17(%r15), %eax
; CHECK-BASELINE-NEXT:    movzbl 17(%r14), %ecx
; CHECK-BASELINE-NEXT:    xorb %al, %cl
; CHECK-BASELINE-NEXT:    andb 17(%r12), %cl
; CHECK-BASELINE-NEXT:    xorb %al, %cl
; CHECK-BASELINE-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-BASELINE-NEXT:    movzbl 18(%r15), %eax
; CHECK-BASELINE-NEXT:    movzbl 18(%r14), %ecx
; CHECK-BASELINE-NEXT:    xorb %al, %cl
; CHECK-BASELINE-NEXT:    andb 18(%r12), %cl
; CHECK-BASELINE-NEXT:    xorb %al, %cl
; CHECK-BASELINE-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-BASELINE-NEXT:    movzbl 19(%r15), %eax
; CHECK-BASELINE-NEXT:    movzbl 19(%r14), %ecx
; CHECK-BASELINE-NEXT:    xorb %al, %cl
; CHECK-BASELINE-NEXT:    andb 19(%r12), %cl
; CHECK-BASELINE-NEXT:    xorb %al, %cl
; CHECK-BASELINE-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-BASELINE-NEXT:    movzbl 20(%r15), %eax
; CHECK-BASELINE-NEXT:    movzbl 20(%r14), %ecx
; CHECK-BASELINE-NEXT:    xorb %al, %cl
; CHECK-BASELINE-NEXT:    andb 20(%r12), %cl
; CHECK-BASELINE-NEXT:    xorb %al, %cl
; CHECK-BASELINE-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-BASELINE-NEXT:    movzbl 21(%r15), %eax
; CHECK-BASELINE-NEXT:    movzbl 21(%r14), %ebp
; CHECK-BASELINE-NEXT:    xorb %al, %bpl
; CHECK-BASELINE-NEXT:    andb 21(%r12), %bpl
; CHECK-BASELINE-NEXT:    xorb %al, %bpl
; CHECK-BASELINE-NEXT:    movzbl 22(%r15), %eax
; CHECK-BASELINE-NEXT:    movzbl 22(%r14), %ebx
; CHECK-BASELINE-NEXT:    xorb %al, %bl
; CHECK-BASELINE-NEXT:    andb 22(%r12), %bl
; CHECK-BASELINE-NEXT:    xorb %al, %bl
; CHECK-BASELINE-NEXT:    movzbl 23(%r15), %eax
; CHECK-BASELINE-NEXT:    movzbl 23(%r14), %r11d
; CHECK-BASELINE-NEXT:    xorb %al, %r11b
; CHECK-BASELINE-NEXT:    andb 23(%r12), %r11b
; CHECK-BASELINE-NEXT:    xorb %al, %r11b
; CHECK-BASELINE-NEXT:    movzbl 24(%r15), %eax
; CHECK-BASELINE-NEXT:    movzbl 24(%r14), %r9d
; CHECK-BASELINE-NEXT:    xorb %al, %r9b
; CHECK-BASELINE-NEXT:    andb 24(%r12), %r9b
; CHECK-BASELINE-NEXT:    xorb %al, %r9b
; CHECK-BASELINE-NEXT:    movzbl 25(%r15), %eax
; CHECK-BASELINE-NEXT:    movzbl 25(%r14), %r8d
; CHECK-BASELINE-NEXT:    xorb %al, %r8b
; CHECK-BASELINE-NEXT:    andb 25(%r12), %r8b
; CHECK-BASELINE-NEXT:    xorb %al, %r8b
; CHECK-BASELINE-NEXT:    movzbl 26(%r15), %eax
; CHECK-BASELINE-NEXT:    movzbl 26(%r14), %edi
; CHECK-BASELINE-NEXT:    xorb %al, %dil
; CHECK-BASELINE-NEXT:    andb 26(%r12), %dil
; CHECK-BASELINE-NEXT:    xorb %al, %dil
; CHECK-BASELINE-NEXT:    movzbl 27(%r15), %eax
; CHECK-BASELINE-NEXT:    movzbl 27(%r14), %esi
; CHECK-BASELINE-NEXT:    xorb %al, %sil
; CHECK-BASELINE-NEXT:    andb 27(%r12), %sil
; CHECK-BASELINE-NEXT:    xorb %al, %sil
; CHECK-BASELINE-NEXT:    movzbl 28(%r15), %eax
; CHECK-BASELINE-NEXT:    movzbl 28(%r14), %edx
; CHECK-BASELINE-NEXT:    xorb %al, %dl
; CHECK-BASELINE-NEXT:    andb 28(%r12), %dl
; CHECK-BASELINE-NEXT:    xorb %al, %dl
; CHECK-BASELINE-NEXT:    movzbl 29(%r15), %eax
; CHECK-BASELINE-NEXT:    movzbl 29(%r14), %ecx
; CHECK-BASELINE-NEXT:    xorb %al, %cl
; CHECK-BASELINE-NEXT:    andb 29(%r12), %cl
; CHECK-BASELINE-NEXT:    xorb %al, %cl
; CHECK-BASELINE-NEXT:    movzbl 30(%r15), %r10d
; CHECK-BASELINE-NEXT:    movzbl 30(%r14), %eax
; CHECK-BASELINE-NEXT:    xorb %r10b, %al
; CHECK-BASELINE-NEXT:    andb 30(%r12), %al
; CHECK-BASELINE-NEXT:    xorb %r10b, %al
; CHECK-BASELINE-NEXT:    movzbl 31(%r15), %r10d
; CHECK-BASELINE-NEXT:    movzbl 31(%r14), %r14d
; CHECK-BASELINE-NEXT:    xorb %r10b, %r14b
; CHECK-BASELINE-NEXT:    andb 31(%r12), %r14b
; CHECK-BASELINE-NEXT:    xorb %r10b, %r14b
; CHECK-BASELINE-NEXT:    movb %r14b, 31(%r13)
; CHECK-BASELINE-NEXT:    movb %al, 30(%r13)
; CHECK-BASELINE-NEXT:    movb %cl, 29(%r13)
; CHECK-BASELINE-NEXT:    movb %dl, 28(%r13)
; CHECK-BASELINE-NEXT:    movb %sil, 27(%r13)
; CHECK-BASELINE-NEXT:    movb %dil, 26(%r13)
; CHECK-BASELINE-NEXT:    movb %r8b, 25(%r13)
; CHECK-BASELINE-NEXT:    movb %r9b, 24(%r13)
; CHECK-BASELINE-NEXT:    movb %r11b, 23(%r13)
; CHECK-BASELINE-NEXT:    movb %bl, 22(%r13)
; CHECK-BASELINE-NEXT:    movb %bpl, 21(%r13)
; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; CHECK-BASELINE-NEXT:    movb %al, 20(%r13)
; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; CHECK-BASELINE-NEXT:    movb %al, 19(%r13)
; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; CHECK-BASELINE-NEXT:    movb %al, 18(%r13)
; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; CHECK-BASELINE-NEXT:    movb %al, 17(%r13)
; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; CHECK-BASELINE-NEXT:    movb %al, 16(%r13)
; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; CHECK-BASELINE-NEXT:    movb %al, 15(%r13)
; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; CHECK-BASELINE-NEXT:    movb %al, 14(%r13)
; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; CHECK-BASELINE-NEXT:    movb %al, 13(%r13)
; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; CHECK-BASELINE-NEXT:    movb %al, 12(%r13)
; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; CHECK-BASELINE-NEXT:    movb %al, 11(%r13)
; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; CHECK-BASELINE-NEXT:    movb %al, 10(%r13)
; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; CHECK-BASELINE-NEXT:    movb %al, 9(%r13)
; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; CHECK-BASELINE-NEXT:    movb %al, 8(%r13)
; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; CHECK-BASELINE-NEXT:    movb %al, 7(%r13)
; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; CHECK-BASELINE-NEXT:    movb %al, 6(%r13)
; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; CHECK-BASELINE-NEXT:    movb %al, 5(%r13)
; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; CHECK-BASELINE-NEXT:    movb %al, 4(%r13)
; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; CHECK-BASELINE-NEXT:    movb %al, 3(%r13)
; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; CHECK-BASELINE-NEXT:    movb %al, 2(%r13)
; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; CHECK-BASELINE-NEXT:    movb %al, 1(%r13)
; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; CHECK-BASELINE-NEXT:    movb %al, (%r13)
; CHECK-BASELINE-NEXT:    movq %r13, %rax
; CHECK-BASELINE-NEXT:    popq %rbx
; CHECK-BASELINE-NEXT:    popq %r12
; CHECK-BASELINE-NEXT:    popq %r13
; CHECK-BASELINE-NEXT:    popq %r14
; CHECK-BASELINE-NEXT:    popq %r15
; CHECK-BASELINE-NEXT:    popq %rbp
; CHECK-BASELINE-NEXT:    retq
;
; CHECK-SSE1-LABEL: in_v32i8:
; CHECK-SSE1:       # %bb.0:
; CHECK-SSE1-NEXT:    pushq %rbp
; CHECK-SSE1-NEXT:    pushq %r15
; CHECK-SSE1-NEXT:    pushq %r14
; CHECK-SSE1-NEXT:    pushq %r13
; CHECK-SSE1-NEXT:    pushq %r12
; CHECK-SSE1-NEXT:    pushq %rbx
; CHECK-SSE1-NEXT:    movq %rcx, %r12
; CHECK-SSE1-NEXT:    movq %rdx, %r15
; CHECK-SSE1-NEXT:    movq %rsi, %r14
; CHECK-SSE1-NEXT:    movq %rdi, %r13
; CHECK-SSE1-NEXT:    movzbl 15(%rdx), %eax
; CHECK-SSE1-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-SSE1-NEXT:    movzbl 14(%rdx), %eax
; CHECK-SSE1-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-SSE1-NEXT:    movzbl 13(%rdx), %eax
; CHECK-SSE1-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-SSE1-NEXT:    movzbl 12(%rdx), %eax
; CHECK-SSE1-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-SSE1-NEXT:    movzbl 11(%rdx), %eax
; CHECK-SSE1-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-SSE1-NEXT:    movzbl 10(%rdx), %eax
; CHECK-SSE1-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-SSE1-NEXT:    movzbl 9(%rdx), %r8d
; CHECK-SSE1-NEXT:    movzbl 8(%rdx), %r9d
; CHECK-SSE1-NEXT:    movzbl 7(%rdx), %r10d
; CHECK-SSE1-NEXT:    movzbl 6(%rdx), %ebp
; CHECK-SSE1-NEXT:    movzbl 5(%rdx), %edi
; CHECK-SSE1-NEXT:    movzbl 4(%rdx), %esi
; CHECK-SSE1-NEXT:    movzbl 3(%rdx), %eax
; CHECK-SSE1-NEXT:    movzbl 2(%rdx), %ecx
; CHECK-SSE1-NEXT:    movzbl (%rdx), %r11d
; CHECK-SSE1-NEXT:    movzbl 1(%rdx), %edx
; CHECK-SSE1-NEXT:    movzbl (%r14), %ebx
; CHECK-SSE1-NEXT:    xorb %r11b, %bl
; CHECK-SSE1-NEXT:    andb (%r12), %bl
; CHECK-SSE1-NEXT:    xorb %r11b, %bl
; CHECK-SSE1-NEXT:    movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-SSE1-NEXT:    movzbl 1(%r14), %r11d
; CHECK-SSE1-NEXT:    xorb %dl, %r11b
; CHECK-SSE1-NEXT:    andb 1(%r12), %r11b
; CHECK-SSE1-NEXT:    xorb %dl, %r11b
; CHECK-SSE1-NEXT:    movb %r11b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-SSE1-NEXT:    movzbl 2(%r14), %edx
; CHECK-SSE1-NEXT:    xorb %cl, %dl
; CHECK-SSE1-NEXT:    andb 2(%r12), %dl
; CHECK-SSE1-NEXT:    xorb %cl, %dl
; CHECK-SSE1-NEXT:    movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-SSE1-NEXT:    movzbl 3(%r14), %ecx
; CHECK-SSE1-NEXT:    xorb %al, %cl
; CHECK-SSE1-NEXT:    andb 3(%r12), %cl
; CHECK-SSE1-NEXT:    xorb %al, %cl
; CHECK-SSE1-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-SSE1-NEXT:    movzbl 4(%r14), %eax
; CHECK-SSE1-NEXT:    xorb %sil, %al
; CHECK-SSE1-NEXT:    andb 4(%r12), %al
; CHECK-SSE1-NEXT:    xorb %sil, %al
; CHECK-SSE1-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-SSE1-NEXT:    movzbl 5(%r14), %eax
; CHECK-SSE1-NEXT:    xorb %dil, %al
; CHECK-SSE1-NEXT:    andb 5(%r12), %al
; CHECK-SSE1-NEXT:    xorb %dil, %al
; CHECK-SSE1-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-SSE1-NEXT:    movzbl 6(%r14), %eax
; CHECK-SSE1-NEXT:    xorb %bpl, %al
; CHECK-SSE1-NEXT:    andb 6(%r12), %al
; CHECK-SSE1-NEXT:    xorb %bpl, %al
; CHECK-SSE1-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-SSE1-NEXT:    movzbl 7(%r14), %eax
; CHECK-SSE1-NEXT:    xorb %r10b, %al
; CHECK-SSE1-NEXT:    andb 7(%r12), %al
; CHECK-SSE1-NEXT:    xorb %r10b, %al
; CHECK-SSE1-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-SSE1-NEXT:    movzbl 8(%r14), %eax
; CHECK-SSE1-NEXT:    xorb %r9b, %al
; CHECK-SSE1-NEXT:    andb 8(%r12), %al
; CHECK-SSE1-NEXT:    xorb %r9b, %al
; CHECK-SSE1-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-SSE1-NEXT:    movzbl 9(%r14), %eax
; CHECK-SSE1-NEXT:    xorb %r8b, %al
; CHECK-SSE1-NEXT:    andb 9(%r12), %al
; CHECK-SSE1-NEXT:    xorb %r8b, %al
; CHECK-SSE1-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-SSE1-NEXT:    movzbl 10(%r14), %ecx
; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; CHECK-SSE1-NEXT:    xorb %al, %cl
; CHECK-SSE1-NEXT:    andb 10(%r12), %cl
; CHECK-SSE1-NEXT:    xorb %al, %cl
; CHECK-SSE1-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-SSE1-NEXT:    movzbl 11(%r14), %ecx
; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; CHECK-SSE1-NEXT:    xorb %al, %cl
; CHECK-SSE1-NEXT:    andb 11(%r12), %cl
; CHECK-SSE1-NEXT:    xorb %al, %cl
; CHECK-SSE1-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-SSE1-NEXT:    movzbl 12(%r14), %ecx
; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; CHECK-SSE1-NEXT:    xorb %al, %cl
; CHECK-SSE1-NEXT:    andb 12(%r12), %cl
; CHECK-SSE1-NEXT:    xorb %al, %cl
; CHECK-SSE1-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-SSE1-NEXT:    movzbl 13(%r14), %ecx
; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; CHECK-SSE1-NEXT:    xorb %al, %cl
; CHECK-SSE1-NEXT:    andb 13(%r12), %cl
; CHECK-SSE1-NEXT:    xorb %al, %cl
; CHECK-SSE1-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-SSE1-NEXT:    movzbl 14(%r14), %ecx
; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; CHECK-SSE1-NEXT:    xorb %al, %cl
; CHECK-SSE1-NEXT:    andb 14(%r12), %cl
; CHECK-SSE1-NEXT:    xorb %al, %cl
; CHECK-SSE1-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-SSE1-NEXT:    movzbl 15(%r14), %ecx
; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; CHECK-SSE1-NEXT:    xorb %al, %cl
; CHECK-SSE1-NEXT:    andb 15(%r12), %cl
; CHECK-SSE1-NEXT:    xorb %al, %cl
; CHECK-SSE1-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-SSE1-NEXT:    movzbl 16(%r15), %eax
; CHECK-SSE1-NEXT:    movzbl 16(%r14), %ecx
; CHECK-SSE1-NEXT:    xorb %al, %cl
; CHECK-SSE1-NEXT:    andb 16(%r12), %cl
; CHECK-SSE1-NEXT:    xorb %al, %cl
; CHECK-SSE1-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-SSE1-NEXT:    movzbl 17(%r15), %eax
; CHECK-SSE1-NEXT:    movzbl 17(%r14), %ecx
; CHECK-SSE1-NEXT:    xorb %al, %cl
; CHECK-SSE1-NEXT:    andb 17(%r12), %cl
; CHECK-SSE1-NEXT:    xorb %al, %cl
; CHECK-SSE1-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-SSE1-NEXT:    movzbl 18(%r15), %eax
; CHECK-SSE1-NEXT:    movzbl 18(%r14), %ecx
; CHECK-SSE1-NEXT:    xorb %al, %cl
; CHECK-SSE1-NEXT:    andb 18(%r12), %cl
; CHECK-SSE1-NEXT:    xorb %al, %cl
; CHECK-SSE1-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-SSE1-NEXT:    movzbl 19(%r15), %eax
; CHECK-SSE1-NEXT:    movzbl 19(%r14), %ecx
; CHECK-SSE1-NEXT:    xorb %al, %cl
; CHECK-SSE1-NEXT:    andb 19(%r12), %cl
; CHECK-SSE1-NEXT:    xorb %al, %cl
; CHECK-SSE1-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-SSE1-NEXT:    movzbl 20(%r15), %eax
; CHECK-SSE1-NEXT:    movzbl 20(%r14), %ecx
; CHECK-SSE1-NEXT:    xorb %al, %cl
; CHECK-SSE1-NEXT:    andb 20(%r12), %cl
; CHECK-SSE1-NEXT:    xorb %al, %cl
; CHECK-SSE1-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-SSE1-NEXT:    movzbl 21(%r15), %eax
; CHECK-SSE1-NEXT:    movzbl 21(%r14), %ebp
; CHECK-SSE1-NEXT:    xorb %al, %bpl
; CHECK-SSE1-NEXT:    andb 21(%r12), %bpl
; CHECK-SSE1-NEXT:    xorb %al, %bpl
; CHECK-SSE1-NEXT:    movzbl 22(%r15), %eax
; CHECK-SSE1-NEXT:    movzbl 22(%r14), %ebx
; CHECK-SSE1-NEXT:    xorb %al, %bl
; CHECK-SSE1-NEXT:    andb 22(%r12), %bl
; CHECK-SSE1-NEXT:    xorb %al, %bl
; CHECK-SSE1-NEXT:    movzbl 23(%r15), %eax
; CHECK-SSE1-NEXT:    movzbl 23(%r14), %r11d
; CHECK-SSE1-NEXT:    xorb %al, %r11b
; CHECK-SSE1-NEXT:    andb 23(%r12), %r11b
; CHECK-SSE1-NEXT:    xorb %al, %r11b
; CHECK-SSE1-NEXT:    movzbl 24(%r15), %eax
; CHECK-SSE1-NEXT:    movzbl 24(%r14), %r9d
; CHECK-SSE1-NEXT:    xorb %al, %r9b
; CHECK-SSE1-NEXT:    andb 24(%r12), %r9b
; CHECK-SSE1-NEXT:    xorb %al, %r9b
; CHECK-SSE1-NEXT:    movzbl 25(%r15), %eax
; CHECK-SSE1-NEXT:    movzbl 25(%r14), %r8d
; CHECK-SSE1-NEXT:    xorb %al, %r8b
; CHECK-SSE1-NEXT:    andb 25(%r12), %r8b
; CHECK-SSE1-NEXT:    xorb %al, %r8b
; CHECK-SSE1-NEXT:    movzbl 26(%r15), %eax
; CHECK-SSE1-NEXT:    movzbl 26(%r14), %edi
; CHECK-SSE1-NEXT:    xorb %al, %dil
; CHECK-SSE1-NEXT:    andb 26(%r12), %dil
; CHECK-SSE1-NEXT:    xorb %al, %dil
; CHECK-SSE1-NEXT:    movzbl 27(%r15), %eax
; CHECK-SSE1-NEXT:    movzbl 27(%r14), %esi
; CHECK-SSE1-NEXT:    xorb %al, %sil
; CHECK-SSE1-NEXT:    andb 27(%r12), %sil
; CHECK-SSE1-NEXT:    xorb %al, %sil
; CHECK-SSE1-NEXT:    movzbl 28(%r15), %eax
; CHECK-SSE1-NEXT:    movzbl 28(%r14), %edx
; CHECK-SSE1-NEXT:    xorb %al, %dl
; CHECK-SSE1-NEXT:    andb 28(%r12), %dl
; CHECK-SSE1-NEXT:    xorb %al, %dl
; CHECK-SSE1-NEXT:    movzbl 29(%r15), %eax
; CHECK-SSE1-NEXT:    movzbl 29(%r14), %ecx
; CHECK-SSE1-NEXT:    xorb %al, %cl
; CHECK-SSE1-NEXT:    andb 29(%r12), %cl
; CHECK-SSE1-NEXT:    xorb %al, %cl
; CHECK-SSE1-NEXT:    movzbl 30(%r15), %r10d
; CHECK-SSE1-NEXT:    movzbl 30(%r14), %eax
; CHECK-SSE1-NEXT:    xorb %r10b, %al
; CHECK-SSE1-NEXT:    andb 30(%r12), %al
; CHECK-SSE1-NEXT:    xorb %r10b, %al
; CHECK-SSE1-NEXT:    movzbl 31(%r15), %r10d
; CHECK-SSE1-NEXT:    movzbl 31(%r14), %r14d
; CHECK-SSE1-NEXT:    xorb %r10b, %r14b
; CHECK-SSE1-NEXT:    andb 31(%r12), %r14b
; CHECK-SSE1-NEXT:    xorb %r10b, %r14b
; CHECK-SSE1-NEXT:    movb %r14b, 31(%r13)
; CHECK-SSE1-NEXT:    movb %al, 30(%r13)
; CHECK-SSE1-NEXT:    movb %cl, 29(%r13)
; CHECK-SSE1-NEXT:    movb %dl, 28(%r13)
; CHECK-SSE1-NEXT:    movb %sil, 27(%r13)
; CHECK-SSE1-NEXT:    movb %dil, 26(%r13)
; CHECK-SSE1-NEXT:    movb %r8b, 25(%r13)
; CHECK-SSE1-NEXT:    movb %r9b, 24(%r13)
; CHECK-SSE1-NEXT:    movb %r11b, 23(%r13)
; CHECK-SSE1-NEXT:    movb %bl, 22(%r13)
; CHECK-SSE1-NEXT:    movb %bpl, 21(%r13)
; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; CHECK-SSE1-NEXT:    movb %al, 20(%r13)
; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; CHECK-SSE1-NEXT:    movb %al, 19(%r13)
; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; CHECK-SSE1-NEXT:    movb %al, 18(%r13)
; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; CHECK-SSE1-NEXT:    movb %al, 17(%r13)
; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; CHECK-SSE1-NEXT:    movb %al, 16(%r13)
; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; CHECK-SSE1-NEXT:    movb %al, 15(%r13)
; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; CHECK-SSE1-NEXT:    movb %al, 14(%r13)
; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; CHECK-SSE1-NEXT:    movb %al, 13(%r13)
; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; CHECK-SSE1-NEXT:    movb %al, 12(%r13)
; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; CHECK-SSE1-NEXT:    movb %al, 11(%r13)
; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; CHECK-SSE1-NEXT:    movb %al, 10(%r13)
; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; CHECK-SSE1-NEXT:    movb %al, 9(%r13)
; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; CHECK-SSE1-NEXT:    movb %al, 8(%r13)
; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; CHECK-SSE1-NEXT:    movb %al, 7(%r13)
; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; CHECK-SSE1-NEXT:    movb %al, 6(%r13)
; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; CHECK-SSE1-NEXT:    movb %al, 5(%r13)
; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; CHECK-SSE1-NEXT:    movb %al, 4(%r13)
; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; CHECK-SSE1-NEXT:    movb %al, 3(%r13)
; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; CHECK-SSE1-NEXT:    movb %al, 2(%r13)
; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; CHECK-SSE1-NEXT:    movb %al, 1(%r13)
; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; CHECK-SSE1-NEXT:    movb %al, (%r13)
; CHECK-SSE1-NEXT:    movq %r13, %rax
; CHECK-SSE1-NEXT:    popq %rbx
; CHECK-SSE1-NEXT:    popq %r12
; CHECK-SSE1-NEXT:    popq %r13
; CHECK-SSE1-NEXT:    popq %r14
; CHECK-SSE1-NEXT:    popq %r15
; CHECK-SSE1-NEXT:    popq %rbp
; CHECK-SSE1-NEXT:    retq
;
; CHECK-SSE2-LABEL: in_v32i8:
; CHECK-SSE2:       # %bb.0:
; CHECK-SSE2-NEXT:    movaps (%rdx), %xmm0
; CHECK-SSE2-NEXT:    movaps 16(%rdx), %xmm1
; CHECK-SSE2-NEXT:    movaps %xmm0, %xmm2
; CHECK-SSE2-NEXT:    andnps (%rsi), %xmm2
; CHECK-SSE2-NEXT:    andps (%rdi), %xmm0
; CHECK-SSE2-NEXT:    orps %xmm2, %xmm0
; CHECK-SSE2-NEXT:    movaps %xmm1, %xmm2
; CHECK-SSE2-NEXT:    andnps 16(%rsi), %xmm2
; CHECK-SSE2-NEXT:    andps 16(%rdi), %xmm1
; CHECK-SSE2-NEXT:    orps %xmm2, %xmm1
; CHECK-SSE2-NEXT:    retq
;
; CHECK-XOP-LABEL: in_v32i8:
; CHECK-XOP:       # %bb.0:
; CHECK-XOP-NEXT:    vmovdqa (%rdi), %ymm0
; CHECK-XOP-NEXT:    vmovdqa (%rdx), %ymm1
; CHECK-XOP-NEXT:    vpcmov %ymm1, (%rsi), %ymm0, %ymm0
; CHECK-XOP-NEXT:    retq
  %x = load <32 x i8>, ptr%px, align 32
  %y = load <32 x i8>, ptr%py, align 32
  %mask = load <32 x i8>, ptr%pmask, align 32
  %n0 = xor <32 x i8> %x, %y
  %n1 = and <32 x i8> %n0, %mask
  %r = xor <32 x i8> %n1, %y
  ret <32 x i8> %r
}

define <16 x i16> @in_v16i16(ptr%px, ptr%py, ptr%pmask) nounwind {
; CHECK-BASELINE-LABEL: in_v16i16:
; CHECK-BASELINE:       # %bb.0:
; CHECK-BASELINE-NEXT:    pushq %rbp
; CHECK-BASELINE-NEXT:    pushq %r15
; CHECK-BASELINE-NEXT:    pushq %r14
; CHECK-BASELINE-NEXT:    pushq %r13
; CHECK-BASELINE-NEXT:    pushq %r12
; CHECK-BASELINE-NEXT:    pushq %rbx
; CHECK-BASELINE-NEXT:    movq %rcx, %r9
; CHECK-BASELINE-NEXT:    movq %rdi, %r10
; CHECK-BASELINE-NEXT:    movzwl 30(%rdx), %edi
; CHECK-BASELINE-NEXT:    movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-BASELINE-NEXT:    movl 28(%rdx), %edi
; CHECK-BASELINE-NEXT:    movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-BASELINE-NEXT:    movzwl 26(%rdx), %edi
; CHECK-BASELINE-NEXT:    movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-BASELINE-NEXT:    movl 24(%rdx), %eax
; CHECK-BASELINE-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-BASELINE-NEXT:    movzwl 22(%rdx), %eax
; CHECK-BASELINE-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-BASELINE-NEXT:    movl 20(%rdx), %r8d
; CHECK-BASELINE-NEXT:    movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-BASELINE-NEXT:    movzwl 18(%rdx), %r11d
; CHECK-BASELINE-NEXT:    movl %r11d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-BASELINE-NEXT:    movl 16(%rdx), %ebx
; CHECK-BASELINE-NEXT:    movl %ebx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-BASELINE-NEXT:    movzwl 14(%rdx), %ebp
; CHECK-BASELINE-NEXT:    movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-BASELINE-NEXT:    movl 12(%rdx), %r14d
; CHECK-BASELINE-NEXT:    movl %r14d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-BASELINE-NEXT:    movzwl 10(%rdx), %r15d
; CHECK-BASELINE-NEXT:    movl %r15d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-BASELINE-NEXT:    movl 8(%rdx), %r12d
; CHECK-BASELINE-NEXT:    movl %r12d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-BASELINE-NEXT:    movzwl 6(%rdx), %r13d
; CHECK-BASELINE-NEXT:    movl %r13d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-BASELINE-NEXT:    movl (%rdx), %ecx
; CHECK-BASELINE-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-BASELINE-NEXT:    movl 4(%rdx), %edi
; CHECK-BASELINE-NEXT:    movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-BASELINE-NEXT:    movzwl 2(%rdx), %eax
; CHECK-BASELINE-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-BASELINE-NEXT:    movzwl (%rsi), %edx
; CHECK-BASELINE-NEXT:    xorw %cx, %dx
; CHECK-BASELINE-NEXT:    movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-BASELINE-NEXT:    movzwl 2(%rsi), %ecx
; CHECK-BASELINE-NEXT:    xorw %ax, %cx
; CHECK-BASELINE-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-BASELINE-NEXT:    movzwl 4(%rsi), %eax
; CHECK-BASELINE-NEXT:    xorw %di, %ax
; CHECK-BASELINE-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-BASELINE-NEXT:    movzwl 6(%rsi), %ecx
; CHECK-BASELINE-NEXT:    xorw %r13w, %cx
; CHECK-BASELINE-NEXT:    movzwl 8(%rsi), %eax
; CHECK-BASELINE-NEXT:    xorw %r12w, %ax
; CHECK-BASELINE-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-BASELINE-NEXT:    movzwl 10(%rsi), %eax
; CHECK-BASELINE-NEXT:    xorw %r15w, %ax
; CHECK-BASELINE-NEXT:    movzwl 12(%rsi), %edx
; CHECK-BASELINE-NEXT:    xorw %r14w, %dx
; CHECK-BASELINE-NEXT:    movzwl 14(%rsi), %r13d
; CHECK-BASELINE-NEXT:    xorw %bp, %r13w
; CHECK-BASELINE-NEXT:    movzwl 16(%rsi), %r12d
; CHECK-BASELINE-NEXT:    xorw %bx, %r12w
; CHECK-BASELINE-NEXT:    movzwl 18(%rsi), %r15d
; CHECK-BASELINE-NEXT:    xorw %r11w, %r15w
; CHECK-BASELINE-NEXT:    movzwl 20(%rsi), %r14d
; CHECK-BASELINE-NEXT:    xorw %r8w, %r14w
; CHECK-BASELINE-NEXT:    movzwl 22(%rsi), %ebp
; CHECK-BASELINE-NEXT:    xorw {{[-0-9]+}}(%r{{[sb]}}p), %bp # 2-byte Folded Reload
; CHECK-BASELINE-NEXT:    movzwl 24(%rsi), %ebx
; CHECK-BASELINE-NEXT:    xorw {{[-0-9]+}}(%r{{[sb]}}p), %bx # 2-byte Folded Reload
; CHECK-BASELINE-NEXT:    movzwl 26(%rsi), %r11d
; CHECK-BASELINE-NEXT:    xorw {{[-0-9]+}}(%r{{[sb]}}p), %r11w # 2-byte Folded Reload
; CHECK-BASELINE-NEXT:    movzwl 28(%rsi), %edi
; CHECK-BASELINE-NEXT:    xorw {{[-0-9]+}}(%r{{[sb]}}p), %di # 2-byte Folded Reload
; CHECK-BASELINE-NEXT:    movzwl 30(%rsi), %esi
; CHECK-BASELINE-NEXT:    xorw {{[-0-9]+}}(%r{{[sb]}}p), %si # 2-byte Folded Reload
; CHECK-BASELINE-NEXT:    andw 30(%r9), %si
; CHECK-BASELINE-NEXT:    andw 28(%r9), %di
; CHECK-BASELINE-NEXT:    andw 26(%r9), %r11w
; CHECK-BASELINE-NEXT:    andw 24(%r9), %bx
; CHECK-BASELINE-NEXT:    andw 22(%r9), %bp
; CHECK-BASELINE-NEXT:    andw 20(%r9), %r14w
; CHECK-BASELINE-NEXT:    andw 18(%r9), %r15w
; CHECK-BASELINE-NEXT:    andw 16(%r9), %r12w
; CHECK-BASELINE-NEXT:    andw 14(%r9), %r13w
; CHECK-BASELINE-NEXT:    andw 12(%r9), %dx
; CHECK-BASELINE-NEXT:    movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-BASELINE-NEXT:    andw 10(%r9), %ax
; CHECK-BASELINE-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-BASELINE-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 4-byte Reload
; CHECK-BASELINE-NEXT:    andw 8(%r9), %dx
; CHECK-BASELINE-NEXT:    andw 6(%r9), %cx
; CHECK-BASELINE-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-BASELINE-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 4-byte Reload
; CHECK-BASELINE-NEXT:    andw 4(%r9), %r8w
; CHECK-BASELINE-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
; CHECK-BASELINE-NEXT:    andw 2(%r9), %ax
; CHECK-BASELINE-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload
; CHECK-BASELINE-NEXT:    andw (%r9), %cx
; CHECK-BASELINE-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Folded Reload
; CHECK-BASELINE-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-BASELINE-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Folded Reload
; CHECK-BASELINE-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-BASELINE-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 4-byte Folded Reload
; CHECK-BASELINE-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %r9d # 4-byte Reload
; CHECK-BASELINE-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %r9d # 4-byte Folded Reload
; CHECK-BASELINE-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 4-byte Folded Reload
; CHECK-BASELINE-NEXT:    movl %edx, %ecx
; CHECK-BASELINE-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 4-byte Reload
; CHECK-BASELINE-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 4-byte Folded Reload
; CHECK-BASELINE-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
; CHECK-BASELINE-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Folded Reload
; CHECK-BASELINE-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %r13d # 4-byte Folded Reload
; CHECK-BASELINE-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 4-byte Folded Reload
; CHECK-BASELINE-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %r15d # 4-byte Folded Reload
; CHECK-BASELINE-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %r14d # 4-byte Folded Reload
; CHECK-BASELINE-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %ebp # 4-byte Folded Reload
; CHECK-BASELINE-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 4-byte Folded Reload
; CHECK-BASELINE-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %r11d # 4-byte Folded Reload
; CHECK-BASELINE-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Folded Reload
; CHECK-BASELINE-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 4-byte Folded Reload
; CHECK-BASELINE-NEXT:    movw %si, 30(%r10)
; CHECK-BASELINE-NEXT:    movw %di, 28(%r10)
; CHECK-BASELINE-NEXT:    movw %r11w, 26(%r10)
; CHECK-BASELINE-NEXT:    movw %bx, 24(%r10)
; CHECK-BASELINE-NEXT:    movw %bp, 22(%r10)
; CHECK-BASELINE-NEXT:    movw %r14w, 20(%r10)
; CHECK-BASELINE-NEXT:    movw %r15w, 18(%r10)
; CHECK-BASELINE-NEXT:    movw %r12w, 16(%r10)
; CHECK-BASELINE-NEXT:    movw %r13w, 14(%r10)
; CHECK-BASELINE-NEXT:    movw %ax, 12(%r10)
; CHECK-BASELINE-NEXT:    movw %dx, 10(%r10)
; CHECK-BASELINE-NEXT:    movw %cx, 8(%r10)
; CHECK-BASELINE-NEXT:    movw %r9w, 6(%r10)
; CHECK-BASELINE-NEXT:    movw %r8w, 4(%r10)
; CHECK-BASELINE-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
; CHECK-BASELINE-NEXT:    movw %ax, 2(%r10)
; CHECK-BASELINE-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
; CHECK-BASELINE-NEXT:    movw %ax, (%r10)
; CHECK-BASELINE-NEXT:    movq %r10, %rax
; CHECK-BASELINE-NEXT:    popq %rbx
; CHECK-BASELINE-NEXT:    popq %r12
; CHECK-BASELINE-NEXT:    popq %r13
; CHECK-BASELINE-NEXT:    popq %r14
; CHECK-BASELINE-NEXT:    popq %r15
; CHECK-BASELINE-NEXT:    popq %rbp
; CHECK-BASELINE-NEXT:    retq
;
; CHECK-SSE1-LABEL: in_v16i16:
; CHECK-SSE1:       # %bb.0:
; CHECK-SSE1-NEXT:    pushq %rbp
; CHECK-SSE1-NEXT:    pushq %r15
; CHECK-SSE1-NEXT:    pushq %r14
; CHECK-SSE1-NEXT:    pushq %r13
; CHECK-SSE1-NEXT:    pushq %r12
; CHECK-SSE1-NEXT:    pushq %rbx
; CHECK-SSE1-NEXT:    movq %rcx, %r9
; CHECK-SSE1-NEXT:    movq %rdi, %r10
; CHECK-SSE1-NEXT:    movzwl 30(%rdx), %edi
; CHECK-SSE1-NEXT:    movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-SSE1-NEXT:    movl 28(%rdx), %edi
; CHECK-SSE1-NEXT:    movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-SSE1-NEXT:    movzwl 26(%rdx), %edi
; CHECK-SSE1-NEXT:    movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-SSE1-NEXT:    movl 24(%rdx), %eax
; CHECK-SSE1-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-SSE1-NEXT:    movzwl 22(%rdx), %eax
; CHECK-SSE1-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-SSE1-NEXT:    movl 20(%rdx), %r8d
; CHECK-SSE1-NEXT:    movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-SSE1-NEXT:    movzwl 18(%rdx), %r11d
; CHECK-SSE1-NEXT:    movl %r11d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-SSE1-NEXT:    movl 16(%rdx), %ebx
; CHECK-SSE1-NEXT:    movl %ebx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-SSE1-NEXT:    movzwl 14(%rdx), %ebp
; CHECK-SSE1-NEXT:    movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-SSE1-NEXT:    movl 12(%rdx), %r14d
; CHECK-SSE1-NEXT:    movl %r14d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-SSE1-NEXT:    movzwl 10(%rdx), %r15d
; CHECK-SSE1-NEXT:    movl %r15d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-SSE1-NEXT:    movl 8(%rdx), %r12d
; CHECK-SSE1-NEXT:    movl %r12d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-SSE1-NEXT:    movzwl 6(%rdx), %r13d
; CHECK-SSE1-NEXT:    movl %r13d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-SSE1-NEXT:    movl (%rdx), %ecx
; CHECK-SSE1-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-SSE1-NEXT:    movl 4(%rdx), %edi
; CHECK-SSE1-NEXT:    movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-SSE1-NEXT:    movzwl 2(%rdx), %eax
; CHECK-SSE1-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-SSE1-NEXT:    movzwl (%rsi), %edx
; CHECK-SSE1-NEXT:    xorw %cx, %dx
; CHECK-SSE1-NEXT:    movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-SSE1-NEXT:    movzwl 2(%rsi), %ecx
; CHECK-SSE1-NEXT:    xorw %ax, %cx
; CHECK-SSE1-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-SSE1-NEXT:    movzwl 4(%rsi), %eax
; CHECK-SSE1-NEXT:    xorw %di, %ax
; CHECK-SSE1-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-SSE1-NEXT:    movzwl 6(%rsi), %ecx
; CHECK-SSE1-NEXT:    xorw %r13w, %cx
; CHECK-SSE1-NEXT:    movzwl 8(%rsi), %eax
; CHECK-SSE1-NEXT:    xorw %r12w, %ax
; CHECK-SSE1-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-SSE1-NEXT:    movzwl 10(%rsi), %eax
; CHECK-SSE1-NEXT:    xorw %r15w, %ax
; CHECK-SSE1-NEXT:    movzwl 12(%rsi), %edx
; CHECK-SSE1-NEXT:    xorw %r14w, %dx
; CHECK-SSE1-NEXT:    movzwl 14(%rsi), %r13d
; CHECK-SSE1-NEXT:    xorw %bp, %r13w
; CHECK-SSE1-NEXT:    movzwl 16(%rsi), %r12d
; CHECK-SSE1-NEXT:    xorw %bx, %r12w
; CHECK-SSE1-NEXT:    movzwl 18(%rsi), %r15d
; CHECK-SSE1-NEXT:    xorw %r11w, %r15w
; CHECK-SSE1-NEXT:    movzwl 20(%rsi), %r14d
; CHECK-SSE1-NEXT:    xorw %r8w, %r14w
; CHECK-SSE1-NEXT:    movzwl 22(%rsi), %ebp
; CHECK-SSE1-NEXT:    xorw {{[-0-9]+}}(%r{{[sb]}}p), %bp # 2-byte Folded Reload
; CHECK-SSE1-NEXT:    movzwl 24(%rsi), %ebx
; CHECK-SSE1-NEXT:    xorw {{[-0-9]+}}(%r{{[sb]}}p), %bx # 2-byte Folded Reload
; CHECK-SSE1-NEXT:    movzwl 26(%rsi), %r11d
; CHECK-SSE1-NEXT:    xorw {{[-0-9]+}}(%r{{[sb]}}p), %r11w # 2-byte Folded Reload
; CHECK-SSE1-NEXT:    movzwl 28(%rsi), %edi
; CHECK-SSE1-NEXT:    xorw {{[-0-9]+}}(%r{{[sb]}}p), %di # 2-byte Folded Reload
; CHECK-SSE1-NEXT:    movzwl 30(%rsi), %esi
; CHECK-SSE1-NEXT:    xorw {{[-0-9]+}}(%r{{[sb]}}p), %si # 2-byte Folded Reload
; CHECK-SSE1-NEXT:    andw 30(%r9), %si
; CHECK-SSE1-NEXT:    andw 28(%r9), %di
; CHECK-SSE1-NEXT:    andw 26(%r9), %r11w
; CHECK-SSE1-NEXT:    andw 24(%r9), %bx
; CHECK-SSE1-NEXT:    andw 22(%r9), %bp
; CHECK-SSE1-NEXT:    andw 20(%r9), %r14w
; CHECK-SSE1-NEXT:    andw 18(%r9), %r15w
; CHECK-SSE1-NEXT:    andw 16(%r9), %r12w
; CHECK-SSE1-NEXT:    andw 14(%r9), %r13w
; CHECK-SSE1-NEXT:    andw 12(%r9), %dx
; CHECK-SSE1-NEXT:    movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-SSE1-NEXT:    andw 10(%r9), %ax
; CHECK-SSE1-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-SSE1-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 4-byte Reload
; CHECK-SSE1-NEXT:    andw 8(%r9), %dx
; CHECK-SSE1-NEXT:    andw 6(%r9), %cx
; CHECK-SSE1-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-SSE1-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 4-byte Reload
; CHECK-SSE1-NEXT:    andw 4(%r9), %r8w
; CHECK-SSE1-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
; CHECK-SSE1-NEXT:    andw 2(%r9), %ax
; CHECK-SSE1-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload
; CHECK-SSE1-NEXT:    andw (%r9), %cx
; CHECK-SSE1-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Folded Reload
; CHECK-SSE1-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-SSE1-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Folded Reload
; CHECK-SSE1-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-SSE1-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 4-byte Folded Reload
; CHECK-SSE1-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %r9d # 4-byte Reload
; CHECK-SSE1-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %r9d # 4-byte Folded Reload
; CHECK-SSE1-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 4-byte Folded Reload
; CHECK-SSE1-NEXT:    movl %edx, %ecx
; CHECK-SSE1-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 4-byte Reload
; CHECK-SSE1-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 4-byte Folded Reload
; CHECK-SSE1-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
; CHECK-SSE1-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Folded Reload
; CHECK-SSE1-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %r13d # 4-byte Folded Reload
; CHECK-SSE1-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 4-byte Folded Reload
; CHECK-SSE1-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %r15d # 4-byte Folded Reload
; CHECK-SSE1-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %r14d # 4-byte Folded Reload
; CHECK-SSE1-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %ebp # 4-byte Folded Reload
; CHECK-SSE1-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 4-byte Folded Reload
; CHECK-SSE1-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %r11d # 4-byte Folded Reload
; CHECK-SSE1-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Folded Reload
; CHECK-SSE1-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 4-byte Folded Reload
; CHECK-SSE1-NEXT:    movw %si, 30(%r10)
; CHECK-SSE1-NEXT:    movw %di, 28(%r10)
; CHECK-SSE1-NEXT:    movw %r11w, 26(%r10)
; CHECK-SSE1-NEXT:    movw %bx, 24(%r10)
; CHECK-SSE1-NEXT:    movw %bp, 22(%r10)
; CHECK-SSE1-NEXT:    movw %r14w, 20(%r10)
; CHECK-SSE1-NEXT:    movw %r15w, 18(%r10)
; CHECK-SSE1-NEXT:    movw %r12w, 16(%r10)
; CHECK-SSE1-NEXT:    movw %r13w, 14(%r10)
; CHECK-SSE1-NEXT:    movw %ax, 12(%r10)
; CHECK-SSE1-NEXT:    movw %dx, 10(%r10)
; CHECK-SSE1-NEXT:    movw %cx, 8(%r10)
; CHECK-SSE1-NEXT:    movw %r9w, 6(%r10)
; CHECK-SSE1-NEXT:    movw %r8w, 4(%r10)
; CHECK-SSE1-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
; CHECK-SSE1-NEXT:    movw %ax, 2(%r10)
; CHECK-SSE1-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
; CHECK-SSE1-NEXT:    movw %ax, (%r10)
; CHECK-SSE1-NEXT:    movq %r10, %rax
; CHECK-SSE1-NEXT:    popq %rbx
; CHECK-SSE1-NEXT:    popq %r12
; CHECK-SSE1-NEXT:    popq %r13
; CHECK-SSE1-NEXT:    popq %r14
; CHECK-SSE1-NEXT:    popq %r15
; CHECK-SSE1-NEXT:    popq %rbp
; CHECK-SSE1-NEXT:    retq
;
; CHECK-SSE2-LABEL: in_v16i16:
; CHECK-SSE2:       # %bb.0:
; CHECK-SSE2-NEXT:    movaps (%rdx), %xmm0
; CHECK-SSE2-NEXT:    movaps 16(%rdx), %xmm1
; CHECK-SSE2-NEXT:    movaps %xmm0, %xmm2
; CHECK-SSE2-NEXT:    andnps (%rsi), %xmm2
; CHECK-SSE2-NEXT:    andps (%rdi), %xmm0
; CHECK-SSE2-NEXT:    orps %xmm2, %xmm0
; CHECK-SSE2-NEXT:    movaps %xmm1, %xmm2
; CHECK-SSE2-NEXT:    andnps 16(%rsi), %xmm2
; CHECK-SSE2-NEXT:    andps 16(%rdi), %xmm1
; CHECK-SSE2-NEXT:    orps %xmm2, %xmm1
; CHECK-SSE2-NEXT:    retq
;
; CHECK-XOP-LABEL: in_v16i16:
; CHECK-XOP:       # %bb.0:
; CHECK-XOP-NEXT:    vmovdqa (%rdi), %ymm0
; CHECK-XOP-NEXT:    vmovdqa (%rdx), %ymm1
; CHECK-XOP-NEXT:    vpcmov %ymm1, (%rsi), %ymm0, %ymm0
; CHECK-XOP-NEXT:    retq
  %x = load <16 x i16>, ptr %px, align 32
  %y = load <16 x i16>, ptr %py, align 32
  %mask = load <16 x i16>, ptr %pmask, align 32
  %n0 = xor <16 x i16> %x, %y
  %n1 = and <16 x i16> %n0, %mask
  %r = xor <16 x i16> %n1, %y
  ret <16 x i16> %r
}
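
; in_v8i32 checks the same merge written as ((x ^ y) & mask) ^ y, which is
; bitwise-equivalent to (x & mask) | (y & ~mask): with SSE2 it is lowered back
; to the andps/andnps/orps sequence and with XOP to a single vpcmov, while the
; scalar baselines keep the per-element xorl/andl/xorl chains.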

define <8 x i32> @in_v8i32(ptr %px, ptr %py, ptr %pmask) nounwind {
; CHECK-BASELINE-LABEL: in_v8i32:
; CHECK-BASELINE:       # %bb.0:
; CHECK-BASELINE-NEXT:    pushq %rbp
; CHECK-BASELINE-NEXT:    pushq %r15
; CHECK-BASELINE-NEXT:    pushq %r14
; CHECK-BASELINE-NEXT:    pushq %r13
; CHECK-BASELINE-NEXT:    pushq %r12
; CHECK-BASELINE-NEXT:    pushq %rbx
; CHECK-BASELINE-NEXT:    movl 28(%rdx), %ebp
; CHECK-BASELINE-NEXT:    movl 24(%rdx), %ebx
; CHECK-BASELINE-NEXT:    movl 20(%rdx), %r10d
; CHECK-BASELINE-NEXT:    movl 16(%rdx), %eax
; CHECK-BASELINE-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-BASELINE-NEXT:    movl 12(%rdx), %r12d
; CHECK-BASELINE-NEXT:    movl %r12d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-BASELINE-NEXT:    movl 8(%rdx), %r14d
; CHECK-BASELINE-NEXT:    movl %r14d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-BASELINE-NEXT:    movl (%rdx), %r15d
; CHECK-BASELINE-NEXT:    movl 4(%rdx), %r13d
; CHECK-BASELINE-NEXT:    movl (%rsi), %r8d
; CHECK-BASELINE-NEXT:    xorl %r15d, %r8d
; CHECK-BASELINE-NEXT:    movl 4(%rsi), %r9d
; CHECK-BASELINE-NEXT:    xorl %r13d, %r9d
; CHECK-BASELINE-NEXT:    movl 8(%rsi), %r11d
; CHECK-BASELINE-NEXT:    xorl %r14d, %r11d
; CHECK-BASELINE-NEXT:    movl 12(%rsi), %r14d
; CHECK-BASELINE-NEXT:    xorl %r12d, %r14d
; CHECK-BASELINE-NEXT:    movl 16(%rsi), %r12d
; CHECK-BASELINE-NEXT:    xorl %eax, %r12d
; CHECK-BASELINE-NEXT:    movl 20(%rsi), %edx
; CHECK-BASELINE-NEXT:    xorl %r10d, %edx
; CHECK-BASELINE-NEXT:    movl 24(%rsi), %eax
; CHECK-BASELINE-NEXT:    xorl %ebx, %eax
; CHECK-BASELINE-NEXT:    movl 28(%rsi), %esi
; CHECK-BASELINE-NEXT:    xorl %ebp, %esi
; CHECK-BASELINE-NEXT:    andl 28(%rcx), %esi
; CHECK-BASELINE-NEXT:    andl 24(%rcx), %eax
; CHECK-BASELINE-NEXT:    andl 20(%rcx), %edx
; CHECK-BASELINE-NEXT:    andl 16(%rcx), %r12d
; CHECK-BASELINE-NEXT:    andl 12(%rcx), %r14d
; CHECK-BASELINE-NEXT:    andl 8(%rcx), %r11d
; CHECK-BASELINE-NEXT:    andl 4(%rcx), %r9d
; CHECK-BASELINE-NEXT:    andl (%rcx), %r8d
; CHECK-BASELINE-NEXT:    xorl %r15d, %r8d
; CHECK-BASELINE-NEXT:    xorl %r13d, %r9d
; CHECK-BASELINE-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %r11d # 4-byte Folded Reload
; CHECK-BASELINE-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %r14d # 4-byte Folded Reload
; CHECK-BASELINE-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 4-byte Folded Reload
; CHECK-BASELINE-NEXT:    xorl %r10d, %edx
; CHECK-BASELINE-NEXT:    xorl %ebx, %eax
; CHECK-BASELINE-NEXT:    xorl %ebp, %esi
; CHECK-BASELINE-NEXT:    movl %esi, 28(%rdi)
; CHECK-BASELINE-NEXT:    movl %eax, 24(%rdi)
; CHECK-BASELINE-NEXT:    movl %edx, 20(%rdi)
; CHECK-BASELINE-NEXT:    movl %r12d, 16(%rdi)
; CHECK-BASELINE-NEXT:    movl %r14d, 12(%rdi)
; CHECK-BASELINE-NEXT:    movl %r11d, 8(%rdi)
; CHECK-BASELINE-NEXT:    movl %r9d, 4(%rdi)
; CHECK-BASELINE-NEXT:    movl %r8d, (%rdi)
; CHECK-BASELINE-NEXT:    movq %rdi, %rax
; CHECK-BASELINE-NEXT:    popq %rbx
; CHECK-BASELINE-NEXT:    popq %r12
; CHECK-BASELINE-NEXT:    popq %r13
; CHECK-BASELINE-NEXT:    popq %r14
; CHECK-BASELINE-NEXT:    popq %r15
; CHECK-BASELINE-NEXT:    popq %rbp
; CHECK-BASELINE-NEXT:    retq
;
; CHECK-SSE1-LABEL: in_v8i32:
; CHECK-SSE1:       # %bb.0:
; CHECK-SSE1-NEXT:    pushq %rbp
; CHECK-SSE1-NEXT:    pushq %r15
; CHECK-SSE1-NEXT:    pushq %r14
; CHECK-SSE1-NEXT:    pushq %r13
; CHECK-SSE1-NEXT:    pushq %r12
; CHECK-SSE1-NEXT:    pushq %rbx
; CHECK-SSE1-NEXT:    movl 28(%rdx), %ebp
; CHECK-SSE1-NEXT:    movl 24(%rdx), %ebx
; CHECK-SSE1-NEXT:    movl 20(%rdx), %r10d
; CHECK-SSE1-NEXT:    movl 16(%rdx), %eax
; CHECK-SSE1-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-SSE1-NEXT:    movl 12(%rdx), %r12d
; CHECK-SSE1-NEXT:    movl %r12d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-SSE1-NEXT:    movl 8(%rdx), %r14d
; CHECK-SSE1-NEXT:    movl %r14d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-SSE1-NEXT:    movl (%rdx), %r15d
; CHECK-SSE1-NEXT:    movl 4(%rdx), %r13d
; CHECK-SSE1-NEXT:    movl (%rsi), %r8d
; CHECK-SSE1-NEXT:    xorl %r15d, %r8d
; CHECK-SSE1-NEXT:    movl 4(%rsi), %r9d
; CHECK-SSE1-NEXT:    xorl %r13d, %r9d
; CHECK-SSE1-NEXT:    movl 8(%rsi), %r11d
; CHECK-SSE1-NEXT:    xorl %r14d, %r11d
; CHECK-SSE1-NEXT:    movl 12(%rsi), %r14d
; CHECK-SSE1-NEXT:    xorl %r12d, %r14d
; CHECK-SSE1-NEXT:    movl 16(%rsi), %r12d
; CHECK-SSE1-NEXT:    xorl %eax, %r12d
; CHECK-SSE1-NEXT:    movl 20(%rsi), %edx
; CHECK-SSE1-NEXT:    xorl %r10d, %edx
; CHECK-SSE1-NEXT:    movl 24(%rsi), %eax
; CHECK-SSE1-NEXT:    xorl %ebx, %eax
; CHECK-SSE1-NEXT:    movl 28(%rsi), %esi
; CHECK-SSE1-NEXT:    xorl %ebp, %esi
; CHECK-SSE1-NEXT:    andl 28(%rcx), %esi
; CHECK-SSE1-NEXT:    andl 24(%rcx), %eax
; CHECK-SSE1-NEXT:    andl 20(%rcx), %edx
; CHECK-SSE1-NEXT:    andl 16(%rcx), %r12d
; CHECK-SSE1-NEXT:    andl 12(%rcx), %r14d
; CHECK-SSE1-NEXT:    andl 8(%rcx), %r11d
; CHECK-SSE1-NEXT:    andl 4(%rcx), %r9d
; CHECK-SSE1-NEXT:    andl (%rcx), %r8d
; CHECK-SSE1-NEXT:    xorl %r15d, %r8d
; CHECK-SSE1-NEXT:    xorl %r13d, %r9d
; CHECK-SSE1-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %r11d # 4-byte Folded Reload
; CHECK-SSE1-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %r14d # 4-byte Folded Reload
; CHECK-SSE1-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 4-byte Folded Reload
; CHECK-SSE1-NEXT:    xorl %r10d, %edx
; CHECK-SSE1-NEXT:    xorl %ebx, %eax
; CHECK-SSE1-NEXT:    xorl %ebp, %esi
; CHECK-SSE1-NEXT:    movl %esi, 28(%rdi)
; CHECK-SSE1-NEXT:    movl %eax, 24(%rdi)
; CHECK-SSE1-NEXT:    movl %edx, 20(%rdi)
; CHECK-SSE1-NEXT:    movl %r12d, 16(%rdi)
; CHECK-SSE1-NEXT:    movl %r14d, 12(%rdi)
; CHECK-SSE1-NEXT:    movl %r11d, 8(%rdi)
; CHECK-SSE1-NEXT:    movl %r9d, 4(%rdi)
; CHECK-SSE1-NEXT:    movl %r8d, (%rdi)
; CHECK-SSE1-NEXT:    movq %rdi, %rax
; CHECK-SSE1-NEXT:    popq %rbx
; CHECK-SSE1-NEXT:    popq %r12
; CHECK-SSE1-NEXT:    popq %r13
; CHECK-SSE1-NEXT:    popq %r14
; CHECK-SSE1-NEXT:    popq %r15
; CHECK-SSE1-NEXT:    popq %rbp
; CHECK-SSE1-NEXT:    retq
;
; CHECK-SSE2-LABEL: in_v8i32:
; CHECK-SSE2:       # %bb.0:
; CHECK-SSE2-NEXT:    movaps (%rdx), %xmm0
; CHECK-SSE2-NEXT:    movaps 16(%rdx), %xmm1
; CHECK-SSE2-NEXT:    movaps %xmm0, %xmm2
; CHECK-SSE2-NEXT:    andnps (%rsi), %xmm2
; CHECK-SSE2-NEXT:    andps (%rdi), %xmm0
; CHECK-SSE2-NEXT:    orps %xmm2, %xmm0
; CHECK-SSE2-NEXT:    movaps %xmm1, %xmm2
; CHECK-SSE2-NEXT:    andnps 16(%rsi), %xmm2
; CHECK-SSE2-NEXT:    andps 16(%rdi), %xmm1
; CHECK-SSE2-NEXT:    orps %xmm2, %xmm1
; CHECK-SSE2-NEXT:    retq
;
; CHECK-XOP-LABEL: in_v8i32:
; CHECK-XOP:       # %bb.0:
; CHECK-XOP-NEXT:    vmovdqa (%rdi), %ymm0
; CHECK-XOP-NEXT:    vmovdqa (%rdx), %ymm1
; CHECK-XOP-NEXT:    vpcmov %ymm1, (%rsi), %ymm0, %ymm0
; CHECK-XOP-NEXT:    retq
  %x = load <8 x i32>, ptr %px, align 32
  %y = load <8 x i32>, ptr %py, align 32
  %mask = load <8 x i32>, ptr %pmask, align 32
  %n0 = xor <8 x i32> %x, %y
  %n1 = and <8 x i32> %n0, %mask
  %r = xor <8 x i32> %n1, %y
  ret <8 x i32> %r
}
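
; in_v4i64 repeats the ((x ^ y) & mask) ^ y merge at the widest element type;
; with 64-bit scalars the baselines need only one xorq/andq/xorq chain per lane.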

define <4 x i64> @in_v4i64(ptr %px, ptr %py, ptr %pmask) nounwind {
; CHECK-BASELINE-LABEL: in_v4i64:
; CHECK-BASELINE:       # %bb.0:
; CHECK-BASELINE-NEXT:    pushq %rbx
; CHECK-BASELINE-NEXT:    movq %rdi, %rax
; CHECK-BASELINE-NEXT:    movq 24(%rdx), %rdi
; CHECK-BASELINE-NEXT:    movq 16(%rdx), %r8
; CHECK-BASELINE-NEXT:    movq (%rdx), %r9
; CHECK-BASELINE-NEXT:    movq 8(%rdx), %r10
; CHECK-BASELINE-NEXT:    movq (%rsi), %rdx
; CHECK-BASELINE-NEXT:    xorq %r9, %rdx
; CHECK-BASELINE-NEXT:    movq 8(%rsi), %r11
; CHECK-BASELINE-NEXT:    xorq %r10, %r11
; CHECK-BASELINE-NEXT:    movq 16(%rsi), %rbx
; CHECK-BASELINE-NEXT:    xorq %r8, %rbx
; CHECK-BASELINE-NEXT:    movq 24(%rsi), %rsi
; CHECK-BASELINE-NEXT:    xorq %rdi, %rsi
; CHECK-BASELINE-NEXT:    andq 24(%rcx), %rsi
; CHECK-BASELINE-NEXT:    andq 16(%rcx), %rbx
; CHECK-BASELINE-NEXT:    andq 8(%rcx), %r11
; CHECK-BASELINE-NEXT:    andq (%rcx), %rdx
; CHECK-BASELINE-NEXT:    xorq %r9, %rdx
; CHECK-BASELINE-NEXT:    xorq %r10, %r11
; CHECK-BASELINE-NEXT:    xorq %r8, %rbx
; CHECK-BASELINE-NEXT:    xorq %rdi, %rsi
; CHECK-BASELINE-NEXT:    movq %rsi, 24(%rax)
; CHECK-BASELINE-NEXT:    movq %rbx, 16(%rax)
; CHECK-BASELINE-NEXT:    movq %r11, 8(%rax)
; CHECK-BASELINE-NEXT:    movq %rdx, (%rax)
; CHECK-BASELINE-NEXT:    popq %rbx
; CHECK-BASELINE-NEXT:    retq
;
; CHECK-SSE1-LABEL: in_v4i64:
; CHECK-SSE1:       # %bb.0:
; CHECK-SSE1-NEXT:    pushq %rbx
; CHECK-SSE1-NEXT:    movq %rdi, %rax
; CHECK-SSE1-NEXT:    movq 24(%rdx), %rdi
; CHECK-SSE1-NEXT:    movq 16(%rdx), %r8
; CHECK-SSE1-NEXT:    movq (%rdx), %r9
; CHECK-SSE1-NEXT:    movq 8(%rdx), %r10
; CHECK-SSE1-NEXT:    movq (%rsi), %rdx
; CHECK-SSE1-NEXT:    xorq %r9, %rdx
; CHECK-SSE1-NEXT:    movq 8(%rsi), %r11
; CHECK-SSE1-NEXT:    xorq %r10, %r11
; CHECK-SSE1-NEXT:    movq 16(%rsi), %rbx
; CHECK-SSE1-NEXT:    xorq %r8, %rbx
; CHECK-SSE1-NEXT:    movq 24(%rsi), %rsi
; CHECK-SSE1-NEXT:    xorq %rdi, %rsi
; CHECK-SSE1-NEXT:    andq 24(%rcx), %rsi
; CHECK-SSE1-NEXT:    andq 16(%rcx), %rbx
; CHECK-SSE1-NEXT:    andq 8(%rcx), %r11
; CHECK-SSE1-NEXT:    andq (%rcx), %rdx
; CHECK-SSE1-NEXT:    xorq %r9, %rdx
; CHECK-SSE1-NEXT:    xorq %r10, %r11
; CHECK-SSE1-NEXT:    xorq %r8, %rbx
; CHECK-SSE1-NEXT:    xorq %rdi, %rsi
; CHECK-SSE1-NEXT:    movq %rsi, 24(%rax)
; CHECK-SSE1-NEXT:    movq %rbx, 16(%rax)
; CHECK-SSE1-NEXT:    movq %r11, 8(%rax)
; CHECK-SSE1-NEXT:    movq %rdx, (%rax)
; CHECK-SSE1-NEXT:    popq %rbx
; CHECK-SSE1-NEXT:    retq
;
; CHECK-SSE2-LABEL: in_v4i64:
; CHECK-SSE2:       # %bb.0:
; CHECK-SSE2-NEXT:    movaps (%rdx), %xmm0
; CHECK-SSE2-NEXT:    movaps 16(%rdx), %xmm1
; CHECK-SSE2-NEXT:    movaps %xmm0, %xmm2
; CHECK-SSE2-NEXT:    andnps (%rsi), %xmm2
; CHECK-SSE2-NEXT:    andps (%rdi), %xmm0
; CHECK-SSE2-NEXT:    orps %xmm2, %xmm0
; CHECK-SSE2-NEXT:    movaps %xmm1, %xmm2
; CHECK-SSE2-NEXT:    andnps 16(%rsi), %xmm2
; CHECK-SSE2-NEXT:    andps 16(%rdi), %xmm1
; CHECK-SSE2-NEXT:    orps %xmm2, %xmm1
; CHECK-SSE2-NEXT:    retq
;
; CHECK-XOP-LABEL: in_v4i64:
; CHECK-XOP:       # %bb.0:
; CHECK-XOP-NEXT:    vmovdqa (%rdi), %ymm0
; CHECK-XOP-NEXT:    vmovdqa (%rdx), %ymm1
; CHECK-XOP-NEXT:    vpcmov %ymm1, (%rsi), %ymm0, %ymm0
; CHECK-XOP-NEXT:    retq
  %x = load <4 x i64>, ptr %px, align 32
  %y = load <4 x i64>, ptr %py, align 32
  %mask = load <4 x i64>, ptr %pmask, align 32
  %n0 = xor <4 x i64> %x, %y
  %n1 = and <4 x i64> %n0, %mask
  %r = xor <4 x i64> %n1, %y
  ret <4 x i64> %r
}
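
; Note that the SSE2 and XOP lowerings for <4 x i64> are identical to the
; <8 x i32> and <16 x i16> ones above: the element type no longer matters once
; the merge is lowered to the bitwise andps/andnps/orps or vpcmov sequence.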