; llvm/test/CodeGen/X86/avx512-broadcast-unfold.ll

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=avx512vl | FileCheck %s

; Test that we can unfold constant pool loads when we're using avx512's
; ability to fold a broadcast load into an operation.
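;
; Illustrative sketch (not a CHECK; the register and constant-pool label names
; are made up): if the broadcast constant were left folded, the loop body would
; reload it from the constant pool every iteration via an embedded-broadcast
; memory operand, e.g.
;   vpaddd .LCPI0_0(%rip){1to16}, %zmm1, %zmm1
; Unfolding hoists the loop-invariant constant into a register instead, so the
; loop keeps only the per-element memory operand:
;   vpbroadcastd .LCPI0_0(%rip), %zmm0        # hoisted out of the loop
;   vpaddd 4096(%rdi,%rax), %zmm0, %zmm1      # inside the loop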

define void @bcast_unfold_add_v16i32(ptr %arg) {
; CHECK-LABEL: bcast_unfold_add_v16i32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    vpbroadcastd {{.*#+}} zmm0 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB0_1: # %bb2
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vpaddd 4096(%rdi,%rax), %zmm0, %zmm1
; CHECK-NEXT:    vmovdqu64 %zmm1, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $64, %rax
; CHECK-NEXT:    jne .LBB0_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb2

bb2:                                              ; preds = %bb2, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb2 ]
  %tmp3 = getelementptr inbounds i32, ptr %arg, i64 %tmp
  %tmp5 = load <16 x i32>, ptr %tmp3, align 4
  %tmp6 = add nsw <16 x i32> %tmp5, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
  store <16 x i32> %tmp6, ptr %tmp3, align 4
  %tmp8 = add i64 %tmp, 16
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb2

bb10:                                             ; preds = %bb2
  ret void
}

define void @bcast_unfold_add_v8i32(ptr %arg) {
; CHECK-LABEL: bcast_unfold_add_v8i32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    vpbroadcastd {{.*#+}} ymm0 = [2,2,2,2,2,2,2,2]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB1_1: # %bb2
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vpaddd 4096(%rdi,%rax), %ymm0, %ymm1
; CHECK-NEXT:    vmovdqu %ymm1, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $32, %rax
; CHECK-NEXT:    jne .LBB1_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb2

bb2:                                              ; preds = %bb2, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb2 ]
  %tmp3 = getelementptr inbounds i32, ptr %arg, i64 %tmp
  %tmp5 = load <8 x i32>, ptr %tmp3, align 4
  %tmp6 = add nsw <8 x i32> %tmp5, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
  store <8 x i32> %tmp6, ptr %tmp3, align 4
  %tmp8 = add i64 %tmp, 8
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb2

bb10:                                             ; preds = %bb2
  ret void
}

define void @bcast_unfold_add_v4i32(ptr %arg) {
; CHECK-LABEL: bcast_unfold_add_v4i32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    vpbroadcastd {{.*#+}} xmm0 = [2,2,2,2]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB2_1: # %bb2
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vpaddd 4096(%rdi,%rax), %xmm0, %xmm1
; CHECK-NEXT:    vmovdqu %xmm1, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $16, %rax
; CHECK-NEXT:    jne .LBB2_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    retq
bb:
  br label %bb2

bb2:                                              ; preds = %bb2, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb2 ]
  %tmp3 = getelementptr inbounds i32, ptr %arg, i64 %tmp
  %tmp5 = load <4 x i32>, ptr %tmp3, align 4
  %tmp6 = add nsw <4 x i32> %tmp5, <i32 2, i32 2, i32 2, i32 2>
  store <4 x i32> %tmp6, ptr %tmp3, align 4
  %tmp8 = add i64 %tmp, 4
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb2

bb10:                                             ; preds = %bb2
  ret void
}

define void @bcast_unfold_add_v8i64(ptr %arg) {
; CHECK-LABEL: bcast_unfold_add_v8i64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT:    vpbroadcastq {{.*#+}} zmm0 = [2,2,2,2,2,2,2,2]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB3_1: # %bb2
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vpaddq 8192(%rdi,%rax), %zmm0, %zmm1
; CHECK-NEXT:    vmovdqu64 %zmm1, 8192(%rdi,%rax)
; CHECK-NEXT:    addq $64, %rax
; CHECK-NEXT:    jne .LBB3_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb2

bb2:                                              ; preds = %bb2, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb2 ]
  %tmp3 = getelementptr inbounds i64, ptr %arg, i64 %tmp
  %tmp5 = load <8 x i64>, ptr %tmp3, align 8
  %tmp6 = add nsw <8 x i64> %tmp5, <i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2>
  store <8 x i64> %tmp6, ptr %tmp3, align 8
  %tmp8 = add i64 %tmp, 8
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb2

bb10:                                             ; preds = %bb2
  ret void
}

define void @bcast_unfold_add_v4i64(ptr %arg) {
; CHECK-LABEL: bcast_unfold_add_v4i64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT:    vpbroadcastq {{.*#+}} ymm0 = [2,2,2,2]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB4_1: # %bb2
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vpaddq 8192(%rdi,%rax), %ymm0, %ymm1
; CHECK-NEXT:    vmovdqu %ymm1, 8192(%rdi,%rax)
; CHECK-NEXT:    addq $32, %rax
; CHECK-NEXT:    jne .LBB4_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb2

bb2:                                              ; preds = %bb2, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb2 ]
  %tmp3 = getelementptr inbounds i64, ptr %arg, i64 %tmp
  %tmp5 = load <4 x i64>, ptr %tmp3, align 8
  %tmp6 = add nsw <4 x i64> %tmp5, <i64 2, i64 2, i64 2, i64 2>
  store <4 x i64> %tmp6, ptr %tmp3, align 8
  %tmp8 = add i64 %tmp, 4
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb2

bb10:                                             ; preds = %bb2
  ret void
}

define void @bcast_unfold_add_v2i64(ptr %arg) {
; CHECK-LABEL: bcast_unfold_add_v2i64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT:    vpbroadcastq {{.*#+}} xmm0 = [2,2]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB5_1: # %bb2
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vpaddq 8192(%rdi,%rax), %xmm0, %xmm1
; CHECK-NEXT:    vmovdqu %xmm1, 8192(%rdi,%rax)
; CHECK-NEXT:    addq $16, %rax
; CHECK-NEXT:    jne .LBB5_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    retq
bb:
  br label %bb2

bb2:                                              ; preds = %bb2, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb2 ]
  %tmp3 = getelementptr inbounds i64, ptr %arg, i64 %tmp
  %tmp5 = load <2 x i64>, ptr %tmp3, align 8
  %tmp6 = add nsw <2 x i64> %tmp5, <i64 2, i64 2>
  store <2 x i64> %tmp6, ptr %tmp3, align 8
  %tmp8 = add i64 %tmp, 2
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb2

bb10:                                             ; preds = %bb2
  ret void
}

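; The multiply-by-3 loops below don't end up with a broadcast constant at all:
; the multiplication is strength-reduced into two vector adds (x + 2x), so
; there is no constant-pool broadcast load to unfold in these cases.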
define void @bcast_unfold_mul_v16i32(ptr %arg) {
; CHECK-LABEL: bcast_unfold_mul_v16i32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB6_1: # %bb2
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovdqu64 4096(%rdi,%rax), %zmm0
; CHECK-NEXT:    vpaddd %zmm0, %zmm0, %zmm1
; CHECK-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    vmovdqu64 %zmm0, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $64, %rax
; CHECK-NEXT:    jne .LBB6_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb2

bb2:                                              ; preds = %bb2, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb2 ]
  %tmp3 = getelementptr inbounds i32, ptr %arg, i64 %tmp
  %tmp5 = load <16 x i32>, ptr %tmp3, align 4
  %tmp6 = mul nsw <16 x i32> %tmp5, <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
  store <16 x i32> %tmp6, ptr %tmp3, align 4
  %tmp8 = add i64 %tmp, 16
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb2

bb10:                                             ; preds = %bb2
  ret void
}

define void @bcast_unfold_mul_v8i32(ptr %arg) {
; CHECK-LABEL: bcast_unfold_mul_v8i32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB7_1: # %bb2
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovdqu 4096(%rdi,%rax), %ymm0
; CHECK-NEXT:    vpaddd %ymm0, %ymm0, %ymm1
; CHECK-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    vmovdqu %ymm0, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $32, %rax
; CHECK-NEXT:    jne .LBB7_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb2

bb2:                                              ; preds = %bb2, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb2 ]
  %tmp3 = getelementptr inbounds i32, ptr %arg, i64 %tmp
  %tmp5 = load <8 x i32>, ptr %tmp3, align 4
  %tmp6 = mul nsw <8 x i32> %tmp5, <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
  store <8 x i32> %tmp6, ptr %tmp3, align 4
  %tmp8 = add i64 %tmp, 8
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb2

bb10:                                             ; preds = %bb2
  ret void
}

define void @bcast_unfold_mul_v4i32(ptr %arg) {
; CHECK-LABEL: bcast_unfold_mul_v4i32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB8_1: # %bb2
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovdqu 4096(%rdi,%rax), %xmm0
; CHECK-NEXT:    vpaddd %xmm0, %xmm0, %xmm1
; CHECK-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vmovdqu %xmm0, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $16, %rax
; CHECK-NEXT:    jne .LBB8_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    retq
bb:
  br label %bb2

bb2:                                              ; preds = %bb2, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb2 ]
  %tmp3 = getelementptr inbounds i32, ptr %arg, i64 %tmp
  %tmp5 = load <4 x i32>, ptr %tmp3, align 4
  %tmp6 = mul nsw <4 x i32> %tmp5, <i32 3, i32 3, i32 3, i32 3>
  store <4 x i32> %tmp6, ptr %tmp3, align 4
  %tmp8 = add i64 %tmp, 4
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb2

bb10:                                             ; preds = %bb2
  ret void
}

define void @bcast_unfold_mul_v8i64(ptr %arg) {
; CHECK-LABEL: bcast_unfold_mul_v8i64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB9_1: # %bb2
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovdqu64 8192(%rdi,%rax), %zmm0
; CHECK-NEXT:    vpaddq %zmm0, %zmm0, %zmm1
; CHECK-NEXT:    vpaddq %zmm0, %zmm1, %zmm0
; CHECK-NEXT:    vmovdqu64 %zmm0, 8192(%rdi,%rax)
; CHECK-NEXT:    addq $64, %rax
; CHECK-NEXT:    jne .LBB9_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb2

bb2:                                              ; preds = %bb2, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb2 ]
  %tmp3 = getelementptr inbounds i64, ptr %arg, i64 %tmp
  %tmp5 = load <8 x i64>, ptr %tmp3, align 8
  %tmp6 = mul nsw <8 x i64> %tmp5, <i64 3, i64 3, i64 3, i64 3, i64 3, i64 3, i64 3, i64 3>
  store <8 x i64> %tmp6, ptr %tmp3, align 8
  %tmp8 = add i64 %tmp, 8
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb2

bb10:                                             ; preds = %bb2
  ret void
}

define void @bcast_unfold_mul_v4i64(ptr %arg) {
; CHECK-LABEL: bcast_unfold_mul_v4i64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB10_1: # %bb2
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovdqu 8192(%rdi,%rax), %ymm0
; CHECK-NEXT:    vpaddq %ymm0, %ymm0, %ymm1
; CHECK-NEXT:    vpaddq %ymm0, %ymm1, %ymm0
; CHECK-NEXT:    vmovdqu %ymm0, 8192(%rdi,%rax)
; CHECK-NEXT:    addq $32, %rax
; CHECK-NEXT:    jne .LBB10_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb2

bb2:                                              ; preds = %bb2, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb2 ]
  %tmp3 = getelementptr inbounds i64, ptr %arg, i64 %tmp
  %tmp5 = load <4 x i64>, ptr %tmp3, align 8
  %tmp6 = mul nsw <4 x i64> %tmp5, <i64 3, i64 3, i64 3, i64 3>
  store <4 x i64> %tmp6, ptr %tmp3, align 8
  %tmp8 = add i64 %tmp, 4
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb2

bb10:                                             ; preds = %bb2
  ret void
}

define void @bcast_unfold_mul_v2i64(ptr %arg) {
; CHECK-LABEL: bcast_unfold_mul_v2i64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB11_1: # %bb2
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovdqu 8192(%rdi,%rax), %xmm0
; CHECK-NEXT:    vpaddq %xmm0, %xmm0, %xmm1
; CHECK-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
; CHECK-NEXT:    vmovdqu %xmm0, 8192(%rdi,%rax)
; CHECK-NEXT:    addq $16, %rax
; CHECK-NEXT:    jne .LBB11_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    retq
bb:
  br label %bb2

bb2:                                              ; preds = %bb2, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb2 ]
  %tmp3 = getelementptr inbounds i64, ptr %arg, i64 %tmp
  %tmp5 = load <2 x i64>, ptr %tmp3, align 8
  %tmp6 = mul nsw <2 x i64> %tmp5, <i64 3, i64 3>
  store <2 x i64> %tmp6, ptr %tmp3, align 8
  %tmp8 = add i64 %tmp, 2
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb2

bb10:                                             ; preds = %bb2
  ret void
}

define void @bcast_unfold_or_v16i32(ptr %arg) {
; CHECK-LABEL: bcast_unfold_or_v16i32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    vpbroadcastd {{.*#+}} zmm0 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB12_1: # %bb2
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vpord 4096(%rdi,%rax), %zmm0, %zmm1
; CHECK-NEXT:    vmovdqu64 %zmm1, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $64, %rax
; CHECK-NEXT:    jne .LBB12_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb2

bb2:                                              ; preds = %bb2, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb2 ]
  %tmp3 = getelementptr inbounds i32, ptr %arg, i64 %tmp
  %tmp5 = load <16 x i32>, ptr %tmp3, align 4
  %tmp6 = or <16 x i32> %tmp5, <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
  store <16 x i32> %tmp6, ptr %tmp3, align 4
  %tmp8 = add i64 %tmp, 16
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb2

bb10:                                             ; preds = %bb2
  ret void
}

define void @bcast_unfold_or_v8i32(ptr %arg) {
; CHECK-LABEL: bcast_unfold_or_v8i32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    vbroadcastss {{.*#+}} ymm0 = [3,3,3,3,3,3,3,3]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB13_1: # %bb2
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vorps 4096(%rdi,%rax), %ymm0, %ymm1
; CHECK-NEXT:    vmovups %ymm1, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $32, %rax
; CHECK-NEXT:    jne .LBB13_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb2

bb2:                                              ; preds = %bb2, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb2 ]
  %tmp3 = getelementptr inbounds i32, ptr %arg, i64 %tmp
  %tmp5 = load <8 x i32>, ptr %tmp3, align 4
  %tmp6 = or <8 x i32> %tmp5, <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
  store <8 x i32> %tmp6, ptr %tmp3, align 4
  %tmp8 = add i64 %tmp, 8
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb2

bb10:                                             ; preds = %bb2
  ret void
}

define void @bcast_unfold_or_v4i32(ptr %arg) {
; CHECK-LABEL: bcast_unfold_or_v4i32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    vbroadcastss {{.*#+}} xmm0 = [3,3,3,3]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB14_1: # %bb2
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vorps 4096(%rdi,%rax), %xmm0, %xmm1
; CHECK-NEXT:    vmovups %xmm1, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $16, %rax
; CHECK-NEXT:    jne .LBB14_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    retq
bb:
  br label %bb2

bb2:                                              ; preds = %bb2, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb2 ]
  %tmp3 = getelementptr inbounds i32, ptr %arg, i64 %tmp
  %tmp5 = load <4 x i32>, ptr %tmp3, align 4
  %tmp6 = or <4 x i32> %tmp5, <i32 3, i32 3, i32 3, i32 3>
  store <4 x i32> %tmp6, ptr %tmp3, align 4
  %tmp8 = add i64 %tmp, 4
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb2

bb10:                                             ; preds = %bb2
  ret void
}

define void @bcast_unfold_or_v8i64(ptr %arg) {
; CHECK-LABEL: bcast_unfold_or_v8i64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT:    vpbroadcastq {{.*#+}} zmm0 = [3,3,3,3,3,3,3,3]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB15_1: # %bb2
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vporq 8192(%rdi,%rax), %zmm0, %zmm1
; CHECK-NEXT:    vmovdqu64 %zmm1, 8192(%rdi,%rax)
; CHECK-NEXT:    addq $64, %rax
; CHECK-NEXT:    jne .LBB15_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb2

bb2:                                              ; preds = %bb2, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb2 ]
  %tmp3 = getelementptr inbounds i64, ptr %arg, i64 %tmp
  %tmp5 = load <8 x i64>, ptr %tmp3, align 8
  %tmp6 = or <8 x i64> %tmp5, <i64 3, i64 3, i64 3, i64 3, i64 3, i64 3, i64 3, i64 3>
  store <8 x i64> %tmp6, ptr %tmp3, align 8
  %tmp8 = add i64 %tmp, 8
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb2

bb10:                                             ; preds = %bb2
  ret void
}

define void @bcast_unfold_or_v4i64(ptr %arg) {
; CHECK-LABEL: bcast_unfold_or_v4i64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT:    vbroadcastsd {{.*#+}} ymm0 = [3,3,3,3]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB16_1: # %bb2
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vorps 8192(%rdi,%rax), %ymm0, %ymm1
; CHECK-NEXT:    vmovups %ymm1, 8192(%rdi,%rax)
; CHECK-NEXT:    addq $32, %rax
; CHECK-NEXT:    jne .LBB16_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb2

bb2:                                              ; preds = %bb2, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb2 ]
  %tmp3 = getelementptr inbounds i64, ptr %arg, i64 %tmp
  %tmp5 = load <4 x i64>, ptr %tmp3, align 8
  %tmp6 = or <4 x i64> %tmp5, <i64 3, i64 3, i64 3, i64 3>
  store <4 x i64> %tmp6, ptr %tmp3, align 8
  %tmp8 = add i64 %tmp, 4
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb2

bb10:                                             ; preds = %bb2
  ret void
}

define void @bcast_unfold_or_v2i64(ptr %arg) {
; CHECK-LABEL: bcast_unfold_or_v2i64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT:    vmovddup {{.*#+}} xmm0 = [3,3]
; CHECK-NEXT:    # xmm0 = mem[0,0]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB17_1: # %bb2
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vorps 8192(%rdi,%rax), %xmm0, %xmm1
; CHECK-NEXT:    vmovups %xmm1, 8192(%rdi,%rax)
; CHECK-NEXT:    addq $16, %rax
; CHECK-NEXT:    jne .LBB17_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    retq
bb:
  br label %bb2

bb2:                                              ; preds = %bb2, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb2 ]
  %tmp3 = getelementptr inbounds i64, ptr %arg, i64 %tmp
  %tmp5 = load <2 x i64>, ptr %tmp3, align 8
  %tmp6 = or <2 x i64> %tmp5, <i64 3, i64 3>
  store <2 x i64> %tmp6, ptr %tmp3, align 8
  %tmp8 = add i64 %tmp, 2
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb2

bb10:                                             ; preds = %bb2
  ret void
}

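; fneg is lowered to an XOR with a broadcast sign-bit mask (-0.0), so the same
; unfolding applies: the mask broadcast is hoisted out of the loop.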
define void @bcast_unfold_fneg_v16f32(ptr %arg) {
; CHECK-LABEL: bcast_unfold_fneg_v16f32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    vpbroadcastd {{.*#+}} zmm0 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB18_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vpxord 4096(%rdi,%rax), %zmm0, %zmm1
; CHECK-NEXT:    vmovdqu64 %zmm1, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $64, %rax
; CHECK-NEXT:    jne .LBB18_1
; CHECK-NEXT:  # %bb.2: # %bb9
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
  %tmp2 = getelementptr inbounds float, ptr %arg, i64 %tmp
  %tmp4 = load <16 x float>, ptr %tmp2, align 4
  %tmp5 = fneg <16 x float> %tmp4
  store <16 x float> %tmp5, ptr %tmp2, align 4
  %tmp7 = add i64 %tmp, 16
  %tmp8 = icmp eq i64 %tmp7, 1024
  br i1 %tmp8, label %bb9, label %bb1

bb9:                                              ; preds = %bb1
  ret void
}

define void @bcast_unfold_fneg_v8f32(ptr %arg) {
; CHECK-LABEL: bcast_unfold_fneg_v8f32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    vbroadcastss {{.*#+}} ymm0 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB19_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vxorps 4096(%rdi,%rax), %ymm0, %ymm1
; CHECK-NEXT:    vmovups %ymm1, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $32, %rax
; CHECK-NEXT:    jne .LBB19_1
; CHECK-NEXT:  # %bb.2: # %bb9
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
  %tmp2 = getelementptr inbounds float, ptr %arg, i64 %tmp
  %tmp4 = load <8 x float>, ptr %tmp2, align 4
  %tmp5 = fneg <8 x float> %tmp4
  store <8 x float> %tmp5, ptr %tmp2, align 4
  %tmp7 = add i64 %tmp, 8
  %tmp8 = icmp eq i64 %tmp7, 1024
  br i1 %tmp8, label %bb9, label %bb1

bb9:                                              ; preds = %bb1
  ret void
}

define void @bcast_unfold_fneg_v4f32(ptr %arg) {
; CHECK-LABEL: bcast_unfold_fneg_v4f32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    vbroadcastss {{.*#+}} xmm0 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB20_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vxorps 4096(%rdi,%rax), %xmm0, %xmm1
; CHECK-NEXT:    vmovups %xmm1, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $16, %rax
; CHECK-NEXT:    jne .LBB20_1
; CHECK-NEXT:  # %bb.2: # %bb9
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
  %tmp2 = getelementptr inbounds float, ptr %arg, i64 %tmp
  %tmp4 = load <4 x float>, ptr %tmp2, align 4
  %tmp5 = fneg <4 x float> %tmp4
  store <4 x float> %tmp5, ptr %tmp2, align 4
  %tmp7 = add i64 %tmp, 4
  %tmp8 = icmp eq i64 %tmp7, 1024
  br i1 %tmp8, label %bb9, label %bb1

bb9:                                              ; preds = %bb1
  ret void
}

define void @bcast_unfold_fneg_v8f64(ptr %arg) {
; CHECK-LABEL: bcast_unfold_fneg_v8f64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT:    vpbroadcastq {{.*#+}} zmm0 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB21_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vpxorq 8192(%rdi,%rax), %zmm0, %zmm1
; CHECK-NEXT:    vmovdqu64 %zmm1, 8192(%rdi,%rax)
; CHECK-NEXT:    addq $64, %rax
; CHECK-NEXT:    jne .LBB21_1
; CHECK-NEXT:  # %bb.2: # %bb9
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
  %tmp2 = getelementptr inbounds double, ptr %arg, i64 %tmp
  %tmp4 = load <8 x double>, ptr %tmp2, align 8
  %tmp5 = fneg <8 x double> %tmp4
  store <8 x double> %tmp5, ptr %tmp2, align 8
  %tmp7 = add i64 %tmp, 8
  %tmp8 = icmp eq i64 %tmp7, 1024
  br i1 %tmp8, label %bb9, label %bb1

bb9:                                              ; preds = %bb1
  ret void
}

define void @bcast_unfold_fneg_v4f64(ptr %arg) {
; CHECK-LABEL: bcast_unfold_fneg_v4f64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT:    vbroadcastsd {{.*#+}} ymm0 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB22_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vxorps 8192(%rdi,%rax), %ymm0, %ymm1
; CHECK-NEXT:    vmovups %ymm1, 8192(%rdi,%rax)
; CHECK-NEXT:    addq $32, %rax
; CHECK-NEXT:    jne .LBB22_1
; CHECK-NEXT:  # %bb.2: # %bb9
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
  %tmp2 = getelementptr inbounds double, ptr %arg, i64 %tmp
  %tmp4 = load <4 x double>, ptr %tmp2, align 8
  %tmp5 = fneg <4 x double> %tmp4
  store <4 x double> %tmp5, ptr %tmp2, align 8
  %tmp7 = add i64 %tmp, 4
  %tmp8 = icmp eq i64 %tmp7, 1024
  br i1 %tmp8, label %bb9, label %bb1

bb9:                                              ; preds = %bb1
  ret void
}

define void @bcast_unfold_fneg_v2f64(ptr %arg) {
; CHECK-LABEL: bcast_unfold_fneg_v2f64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT:    vmovddup {{.*#+}} xmm0 = [-0.0E+0,-0.0E+0]
; CHECK-NEXT:    # xmm0 = mem[0,0]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB23_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vxorps 8192(%rdi,%rax), %xmm0, %xmm1
; CHECK-NEXT:    vmovups %xmm1, 8192(%rdi,%rax)
; CHECK-NEXT:    addq $16, %rax
; CHECK-NEXT:    jne .LBB23_1
; CHECK-NEXT:  # %bb.2: # %bb9
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
  %tmp2 = getelementptr inbounds double, ptr %arg, i64 %tmp
  %tmp4 = load <2 x double>, ptr %tmp2, align 8
  %tmp5 = fneg <2 x double> %tmp4
  store <2 x double> %tmp5, ptr %tmp2, align 8
  %tmp7 = add i64 %tmp, 2
  %tmp8 = icmp eq i64 %tmp7, 1024
  br i1 %tmp8, label %bb9, label %bb1

bb9:                                              ; preds = %bb1
  ret void
}

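; fabs is lowered to an AND with a broadcast mask that clears the sign bit
; (printed as NaN because the remaining bits are all ones); the mask broadcast
; is hoisted out of the loop just like the constants above.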
define void @bcast_unfold_fabs_v16f32(ptr %arg) {
; CHECK-LABEL: bcast_unfold_fabs_v16f32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    vpbroadcastd {{.*#+}} zmm0 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB24_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vpandd 4096(%rdi,%rax), %zmm0, %zmm1
; CHECK-NEXT:    vmovdqu64 %zmm1, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $64, %rax
; CHECK-NEXT:    jne .LBB24_1
; CHECK-NEXT:  # %bb.2: # %bb9
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
  %tmp2 = getelementptr inbounds float, ptr %arg, i64 %tmp
  %tmp4 = load <16 x float>, ptr %tmp2, align 4
  %tmp5 = call <16 x float> @llvm.fabs.v16f32(<16 x float> %tmp4)
  store <16 x float> %tmp5, ptr %tmp2, align 4
  %tmp7 = add i64 %tmp, 16
  %tmp8 = icmp eq i64 %tmp7, 1024
  br i1 %tmp8, label %bb9, label %bb1

bb9:                                              ; preds = %bb1
  ret void
}

; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare <16 x float> @llvm.fabs.v16f32(<16 x float>) #0

define void @bcast_unfold_fabs_v8f32(ptr %arg) {
; CHECK-LABEL: bcast_unfold_fabs_v8f32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    vbroadcastss {{.*#+}} ymm0 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB25_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vandps 4096(%rdi,%rax), %ymm0, %ymm1
; CHECK-NEXT:    vmovups %ymm1, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $32, %rax
; CHECK-NEXT:    jne .LBB25_1
; CHECK-NEXT:  # %bb.2: # %bb9
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
  %tmp2 = getelementptr inbounds float, ptr %arg, i64 %tmp
  %tmp4 = load <8 x float>, ptr %tmp2, align 4
  %tmp5 = call <8 x float> @llvm.fabs.v8f32(<8 x float> %tmp4)
  store <8 x float> %tmp5, ptr %tmp2, align 4
  %tmp7 = add i64 %tmp, 8
  %tmp8 = icmp eq i64 %tmp7, 1024
  br i1 %tmp8, label %bb9, label %bb1

bb9:                                              ; preds = %bb1
  ret void
}

; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare <8 x float> @llvm.fabs.v8f32(<8 x float>) #0

define void @bcast_unfold_fabs_v4f32(ptr %arg) {
; CHECK-LABEL: bcast_unfold_fabs_v4f32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    vbroadcastss {{.*#+}} xmm0 = [NaN,NaN,NaN,NaN]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB26_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vandps 4096(%rdi,%rax), %xmm0, %xmm1
; CHECK-NEXT:    vmovups %xmm1, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $16, %rax
; CHECK-NEXT:    jne .LBB26_1
; CHECK-NEXT:  # %bb.2: # %bb9
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
  %tmp2 = getelementptr inbounds float, ptr %arg, i64 %tmp
  %tmp4 = load <4 x float>, ptr %tmp2, align 4
  %tmp5 = call <4 x float> @llvm.fabs.v4f32(<4 x float> %tmp4)
  store <4 x float> %tmp5, ptr %tmp2, align 4
  %tmp7 = add i64 %tmp, 4
  %tmp8 = icmp eq i64 %tmp7, 1024
  br i1 %tmp8, label %bb9, label %bb1

bb9:                                              ; preds = %bb1
  ret void
}

; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare <4 x float> @llvm.fabs.v4f32(<4 x float>) #0

define void @bcast_unfold_fabs_v8f64(ptr %arg) {
; CHECK-LABEL: bcast_unfold_fabs_v8f64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT:    vpbroadcastq {{.*#+}} zmm0 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB27_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vpandq 8192(%rdi,%rax), %zmm0, %zmm1
; CHECK-NEXT:    vmovdqu64 %zmm1, 8192(%rdi,%rax)
; CHECK-NEXT:    addq $64, %rax
; CHECK-NEXT:    jne .LBB27_1
; CHECK-NEXT:  # %bb.2: # %bb9
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
  %tmp2 = getelementptr inbounds double, ptr %arg, i64 %tmp
  %tmp4 = load <8 x double>, ptr %tmp2, align 8
  %tmp5 = call <8 x double> @llvm.fabs.v8f64(<8 x double> %tmp4)
  store <8 x double> %tmp5, ptr %tmp2, align 8
  %tmp7 = add i64 %tmp, 8
  %tmp8 = icmp eq i64 %tmp7, 1024
  br i1 %tmp8, label %bb9, label %bb1

bb9:                                              ; preds = %bb1
  ret void
}

; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare <8 x double> @llvm.fabs.v8f64(<8 x double>) #0

define void @bcast_unfold_fabs_v4f64(ptr %arg) {
; CHECK-LABEL: bcast_unfold_fabs_v4f64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT:    vbroadcastsd {{.*#+}} ymm0 = [NaN,NaN,NaN,NaN]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB28_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vandps 8192(%rdi,%rax), %ymm0, %ymm1
; CHECK-NEXT:    vmovups %ymm1, 8192(%rdi,%rax)
; CHECK-NEXT:    addq $32, %rax
; CHECK-NEXT:    jne .LBB28_1
; CHECK-NEXT:  # %bb.2: # %bb9
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
  %tmp2 = getelementptr inbounds double, ptr %arg, i64 %tmp
  %tmp4 = load <4 x double>, ptr %tmp2, align 8
  %tmp5 = call <4 x double> @llvm.fabs.v4f64(<4 x double> %tmp4)
  store <4 x double> %tmp5, ptr %tmp2, align 8
  %tmp7 = add i64 %tmp, 4
  %tmp8 = icmp eq i64 %tmp7, 1024
  br i1 %tmp8, label %bb9, label %bb1

bb9:                                              ; preds = %bb1
  ret void
}

; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare <4 x double> @llvm.fabs.v4f64(<4 x double>) #0

define void @bcast_unfold_fabs_v2f64(ptr %arg) {
; CHECK-LABEL: bcast_unfold_fabs_v2f64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT:    vmovddup {{.*#+}} xmm0 = [NaN,NaN]
; CHECK-NEXT:    # xmm0 = mem[0,0]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB29_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vandps 8192(%rdi,%rax), %xmm0, %xmm1
; CHECK-NEXT:    vmovups %xmm1, 8192(%rdi,%rax)
; CHECK-NEXT:    addq $16, %rax
; CHECK-NEXT:    jne .LBB29_1
; CHECK-NEXT:  # %bb.2: # %bb9
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
  %tmp2 = getelementptr inbounds double, ptr %arg, i64 %tmp
  %tmp4 = load <2 x double>, ptr %tmp2, align 8
  %tmp5 = call <2 x double> @llvm.fabs.v2f64(<2 x double> %tmp4)
  store <2 x double> %tmp5, ptr %tmp2, align 8
  %tmp7 = add i64 %tmp, 2
  %tmp8 = icmp eq i64 %tmp7, 1024
  br i1 %tmp8, label %bb9, label %bb1

bb9:                                              ; preds = %bb1
  ret void
}

; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare <2 x double> @llvm.fabs.v2f64(<2 x double>) #0

define void @bcast_unfold_fadd_v16f32(ptr nocapture %arg) {
; CHECK-LABEL: bcast_unfold_fadd_v16f32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    vbroadcastss {{.*#+}} zmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB30_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vaddps 4096(%rdi,%rax), %zmm0, %zmm1
; CHECK-NEXT:    vmovups %zmm1, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $64, %rax
; CHECK-NEXT:    jne .LBB30_1
; CHECK-NEXT:  # %bb.2: # %bb9
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
  %tmp2 = getelementptr inbounds float, ptr %arg, i64 %tmp
  %tmp4 = load <16 x float>, ptr %tmp2, align 4
  %tmp5 = fadd <16 x float> %tmp4, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
  store <16 x float> %tmp5, ptr %tmp2, align 4
  %tmp7 = add i64 %tmp, 16
  %tmp8 = icmp eq i64 %tmp7, 1024
  br i1 %tmp8, label %bb9, label %bb1

bb9:                                              ; preds = %bb1
  ret void
}

define void @bcast_unfold_fadd_v8f32(ptr nocapture %arg) {
; CHECK-LABEL: bcast_unfold_fadd_v8f32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    vbroadcastss {{.*#+}} ymm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB31_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vaddps 4096(%rdi,%rax), %ymm0, %ymm1
; CHECK-NEXT:    vmovups %ymm1, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $32, %rax
; CHECK-NEXT:    jne .LBB31_1
; CHECK-NEXT:  # %bb.2: # %bb9
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
  %tmp2 = getelementptr inbounds float, ptr %arg, i64 %tmp
  %tmp4 = load <8 x float>, ptr %tmp2, align 4
  %tmp5 = fadd <8 x float> %tmp4, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
  store <8 x float> %tmp5, ptr %tmp2, align 4
  %tmp7 = add i64 %tmp, 8
  %tmp8 = icmp eq i64 %tmp7, 1024
  br i1 %tmp8, label %bb9, label %bb1

bb9:                                              ; preds = %bb1
  ret void
}

define void @bcast_unfold_fadd_v4f32(ptr nocapture %arg) {
; CHECK-LABEL: bcast_unfold_fadd_v4f32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    vbroadcastss {{.*#+}} xmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB32_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vaddps 4096(%rdi,%rax), %xmm0, %xmm1
; CHECK-NEXT:    vmovups %xmm1, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $16, %rax
; CHECK-NEXT:    jne .LBB32_1
; CHECK-NEXT:  # %bb.2: # %bb9
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
  %tmp2 = getelementptr inbounds float, ptr %arg, i64 %tmp
  %tmp4 = load <4 x float>, ptr %tmp2, align 4
  %tmp5 = fadd <4 x float> %tmp4, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
  store <4 x float> %tmp5, ptr %tmp2, align 4
  %tmp7 = add i64 %tmp, 4
  %tmp8 = icmp eq i64 %tmp7, 1024
  br i1 %tmp8, label %bb9, label %bb1

bb9:                                              ; preds = %bb1
  ret void
}

define void @bcast_unfold_fadd_v8f64(ptr nocapture %arg) {
; CHECK-LABEL: bcast_unfold_fadd_v8f64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT:    vbroadcastsd {{.*#+}} zmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB33_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vaddpd 8192(%rdi,%rax), %zmm0, %zmm1
; CHECK-NEXT:    vmovupd %zmm1, 8192(%rdi,%rax)
; CHECK-NEXT:    addq $64, %rax
; CHECK-NEXT:    jne .LBB33_1
; CHECK-NEXT:  # %bb.2: # %bb9
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
  %tmp2 = getelementptr inbounds double, ptr %arg, i64 %tmp
  %tmp4 = load <8 x double>, ptr %tmp2, align 8
  %tmp5 = fadd <8 x double> %tmp4, <double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00>
  store <8 x double> %tmp5, ptr %tmp2, align 8
  %tmp7 = add i64 %tmp, 8
  %tmp8 = icmp eq i64 %tmp7, 1024
  br i1 %tmp8, label %bb9, label %bb1

bb9:                                              ; preds = %bb1
  ret void
}

define void @bcast_unfold_fadd_v4f64(ptr nocapture %arg) {
; CHECK-LABEL: bcast_unfold_fadd_v4f64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT:    vbroadcastsd {{.*#+}} ymm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB34_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vaddpd 8192(%rdi,%rax), %ymm0, %ymm1
; CHECK-NEXT:    vmovupd %ymm1, 8192(%rdi,%rax)
; CHECK-NEXT:    addq $32, %rax
; CHECK-NEXT:    jne .LBB34_1
; CHECK-NEXT:  # %bb.2: # %bb9
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
  %tmp2 = getelementptr inbounds double, ptr %arg, i64 %tmp
  %tmp4 = load <4 x double>, ptr %tmp2, align 8
  %tmp5 = fadd <4 x double> %tmp4, <double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00>
  store <4 x double> %tmp5, ptr %tmp2, align 8
  %tmp7 = add i64 %tmp, 4
  %tmp8 = icmp eq i64 %tmp7, 1024
  br i1 %tmp8, label %bb9, label %bb1

bb9:                                              ; preds = %bb1
  ret void
}

define void @bcast_unfold_fadd_v2f64(ptr nocapture %arg) {
; CHECK-LABEL: bcast_unfold_fadd_v2f64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT:    vmovddup {{.*#+}} xmm0 = [2.0E+0,2.0E+0]
; CHECK-NEXT:    # xmm0 = mem[0,0]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB35_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vaddpd 8192(%rdi,%rax), %xmm0, %xmm1
; CHECK-NEXT:    vmovupd %xmm1, 8192(%rdi,%rax)
; CHECK-NEXT:    addq $16, %rax
; CHECK-NEXT:    jne .LBB35_1
; CHECK-NEXT:  # %bb.2: # %bb9
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
  %tmp2 = getelementptr inbounds double, ptr %arg, i64 %tmp
  %tmp4 = load <2 x double>, ptr %tmp2, align 8
  %tmp5 = fadd <2 x double> %tmp4, <double 2.000000e+00, double 2.000000e+00>
  store <2 x double> %tmp5, ptr %tmp2, align 8
  %tmp7 = add i64 %tmp, 2
  %tmp8 = icmp eq i64 %tmp7, 1024
  br i1 %tmp8, label %bb9, label %bb1

bb9:                                              ; preds = %bb1
  ret void
}

define void @bcast_unfold_fmul_v16f32(ptr nocapture %arg) {
; CHECK-LABEL: bcast_unfold_fmul_v16f32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    vbroadcastss {{.*#+}} zmm0 = [3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB36_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmulps 4096(%rdi,%rax), %zmm0, %zmm1
; CHECK-NEXT:    vmovups %zmm1, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $64, %rax
; CHECK-NEXT:    jne .LBB36_1
; CHECK-NEXT:  # %bb.2: # %bb9
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
  %tmp2 = getelementptr inbounds float, ptr %arg, i64 %tmp
  %tmp4 = load <16 x float>, ptr %tmp2, align 4
  %tmp5 = fmul <16 x float> %tmp4, <float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00>
  store <16 x float> %tmp5, ptr %tmp2, align 4
  %tmp7 = add i64 %tmp, 16
  %tmp8 = icmp eq i64 %tmp7, 1024
  br i1 %tmp8, label %bb9, label %bb1

bb9:                                              ; preds = %bb1
  ret void
}

define void @bcast_unfold_fmul_v8f32(ptr nocapture %arg) {
; CHECK-LABEL: bcast_unfold_fmul_v8f32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    vbroadcastss {{.*#+}} ymm0 = [3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB37_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmulps 4096(%rdi,%rax), %ymm0, %ymm1
; CHECK-NEXT:    vmovups %ymm1, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $32, %rax
; CHECK-NEXT:    jne .LBB37_1
; CHECK-NEXT:  # %bb.2: # %bb9
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
  %tmp2 = getelementptr inbounds float, ptr %arg, i64 %tmp
  %tmp4 = load <8 x float>, ptr %tmp2, align 4
  %tmp5 = fmul <8 x float> %tmp4, <float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00>
  store <8 x float> %tmp5, ptr %tmp2, align 4
  %tmp7 = add i64 %tmp, 8
  %tmp8 = icmp eq i64 %tmp7, 1024
  br i1 %tmp8, label %bb9, label %bb1

bb9:                                              ; preds = %bb1
  ret void
}

define void @bcast_unfold_fmul_v4f32(ptr nocapture %arg) {
; CHECK-LABEL: bcast_unfold_fmul_v4f32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    vbroadcastss {{.*#+}} xmm0 = [3.0E+0,3.0E+0,3.0E+0,3.0E+0]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB38_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmulps 4096(%rdi,%rax), %xmm0, %xmm1
; CHECK-NEXT:    vmovups %xmm1, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $16, %rax
; CHECK-NEXT:    jne .LBB38_1
; CHECK-NEXT:  # %bb.2: # %bb9
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
  %tmp2 = getelementptr inbounds float, ptr %arg, i64 %tmp
  %tmp4 = load <4 x float>, ptr %tmp2, align 4
  %tmp5 = fmul <4 x float> %tmp4, <float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00>
  store <4 x float> %tmp5, ptr %tmp2, align 4
  %tmp7 = add i64 %tmp, 4
  %tmp8 = icmp eq i64 %tmp7, 1024
  br i1 %tmp8, label %bb9, label %bb1

bb9:                                              ; preds = %bb1
  ret void
}

define void @bcast_unfold_fmul_v8f64(ptr nocapture %arg) {
; CHECK-LABEL: bcast_unfold_fmul_v8f64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT:    vbroadcastsd {{.*#+}} zmm0 = [3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB39_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmulpd 8192(%rdi,%rax), %zmm0, %zmm1
; CHECK-NEXT:    vmovupd %zmm1, 8192(%rdi,%rax)
; CHECK-NEXT:    addq $64, %rax
; CHECK-NEXT:    jne .LBB39_1
; CHECK-NEXT:  # %bb.2: # %bb9
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
  %tmp2 = getelementptr inbounds double, ptr %arg, i64 %tmp
  %tmp4 = load <8 x double>, ptr %tmp2, align 8
  %tmp5 = fmul <8 x double> %tmp4, <double 3.000000e+00, double 3.000000e+00, double 3.000000e+00, double 3.000000e+00, double 3.000000e+00, double 3.000000e+00, double 3.000000e+00, double 3.000000e+00>
  store <8 x double> %tmp5, ptr %tmp2, align 8
  %tmp7 = add i64 %tmp, 8
  %tmp8 = icmp eq i64 %tmp7, 1024
  br i1 %tmp8, label %bb9, label %bb1

bb9:                                              ; preds = %bb1
  ret void
}

define void @bcast_unfold_fmul_v4f64(ptr nocapture %arg) {
; CHECK-LABEL: bcast_unfold_fmul_v4f64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT:    vbroadcastsd {{.*#+}} ymm0 = [3.0E+0,3.0E+0,3.0E+0,3.0E+0]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB40_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmulpd 8192(%rdi,%rax), %ymm0, %ymm1
; CHECK-NEXT:    vmovupd %ymm1, 8192(%rdi,%rax)
; CHECK-NEXT:    addq $32, %rax
; CHECK-NEXT:    jne .LBB40_1
; CHECK-NEXT:  # %bb.2: # %bb9
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
  %tmp2 = getelementptr inbounds double, ptr %arg, i64 %tmp
  %tmp4 = load <4 x double>, ptr %tmp2, align 8
  %tmp5 = fmul <4 x double> %tmp4, <double 3.000000e+00, double 3.000000e+00, double 3.000000e+00, double 3.000000e+00>
  store <4 x double> %tmp5, ptr %tmp2, align 8
  %tmp7 = add i64 %tmp, 4
  %tmp8 = icmp eq i64 %tmp7, 1024
  br i1 %tmp8, label %bb9, label %bb1

bb9:                                              ; preds = %bb1
  ret void
}

define void @bcast_unfold_fmul_v2f64(ptr nocapture %arg) {
; CHECK-LABEL: bcast_unfold_fmul_v2f64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT:    vmovddup {{.*#+}} xmm0 = [3.0E+0,3.0E+0]
; CHECK-NEXT:    # xmm0 = mem[0,0]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB41_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmulpd 8192(%rdi,%rax), %xmm0, %xmm1
; CHECK-NEXT:    vmovupd %xmm1, 8192(%rdi,%rax)
; CHECK-NEXT:    addq $16, %rax
; CHECK-NEXT:    jne .LBB41_1
; CHECK-NEXT:  # %bb.2: # %bb9
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
  %tmp2 = getelementptr inbounds double, ptr %arg, i64 %tmp
  %tmp4 = load <2 x double>, ptr %tmp2, align 8
  %tmp5 = fmul <2 x double> %tmp4, <double 3.000000e+00, double 3.000000e+00>
  store <2 x double> %tmp5, ptr %tmp2, align 8
  %tmp7 = add i64 %tmp, 2
  %tmp8 = icmp eq i64 %tmp7, 1024
  br i1 %tmp8, label %bb9, label %bb1

bb9:                                              ; preds = %bb1
  ret void
}

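; For fdiv the loop-invariant constant is the divisor, and only the divisor
; operand of vdivps/vdivpd can come from memory, so the per-iteration value is
; loaded explicitly while the hoisted broadcast divisor stays in a register.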
define void @bcast_unfold_fdiv_v16f32(ptr nocapture %arg) {
; CHECK-LABEL: bcast_unfold_fdiv_v16f32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    vbroadcastss {{.*#+}} zmm0 = [3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB42_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovups 4096(%rdi,%rax), %zmm1
; CHECK-NEXT:    vdivps %zmm0, %zmm1, %zmm1
; CHECK-NEXT:    vmovups %zmm1, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $64, %rax
; CHECK-NEXT:    jne .LBB42_1
; CHECK-NEXT:  # %bb.2: # %bb9
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
  %tmp2 = getelementptr inbounds float, ptr %arg, i64 %tmp
  %tmp4 = load <16 x float>, ptr %tmp2, align 4
  %tmp5 = fdiv <16 x float> %tmp4, <float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00>
  store <16 x float> %tmp5, ptr %tmp2, align 4
  %tmp7 = add i64 %tmp, 16
  %tmp8 = icmp eq i64 %tmp7, 1024
  br i1 %tmp8, label %bb9, label %bb1

bb9:                                              ; preds = %bb1
  ret void
}

define void @bcast_unfold_fdiv_v8f32(ptr nocapture %arg) {
; CHECK-LABEL: bcast_unfold_fdiv_v8f32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    vbroadcastss {{.*#+}} ymm0 = [3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB43_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovups 4096(%rdi,%rax), %ymm1
; CHECK-NEXT:    vdivps %ymm0, %ymm1, %ymm1
; CHECK-NEXT:    vmovups %ymm1, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $32, %rax
; CHECK-NEXT:    jne .LBB43_1
; CHECK-NEXT:  # %bb.2: # %bb9
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
  %tmp2 = getelementptr inbounds float, ptr %arg, i64 %tmp
  %tmp4 = load <8 x float>, ptr %tmp2, align 4
  %tmp5 = fdiv <8 x float> %tmp4, <float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00>
  store <8 x float> %tmp5, ptr %tmp2, align 4
  %tmp7 = add i64 %tmp, 8
  %tmp8 = icmp eq i64 %tmp7, 1024
  br i1 %tmp8, label %bb9, label %bb1

bb9:                                              ; preds = %bb1
  ret void
}

define void @bcast_unfold_fdiv_v4f32(ptr nocapture %arg) {
; CHECK-LABEL: bcast_unfold_fdiv_v4f32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    vbroadcastss {{.*#+}} xmm0 = [3.0E+0,3.0E+0,3.0E+0,3.0E+0]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB44_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovups 4096(%rdi,%rax), %xmm1
; CHECK-NEXT:    vdivps %xmm0, %xmm1, %xmm1
; CHECK-NEXT:    vmovups %xmm1, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $16, %rax
; CHECK-NEXT:    jne .LBB44_1
; CHECK-NEXT:  # %bb.2: # %bb9
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
  %tmp2 = getelementptr inbounds float, ptr %arg, i64 %tmp
  %tmp4 = load <4 x float>, ptr %tmp2, align 4
  %tmp5 = fdiv <4 x float> %tmp4, <float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00>
  store <4 x float> %tmp5, ptr %tmp2, align 4
  %tmp7 = add i64 %tmp, 4
  %tmp8 = icmp eq i64 %tmp7, 1024
  br i1 %tmp8, label %bb9, label %bb1

bb9:                                              ; preds = %bb1
  ret void
}

define void @bcast_unfold_fdiv_v8f64(ptr nocapture %arg) {
; CHECK-LABEL: bcast_unfold_fdiv_v8f64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT:    vbroadcastsd {{.*#+}} zmm0 = [3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB45_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovupd 8192(%rdi,%rax), %zmm1
; CHECK-NEXT:    vdivpd %zmm0, %zmm1, %zmm1
; CHECK-NEXT:    vmovupd %zmm1, 8192(%rdi,%rax)
; CHECK-NEXT:    addq $64, %rax
; CHECK-NEXT:    jne .LBB45_1
; CHECK-NEXT:  # %bb.2: # %bb9
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
  %tmp2 = getelementptr inbounds double, ptr %arg, i64 %tmp
  %tmp4 = load <8 x double>, ptr %tmp2, align 8
  %tmp5 = fdiv <8 x double> %tmp4, <double 3.000000e+00, double 3.000000e+00, double 3.000000e+00, double 3.000000e+00, double 3.000000e+00, double 3.000000e+00, double 3.000000e+00, double 3.000000e+00>
  store <8 x double> %tmp5, ptr %tmp2, align 8
  %tmp7 = add i64 %tmp, 8
  %tmp8 = icmp eq i64 %tmp7, 1024
  br i1 %tmp8, label %bb9, label %bb1

bb9:                                              ; preds = %bb1
  ret void
}

define void @bcast_unfold_fdiv_v4f64(ptr nocapture %arg) {
; CHECK-LABEL: bcast_unfold_fdiv_v4f64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT:    vbroadcastsd {{.*#+}} ymm0 = [3.0E+0,3.0E+0,3.0E+0,3.0E+0]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB46_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovupd 8192(%rdi,%rax), %ymm1
; CHECK-NEXT:    vdivpd %ymm0, %ymm1, %ymm1
; CHECK-NEXT:    vmovupd %ymm1, 8192(%rdi,%rax)
; CHECK-NEXT:    addq $32, %rax
; CHECK-NEXT:    jne .LBB46_1
; CHECK-NEXT:  # %bb.2: # %bb9
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
  %tmp2 = getelementptr inbounds double, ptr %arg, i64 %tmp
  %tmp4 = load <4 x double>, ptr %tmp2, align 8
  %tmp5 = fdiv <4 x double> %tmp4, <double 3.000000e+00, double 3.000000e+00, double 3.000000e+00, double 3.000000e+00>
  store <4 x double> %tmp5, ptr %tmp2, align 8
  %tmp7 = add i64 %tmp, 4
  %tmp8 = icmp eq i64 %tmp7, 1024
  br i1 %tmp8, label %bb9, label %bb1

bb9:                                              ; preds = %bb1
  ret void
}

define void @bcast_unfold_fdiv_v2f64(ptr nocapture %arg) {
; CHECK-LABEL: bcast_unfold_fdiv_v2f64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT:    vmovddup {{.*#+}} xmm0 = [3.0E+0,3.0E+0]
; CHECK-NEXT:    # xmm0 = mem[0,0]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB47_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovupd 8192(%rdi,%rax), %xmm1
; CHECK-NEXT:    vdivpd %xmm0, %xmm1, %xmm1
; CHECK-NEXT:    vmovupd %xmm1, 8192(%rdi,%rax)
; CHECK-NEXT:    addq $16, %rax
; CHECK-NEXT:    jne .LBB47_1
; CHECK-NEXT:  # %bb.2: # %bb9
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
  %tmp2 = getelementptr inbounds double, ptr %arg, i64 %tmp
  %tmp4 = load <2 x double>, ptr %tmp2, align 8
  %tmp5 = fdiv <2 x double> %tmp4, <double 3.000000e+00, double 3.000000e+00>
  store <2 x double> %tmp5, ptr %tmp2, align 8
  %tmp7 = add i64 %tmp, 2
  %tmp8 = icmp eq i64 %tmp7, 1024
  br i1 %tmp8, label %bb9, label %bb1

bb9:                                              ; preds = %bb1
  ret void
}

define void @bcast_unfold_fma213_v4f32(ptr %arg) {
; CHECK-LABEL: bcast_unfold_fma213_v4f32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    vbroadcastss {{.*#+}} xmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB48_1: # %bb2
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovups 4096(%rdi,%rax), %xmm1
; CHECK-NEXT:    vfmadd213ps {{.*#+}} xmm1 = (xmm1 * xmm1) + xmm0
; CHECK-NEXT:    vmovups %xmm1, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $16, %rax
; CHECK-NEXT:    jne .LBB48_1
; CHECK-NEXT:  # %bb.2: # %bb11
; CHECK-NEXT:    retq
bb:
  br label %bb2

bb2:                                              ; preds = %bb2, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp9, %bb2 ]
  %tmp3 = getelementptr inbounds float, ptr %arg, i64 %tmp
  %tmp5 = load <4 x float>, ptr %tmp3, align 4
  %tmp6 = fmul contract <4 x float> %tmp5, %tmp5
  %tmp7 = fadd contract <4 x float> %tmp6, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
  store <4 x float> %tmp7, ptr %tmp3, align 4
  %tmp9 = add i64 %tmp, 4
  %tmp10 = icmp eq i64 %tmp9, 1024
  br i1 %tmp10, label %bb11, label %bb2

bb11:                                             ; preds = %bb2
  ret void
}

define void @bcast_unfold_fma231_v4f32(ptr %arg) {
; CHECK-LABEL: bcast_unfold_fma231_v4f32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    vbroadcastss {{.*#+}} xmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB49_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovups 4096(%rdi,%rax), %xmm1
; CHECK-NEXT:    vfmadd231ps {{.*#+}} xmm1 = (xmm1 * xmm0) + xmm1
; CHECK-NEXT:    vmovups %xmm1, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $16, %rax
; CHECK-NEXT:    jne .LBB49_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds float, ptr %arg, i64 %tmp
  %tmp4 = load <4 x float>, ptr %tmp2, align 4
  %tmp5 = fmul contract <4 x float> %tmp4, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
  %tmp6 = fadd contract <4 x float> %tmp4, %tmp5
  store <4 x float> %tmp6, ptr %tmp2, align 4
  %tmp8 = add i64 %tmp, 4
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}

define void @bcast_unfold_fma213_v8f32(ptr %arg) {
; CHECK-LABEL: bcast_unfold_fma213_v8f32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    vbroadcastss {{.*#+}} ymm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB50_1: # %bb2
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovups 4096(%rdi,%rax), %ymm1
; CHECK-NEXT:    vfmadd213ps {{.*#+}} ymm1 = (ymm1 * ymm1) + ymm0
; CHECK-NEXT:    vmovups %ymm1, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $32, %rax
; CHECK-NEXT:    jne .LBB50_1
; CHECK-NEXT:  # %bb.2: # %bb11
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb2

bb2:                                              ; preds = %bb2, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp9, %bb2 ]
  %tmp3 = getelementptr inbounds float, ptr %arg, i64 %tmp
  %tmp5 = load <8 x float>, ptr %tmp3, align 4
  %tmp6 = fmul contract <8 x float> %tmp5, %tmp5
  %tmp7 = fadd contract <8 x float> %tmp6, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
  store <8 x float> %tmp7, ptr %tmp3, align 4
  %tmp9 = add i64 %tmp, 8
  %tmp10 = icmp eq i64 %tmp9, 1024
  br i1 %tmp10, label %bb11, label %bb2

bb11:                                             ; preds = %bb2
  ret void
}

define void @bcast_unfold_fma231_v8f32(ptr %arg) {
; CHECK-LABEL: bcast_unfold_fma231_v8f32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    vbroadcastss {{.*#+}} ymm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB51_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovups 4096(%rdi,%rax), %ymm1
; CHECK-NEXT:    vfmadd231ps {{.*#+}} ymm1 = (ymm1 * ymm0) + ymm1
; CHECK-NEXT:    vmovups %ymm1, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $32, %rax
; CHECK-NEXT:    jne .LBB51_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds float, ptr %arg, i64 %tmp
  %tmp4 = load <8 x float>, ptr %tmp2, align 4
  %tmp5 = fmul contract <8 x float> %tmp4, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
  %tmp6 = fadd contract <8 x float> %tmp4, %tmp5
  store <8 x float> %tmp6, ptr %tmp2, align 4
  %tmp8 = add i64 %tmp, 8
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}

define void @bcast_unfold_fma213_v16f32(ptr %arg) {
; CHECK-LABEL: bcast_unfold_fma213_v16f32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    vbroadcastss {{.*#+}} zmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB52_1: # %bb2
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovups 4096(%rdi,%rax), %zmm1
; CHECK-NEXT:    vfmadd213ps {{.*#+}} zmm1 = (zmm1 * zmm1) + zmm0
; CHECK-NEXT:    vmovups %zmm1, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $64, %rax
; CHECK-NEXT:    jne .LBB52_1
; CHECK-NEXT:  # %bb.2: # %bb11
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb2

bb2:                                              ; preds = %bb2, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp9, %bb2 ]
  %tmp3 = getelementptr inbounds float, ptr %arg, i64 %tmp
  %tmp5 = load <16 x float>, ptr %tmp3, align 4
  %tmp6 = fmul contract <16 x float> %tmp5, %tmp5
  %tmp7 = fadd contract <16 x float> %tmp6, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
  store <16 x float> %tmp7, ptr %tmp3, align 4
  %tmp9 = add i64 %tmp, 16
  %tmp10 = icmp eq i64 %tmp9, 1024
  br i1 %tmp10, label %bb11, label %bb2

bb11:                                             ; preds = %bb2
  ret void
}

define void @bcast_unfold_fma231_v16f32(ptr %arg) {
; CHECK-LABEL: bcast_unfold_fma231_v16f32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    vbroadcastss {{.*#+}} zmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB53_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovups 4096(%rdi,%rax), %zmm1
; CHECK-NEXT:    vfmadd231ps {{.*#+}} zmm1 = (zmm1 * zmm0) + zmm1
; CHECK-NEXT:    vmovups %zmm1, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $64, %rax
; CHECK-NEXT:    jne .LBB53_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds float, ptr %arg, i64 %tmp
  %tmp4 = load <16 x float>, ptr %tmp2, align 4
  %tmp5 = fmul contract <16 x float> %tmp4, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
  %tmp6 = fadd contract <16 x float> %tmp4, %tmp5
  store <16 x float> %tmp6, ptr %tmp2, align 4
  %tmp8 = add i64 %tmp, 16
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}

define void @bcast_unfold_fma213_v2f64(ptr %arg) {
; CHECK-LABEL: bcast_unfold_fma213_v2f64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT:    vmovddup {{.*#+}} xmm0 = [2.0E+0,2.0E+0]
; CHECK-NEXT:    # xmm0 = mem[0,0]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB54_1: # %bb2
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovupd 8192(%rdi,%rax), %xmm1
; CHECK-NEXT:    vfmadd213pd {{.*#+}} xmm1 = (xmm1 * xmm1) + xmm0
; CHECK-NEXT:    vmovupd %xmm1, 8192(%rdi,%rax)
; CHECK-NEXT:    addq $16, %rax
; CHECK-NEXT:    jne .LBB54_1
; CHECK-NEXT:  # %bb.2: # %bb11
; CHECK-NEXT:    retq
bb:
  br label %bb2

bb2:                                              ; preds = %bb2, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp9, %bb2 ]
  %tmp3 = getelementptr inbounds double, ptr %arg, i64 %tmp
  %tmp5 = load <2 x double>, ptr %tmp3, align 8
  %tmp6 = fmul contract <2 x double> %tmp5, %tmp5
  %tmp7 = fadd contract <2 x double> %tmp6, <double 2.000000e+00, double 2.000000e+00>
  store <2 x double> %tmp7, ptr %tmp3, align 8
  %tmp9 = add i64 %tmp, 2
  %tmp10 = icmp eq i64 %tmp9, 1024
  br i1 %tmp10, label %bb11, label %bb2

bb11:                                             ; preds = %bb2
  ret void
}

define void @bcast_unfold_fma231_v2f64(ptr %arg) {
; CHECK-LABEL: bcast_unfold_fma231_v2f64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT:    vmovddup {{.*#+}} xmm0 = [2.0E+0,2.0E+0]
; CHECK-NEXT:    # xmm0 = mem[0,0]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB55_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovupd 8192(%rdi,%rax), %xmm1
; CHECK-NEXT:    vfmadd231pd {{.*#+}} xmm1 = (xmm1 * xmm0) + xmm1
; CHECK-NEXT:    vmovupd %xmm1, 8192(%rdi,%rax)
; CHECK-NEXT:    addq $16, %rax
; CHECK-NEXT:    jne .LBB55_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds double, ptr %arg, i64 %tmp
  %tmp4 = load <2 x double>, ptr %tmp2, align 8
  %tmp5 = fmul contract <2 x double> %tmp4, <double 2.000000e+00, double 2.000000e+00>
  %tmp6 = fadd contract <2 x double> %tmp4, %tmp5
  store <2 x double> %tmp6, ptr %tmp2, align 8
  %tmp8 = add i64 %tmp, 2
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}

define void @bcast_unfold_fma213_v4f64(ptr %arg) {
; CHECK-LABEL: bcast_unfold_fma213_v4f64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT:    vbroadcastsd {{.*#+}} ymm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB56_1: # %bb2
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovupd 8192(%rdi,%rax), %ymm1
; CHECK-NEXT:    vfmadd213pd {{.*#+}} ymm1 = (ymm1 * ymm1) + ymm0
; CHECK-NEXT:    vmovupd %ymm1, 8192(%rdi,%rax)
; CHECK-NEXT:    addq $32, %rax
; CHECK-NEXT:    jne .LBB56_1
; CHECK-NEXT:  # %bb.2: # %bb11
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb2

bb2:                                              ; preds = %bb2, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp9, %bb2 ]
  %tmp3 = getelementptr inbounds double, ptr %arg, i64 %tmp
  %tmp5 = load <4 x double>, ptr %tmp3, align 8
  %tmp6 = fmul contract <4 x double> %tmp5, %tmp5
  %tmp7 = fadd contract <4 x double> %tmp6, <double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00>
  store <4 x double> %tmp7, ptr %tmp3, align 8
  %tmp9 = add i64 %tmp, 4
  %tmp10 = icmp eq i64 %tmp9, 1024
  br i1 %tmp10, label %bb11, label %bb2

bb11:                                             ; preds = %bb2
  ret void
}

define void @bcast_unfold_fma231_v4f64(ptr %arg) {
; CHECK-LABEL: bcast_unfold_fma231_v4f64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT:    vbroadcastsd {{.*#+}} ymm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB57_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovupd 8192(%rdi,%rax), %ymm1
; CHECK-NEXT:    vfmadd231pd {{.*#+}} ymm1 = (ymm1 * ymm0) + ymm1
; CHECK-NEXT:    vmovupd %ymm1, 8192(%rdi,%rax)
; CHECK-NEXT:    addq $32, %rax
; CHECK-NEXT:    jne .LBB57_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds double, ptr %arg, i64 %tmp
  %tmp4 = load <4 x double>, ptr %tmp2, align 8
  %tmp5 = fmul contract <4 x double> %tmp4, <double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00>
  %tmp6 = fadd contract <4 x double> %tmp4, %tmp5
  store <4 x double> %tmp6, ptr %tmp2, align 8
  %tmp8 = add i64 %tmp, 4
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}

define void @bcast_unfold_fma213_v8f64(ptr %arg) {
; CHECK-LABEL: bcast_unfold_fma213_v8f64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT:    vbroadcastsd {{.*#+}} zmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB58_1: # %bb2
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovupd 8192(%rdi,%rax), %zmm1
; CHECK-NEXT:    vfmadd213pd {{.*#+}} zmm1 = (zmm1 * zmm1) + zmm0
; CHECK-NEXT:    vmovupd %zmm1, 8192(%rdi,%rax)
; CHECK-NEXT:    addq $64, %rax
; CHECK-NEXT:    jne .LBB58_1
; CHECK-NEXT:  # %bb.2: # %bb11
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb2

bb2:                                              ; preds = %bb2, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp9, %bb2 ]
  %tmp3 = getelementptr inbounds double, ptr %arg, i64 %tmp
  %tmp5 = load <8 x double>, ptr %tmp3, align 8
  %tmp6 = fmul contract <8 x double> %tmp5, %tmp5
  %tmp7 = fadd contract <8 x double> %tmp6, <double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00>
  store <8 x double> %tmp7, ptr %tmp3, align 8
  %tmp9 = add i64 %tmp, 8
  %tmp10 = icmp eq i64 %tmp9, 1024
  br i1 %tmp10, label %bb11, label %bb2

bb11:                                             ; preds = %bb2
  ret void
}

define void @bcast_unfold_fma231_v8f64(ptr %arg) {
; CHECK-LABEL: bcast_unfold_fma231_v8f64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT:    vbroadcastsd {{.*#+}} zmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB59_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovupd 8192(%rdi,%rax), %zmm1
; CHECK-NEXT:    vfmadd231pd {{.*#+}} zmm1 = (zmm1 * zmm0) + zmm1
; CHECK-NEXT:    vmovupd %zmm1, 8192(%rdi,%rax)
; CHECK-NEXT:    addq $64, %rax
; CHECK-NEXT:    jne .LBB59_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds double, ptr %arg, i64 %tmp
  %tmp4 = load <8 x double>, ptr %tmp2, align 8
  %tmp5 = fmul contract <8 x double> %tmp4, <double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00>
  %tmp6 = fadd contract <8 x double> %tmp4, %tmp5
  store <8 x double> %tmp6, ptr %tmp2, align 8
  %tmp8 = add i64 %tmp, 8
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}

define void @bcast_unfold_fmax_v4f32(ptr %arg) {
; CHECK-LABEL: bcast_unfold_fmax_v4f32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    vbroadcastss {{.*#+}} xmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB60_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovups 4096(%rdi,%rax), %xmm1
; CHECK-NEXT:    vmaxps %xmm0, %xmm1, %xmm1
; CHECK-NEXT:    vmovups %xmm1, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $16, %rax
; CHECK-NEXT:    jne .LBB60_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds float, ptr %arg, i64 %tmp
  %tmp4 = load <4 x float>, ptr %tmp2, align 4
  %tmp5 = fcmp ogt <4 x float> %tmp4, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
  %tmp6 = select <4 x i1> %tmp5, <4 x float> %tmp4, <4 x float> <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
  store <4 x float> %tmp6, ptr %tmp2, align 4
  %tmp8 = add i64 %tmp, 4
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}

define void @bcast_unfold_fmax_v8f32(ptr %arg) {
; CHECK-LABEL: bcast_unfold_fmax_v8f32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    vbroadcastss {{.*#+}} ymm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB61_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovups 4096(%rdi,%rax), %ymm1
; CHECK-NEXT:    vmaxps %ymm0, %ymm1, %ymm1
; CHECK-NEXT:    vmovups %ymm1, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $32, %rax
; CHECK-NEXT:    jne .LBB61_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds float, ptr %arg, i64 %tmp
  %tmp4 = load <8 x float>, ptr %tmp2, align 4
  %tmp5 = fcmp ogt <8 x float> %tmp4, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
  %tmp6 = select <8 x i1> %tmp5, <8 x float> %tmp4, <8 x float> <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
  store <8 x float> %tmp6, ptr %tmp2, align 4
  %tmp8 = add i64 %tmp, 8
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}

define void @bcast_unfold_fmax_v16f32(ptr %arg) {
; CHECK-LABEL: bcast_unfold_fmax_v16f32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    vbroadcastss {{.*#+}} zmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB62_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovups 4096(%rdi,%rax), %zmm1
; CHECK-NEXT:    vmaxps %zmm0, %zmm1, %zmm1
; CHECK-NEXT:    vmovups %zmm1, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $64, %rax
; CHECK-NEXT:    jne .LBB62_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds float, ptr %arg, i64 %tmp
  %tmp4 = load <16 x float>, ptr %tmp2, align 4
  %tmp5 = fcmp ogt <16 x float> %tmp4, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
  %tmp6 = select <16 x i1> %tmp5, <16 x float> %tmp4, <16 x float> <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
  store <16 x float> %tmp6, ptr %tmp2, align 4
  %tmp8 = add i64 %tmp, 16
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}

define void @bcast_unfold_fmax_v2f64(ptr %arg) {
; CHECK-LABEL: bcast_unfold_fmax_v2f64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT:    vmovddup {{.*#+}} xmm0 = [2.0E+0,2.0E+0]
; CHECK-NEXT:    # xmm0 = mem[0,0]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB63_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovupd 8192(%rdi,%rax), %xmm1
; CHECK-NEXT:    vmaxpd %xmm0, %xmm1, %xmm1
; CHECK-NEXT:    vmovupd %xmm1, 8192(%rdi,%rax)
; CHECK-NEXT:    addq $16, %rax
; CHECK-NEXT:    jne .LBB63_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds double, ptr %arg, i64 %tmp
  %tmp4 = load <2 x double>, ptr %tmp2, align 8
  %tmp5 = fcmp ogt <2 x double> %tmp4, <double 2.000000e+00, double 2.000000e+00>
  %tmp6 = select <2 x i1> %tmp5, <2 x double> %tmp4, <2 x double> <double 2.000000e+00, double 2.000000e+00>
  store <2 x double> %tmp6, ptr %tmp2, align 8
  %tmp8 = add i64 %tmp, 2
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}

define void @bcast_unfold_fmax_v4f64(ptr %arg) {
; CHECK-LABEL: bcast_unfold_fmax_v4f64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT:    vbroadcastsd {{.*#+}} ymm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB64_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovupd 8192(%rdi,%rax), %ymm1
; CHECK-NEXT:    vmaxpd %ymm0, %ymm1, %ymm1
; CHECK-NEXT:    vmovupd %ymm1, 8192(%rdi,%rax)
; CHECK-NEXT:    addq $32, %rax
; CHECK-NEXT:    jne .LBB64_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds double, ptr %arg, i64 %tmp
  %tmp4 = load <4 x double>, ptr %tmp2, align 8
  %tmp5 = fcmp ogt <4 x double> %tmp4, <double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00>
  %tmp6 = select <4 x i1> %tmp5, <4 x double> %tmp4, <4 x double> <double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00>
  store <4 x double> %tmp6, ptr %tmp2, align 8
  %tmp8 = add i64 %tmp, 4
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}

define void @bcast_unfold_fmax_v8f64(ptr %arg) {
; CHECK-LABEL: bcast_unfold_fmax_v8f64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT:    vbroadcastsd {{.*#+}} zmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB65_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovupd 8192(%rdi,%rax), %zmm1
; CHECK-NEXT:    vmaxpd %zmm0, %zmm1, %zmm1
; CHECK-NEXT:    vmovupd %zmm1, 8192(%rdi,%rax)
; CHECK-NEXT:    addq $64, %rax
; CHECK-NEXT:    jne .LBB65_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds double, ptr %arg, i64 %tmp
  %tmp4 = load <8 x double>, ptr %tmp2, align 8
  %tmp5 = fcmp ogt <8 x double> %tmp4, <double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00>
  %tmp6 = select <8 x i1> %tmp5, <8 x double> %tmp4, <8 x double> <double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00>
  store <8 x double> %tmp6, ptr %tmp2, align 8
  %tmp8 = add i64 %tmp, 8
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}

define void @bcast_unfold_fmin_v4f32(ptr %arg) {
; CHECK-LABEL: bcast_unfold_fmin_v4f32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    vbroadcastss {{.*#+}} xmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB66_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovups 4096(%rdi,%rax), %xmm1
; CHECK-NEXT:    vminps %xmm0, %xmm1, %xmm1
; CHECK-NEXT:    vmovups %xmm1, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $16, %rax
; CHECK-NEXT:    jne .LBB66_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds float, ptr %arg, i64 %tmp
  %tmp4 = load <4 x float>, ptr %tmp2, align 4
  %tmp5 = fcmp olt <4 x float> %tmp4, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
  %tmp6 = select <4 x i1> %tmp5, <4 x float> %tmp4, <4 x float> <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
  store <4 x float> %tmp6, ptr %tmp2, align 4
  %tmp8 = add i64 %tmp, 4
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}

define void @bcast_unfold_fmin_v8f32(ptr %arg) {
; CHECK-LABEL: bcast_unfold_fmin_v8f32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    vbroadcastss {{.*#+}} ymm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB67_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovups 4096(%rdi,%rax), %ymm1
; CHECK-NEXT:    vminps %ymm0, %ymm1, %ymm1
; CHECK-NEXT:    vmovups %ymm1, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $32, %rax
; CHECK-NEXT:    jne .LBB67_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds float, ptr %arg, i64 %tmp
  %tmp4 = load <8 x float>, ptr %tmp2, align 4
  %tmp5 = fcmp olt <8 x float> %tmp4, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
  %tmp6 = select <8 x i1> %tmp5, <8 x float> %tmp4, <8 x float> <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
  store <8 x float> %tmp6, ptr %tmp2, align 4
  %tmp8 = add i64 %tmp, 8
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}

define void @bcast_unfold_fmin_v16f32(ptr %arg) {
; CHECK-LABEL: bcast_unfold_fmin_v16f32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    vbroadcastss {{.*#+}} zmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB68_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovups 4096(%rdi,%rax), %zmm1
; CHECK-NEXT:    vminps %zmm0, %zmm1, %zmm1
; CHECK-NEXT:    vmovups %zmm1, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $64, %rax
; CHECK-NEXT:    jne .LBB68_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds float, ptr %arg, i64 %tmp
  %tmp4 = load <16 x float>, ptr %tmp2, align 4
  %tmp5 = fcmp olt <16 x float> %tmp4, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
  %tmp6 = select <16 x i1> %tmp5, <16 x float> %tmp4, <16 x float> <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
  store <16 x float> %tmp6, ptr %tmp2, align 4
  %tmp8 = add i64 %tmp, 16
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}

define void @bcast_unfold_fmin_v2f64(ptr %arg) {
; CHECK-LABEL: bcast_unfold_fmin_v2f64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT:    vmovddup {{.*#+}} xmm0 = [2.0E+0,2.0E+0]
; CHECK-NEXT:    # xmm0 = mem[0,0]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB69_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovupd 8192(%rdi,%rax), %xmm1
; CHECK-NEXT:    vminpd %xmm0, %xmm1, %xmm1
; CHECK-NEXT:    vmovupd %xmm1, 8192(%rdi,%rax)
; CHECK-NEXT:    addq $16, %rax
; CHECK-NEXT:    jne .LBB69_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds double, ptr %arg, i64 %tmp
  %tmp4 = load <2 x double>, ptr %tmp2, align 8
  %tmp5 = fcmp olt <2 x double> %tmp4, <double 2.000000e+00, double 2.000000e+00>
  %tmp6 = select <2 x i1> %tmp5, <2 x double> %tmp4, <2 x double> <double 2.000000e+00, double 2.000000e+00>
  store <2 x double> %tmp6, ptr %tmp2, align 8
  %tmp8 = add i64 %tmp, 2
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}

define void @bcast_unfold_fmin_v4f64(ptr %arg) {
; CHECK-LABEL: bcast_unfold_fmin_v4f64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT:    vbroadcastsd {{.*#+}} ymm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB70_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovupd 8192(%rdi,%rax), %ymm1
; CHECK-NEXT:    vminpd %ymm0, %ymm1, %ymm1
; CHECK-NEXT:    vmovupd %ymm1, 8192(%rdi,%rax)
; CHECK-NEXT:    addq $32, %rax
; CHECK-NEXT:    jne .LBB70_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds double, ptr %arg, i64 %tmp
  %tmp4 = load <4 x double>, ptr %tmp2, align 8
  %tmp5 = fcmp olt <4 x double> %tmp4, <double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00>
  %tmp6 = select <4 x i1> %tmp5, <4 x double> %tmp4, <4 x double> <double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00>
  store <4 x double> %tmp6, ptr %tmp2, align 8
  %tmp8 = add i64 %tmp, 4
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}

define void @bcast_unfold_fmin_v8f64(ptr %arg) {
; CHECK-LABEL: bcast_unfold_fmin_v8f64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT:    vbroadcastsd {{.*#+}} zmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB71_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovupd 8192(%rdi,%rax), %zmm1
; CHECK-NEXT:    vminpd %zmm0, %zmm1, %zmm1
; CHECK-NEXT:    vmovupd %zmm1, 8192(%rdi,%rax)
; CHECK-NEXT:    addq $64, %rax
; CHECK-NEXT:    jne .LBB71_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds double, ptr %arg, i64 %tmp
  %tmp4 = load <8 x double>, ptr %tmp2, align 8
  %tmp5 = fcmp olt <8 x double> %tmp4, <double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00>
  %tmp6 = select <8 x i1> %tmp5, <8 x double> %tmp4, <8 x double> <double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00>
  store <8 x double> %tmp6, ptr %tmp2, align 8
  %tmp8 = add i64 %tmp, 8
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}

define void @bcast_unfold_smin_v4i32(ptr %arg) {
; CHECK-LABEL: bcast_unfold_smin_v4i32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    vpbroadcastd {{.*#+}} xmm0 = [2,2,2,2]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB72_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vpminsd 4096(%rdi,%rax), %xmm0, %xmm1
; CHECK-NEXT:    vmovdqu %xmm1, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $16, %rax
; CHECK-NEXT:    jne .LBB72_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds i32, ptr %arg, i64 %tmp
  %tmp4 = load <4 x i32>, ptr %tmp2, align 4
  %tmp5 = icmp slt <4 x i32> %tmp4, <i32 2, i32 2, i32 2, i32 2>
  %tmp6 = select <4 x i1> %tmp5, <4 x i32> %tmp4, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
  store <4 x i32> %tmp6, ptr %tmp2, align 4
  %tmp8 = add i64 %tmp, 4
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}

define void @bcast_unfold_smin_v8i32(ptr %arg) {
; CHECK-LABEL: bcast_unfold_smin_v8i32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    vpbroadcastd {{.*#+}} ymm0 = [2,2,2,2,2,2,2,2]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB73_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vpminsd 4096(%rdi,%rax), %ymm0, %ymm1
; CHECK-NEXT:    vmovdqu %ymm1, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $32, %rax
; CHECK-NEXT:    jne .LBB73_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds i32, ptr %arg, i64 %tmp
  %tmp4 = load <8 x i32>, ptr %tmp2, align 4
  %tmp5 = icmp slt <8 x i32> %tmp4, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
  %tmp6 = select <8 x i1> %tmp5, <8 x i32> %tmp4, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
  store <8 x i32> %tmp6, ptr %tmp2, align 4
  %tmp8 = add i64 %tmp, 8
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}

define void @bcast_unfold_smin_v16i32(ptr %arg) {
; CHECK-LABEL: bcast_unfold_smin_v16i32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    vpbroadcastd {{.*#+}} zmm0 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB74_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vpminsd 4096(%rdi,%rax), %zmm0, %zmm1
; CHECK-NEXT:    vmovdqu64 %zmm1, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $64, %rax
; CHECK-NEXT:    jne .LBB74_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds i32, ptr %arg, i64 %tmp
  %tmp4 = load <16 x i32>, ptr %tmp2, align 4
  %tmp5 = icmp slt <16 x i32> %tmp4, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
  %tmp6 = select <16 x i1> %tmp5, <16 x i32> %tmp4, <16 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
  store <16 x i32> %tmp6, ptr %tmp2, align 4
  %tmp8 = add i64 %tmp, 16
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}

define void @bcast_unfold_smin_v2i64(ptr %arg) {
; CHECK-LABEL: bcast_unfold_smin_v2i64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT:    vpbroadcastq {{.*#+}} xmm0 = [2,2]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB75_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vpminsq 8192(%rdi,%rax), %xmm0, %xmm1
; CHECK-NEXT:    vmovdqu %xmm1, 8192(%rdi,%rax)
; CHECK-NEXT:    addq $16, %rax
; CHECK-NEXT:    jne .LBB75_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds i64, ptr %arg, i64 %tmp
  %tmp4 = load <2 x i64>, ptr %tmp2, align 8
  %tmp5 = icmp slt <2 x i64> %tmp4, <i64 2, i64 2>
  %tmp6 = select <2 x i1> %tmp5, <2 x i64> %tmp4, <2 x i64> <i64 2, i64 2>
  store <2 x i64> %tmp6, ptr %tmp2, align 8
  %tmp8 = add i64 %tmp, 2
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}

define void @bcast_unfold_smin_v4i64(ptr %arg) {
; CHECK-LABEL: bcast_unfold_smin_v4i64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT:    vpbroadcastq {{.*#+}} ymm0 = [2,2,2,2]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB76_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vpminsq 8192(%rdi,%rax), %ymm0, %ymm1
; CHECK-NEXT:    vmovdqu %ymm1, 8192(%rdi,%rax)
; CHECK-NEXT:    addq $32, %rax
; CHECK-NEXT:    jne .LBB76_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds i64, ptr %arg, i64 %tmp
  %tmp4 = load <4 x i64>, ptr %tmp2, align 8
  %tmp5 = icmp slt <4 x i64> %tmp4, <i64 2, i64 2, i64 2, i64 2>
  %tmp6 = select <4 x i1> %tmp5, <4 x i64> %tmp4, <4 x i64> <i64 2, i64 2, i64 2, i64 2>
  store <4 x i64> %tmp6, ptr %tmp2, align 8
  %tmp8 = add i64 %tmp, 4
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}

define void @bcast_unfold_smin_v8i64(ptr %arg) {
; CHECK-LABEL: bcast_unfold_smin_v8i64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT:    vpbroadcastq {{.*#+}} zmm0 = [2,2,2,2,2,2,2,2]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB77_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vpminsq 8192(%rdi,%rax), %zmm0, %zmm1
; CHECK-NEXT:    vmovdqu64 %zmm1, 8192(%rdi,%rax)
; CHECK-NEXT:    addq $64, %rax
; CHECK-NEXT:    jne .LBB77_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds i64, ptr %arg, i64 %tmp
  %tmp4 = load <8 x i64>, ptr %tmp2, align 8
  %tmp5 = icmp slt <8 x i64> %tmp4, <i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2>
  %tmp6 = select <8 x i1> %tmp5, <8 x i64> %tmp4, <8 x i64> <i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2>
  store <8 x i64> %tmp6, ptr %tmp2, align 8
  %tmp8 = add i64 %tmp, 8
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}

define void @bcast_unfold_smax_v4i32(ptr %arg) {
; CHECK-LABEL: bcast_unfold_smax_v4i32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    vpbroadcastd {{.*#+}} xmm0 = [2,2,2,2]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB78_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vpmaxsd 4096(%rdi,%rax), %xmm0, %xmm1
; CHECK-NEXT:    vmovdqu %xmm1, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $16, %rax
; CHECK-NEXT:    jne .LBB78_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds i32, ptr %arg, i64 %tmp
  %tmp4 = load <4 x i32>, ptr %tmp2, align 4
  %tmp5 = icmp sgt <4 x i32> %tmp4, <i32 2, i32 2, i32 2, i32 2>
  %tmp6 = select <4 x i1> %tmp5, <4 x i32> %tmp4, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
  store <4 x i32> %tmp6, ptr %tmp2, align 4
  %tmp8 = add i64 %tmp, 4
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}

define void @bcast_unfold_smax_v8i32(ptr %arg) {
; CHECK-LABEL: bcast_unfold_smax_v8i32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    vpbroadcastd {{.*#+}} ymm0 = [2,2,2,2,2,2,2,2]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB79_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vpmaxsd 4096(%rdi,%rax), %ymm0, %ymm1
; CHECK-NEXT:    vmovdqu %ymm1, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $32, %rax
; CHECK-NEXT:    jne .LBB79_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds i32, ptr %arg, i64 %tmp
  %tmp4 = load <8 x i32>, ptr %tmp2, align 4
  %tmp5 = icmp sgt <8 x i32> %tmp4, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
  %tmp6 = select <8 x i1> %tmp5, <8 x i32> %tmp4, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
  store <8 x i32> %tmp6, ptr %tmp2, align 4
  %tmp8 = add i64 %tmp, 8
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}

define void @bcast_unfold_smax_v16i32(ptr %arg) {
; CHECK-LABEL: bcast_unfold_smax_v16i32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    vpbroadcastd {{.*#+}} zmm0 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB80_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vpmaxsd 4096(%rdi,%rax), %zmm0, %zmm1
; CHECK-NEXT:    vmovdqu64 %zmm1, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $64, %rax
; CHECK-NEXT:    jne .LBB80_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds i32, ptr %arg, i64 %tmp
  %tmp4 = load <16 x i32>, ptr %tmp2, align 4
  %tmp5 = icmp sgt <16 x i32> %tmp4, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
  %tmp6 = select <16 x i1> %tmp5, <16 x i32> %tmp4, <16 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
  store <16 x i32> %tmp6, ptr %tmp2, align 4
  %tmp8 = add i64 %tmp, 16
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}

define void @bcast_unfold_smax_v2i64(ptr %arg) {
; CHECK-LABEL: bcast_unfold_smax_v2i64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT:    vpbroadcastq {{.*#+}} xmm0 = [2,2]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB81_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vpmaxsq 8192(%rdi,%rax), %xmm0, %xmm1
; CHECK-NEXT:    vmovdqu %xmm1, 8192(%rdi,%rax)
; CHECK-NEXT:    addq $16, %rax
; CHECK-NEXT:    jne .LBB81_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds i64, ptr %arg, i64 %tmp
  %tmp4 = load <2 x i64>, ptr %tmp2, align 8
  %tmp5 = icmp sgt <2 x i64> %tmp4, <i64 2, i64 2>
  %tmp6 = select <2 x i1> %tmp5, <2 x i64> %tmp4, <2 x i64> <i64 2, i64 2>
  store <2 x i64> %tmp6, ptr %tmp2, align 8
  %tmp8 = add i64 %tmp, 2
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}

define void @bcast_unfold_smax_v4i64(ptr %arg) {
; CHECK-LABEL: bcast_unfold_smax_v4i64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT:    vpbroadcastq {{.*#+}} ymm0 = [2,2,2,2]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB82_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vpmaxsq 8192(%rdi,%rax), %ymm0, %ymm1
; CHECK-NEXT:    vmovdqu %ymm1, 8192(%rdi,%rax)
; CHECK-NEXT:    addq $32, %rax
; CHECK-NEXT:    jne .LBB82_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds i64, ptr %arg, i64 %tmp
  %tmp4 = load <4 x i64>, ptr %tmp2, align 8
  %tmp5 = icmp sgt <4 x i64> %tmp4, <i64 2, i64 2, i64 2, i64 2>
  %tmp6 = select <4 x i1> %tmp5, <4 x i64> %tmp4, <4 x i64> <i64 2, i64 2, i64 2, i64 2>
  store <4 x i64> %tmp6, ptr %tmp2, align 8
  %tmp8 = add i64 %tmp, 4
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}

define void @bcast_unfold_smax_v8i64(ptr %arg) {
; CHECK-LABEL: bcast_unfold_smax_v8i64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT:    vpbroadcastq {{.*#+}} zmm0 = [2,2,2,2,2,2,2,2]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB83_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vpmaxsq 8192(%rdi,%rax), %zmm0, %zmm1
; CHECK-NEXT:    vmovdqu64 %zmm1, 8192(%rdi,%rax)
; CHECK-NEXT:    addq $64, %rax
; CHECK-NEXT:    jne .LBB83_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds i64, ptr %arg, i64 %tmp
  %tmp4 = load <8 x i64>, ptr %tmp2, align 8
  %tmp5 = icmp sgt <8 x i64> %tmp4, <i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2>
  %tmp6 = select <8 x i1> %tmp5, <8 x i64> %tmp4, <8 x i64> <i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2>
  store <8 x i64> %tmp6, ptr %tmp2, align 8
  %tmp8 = add i64 %tmp, 8
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}

define void @bcast_unfold_umin_v4i32(ptr %arg) {
; CHECK-LABEL: bcast_unfold_umin_v4i32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    vpbroadcastd {{.*#+}} xmm0 = [2,2,2,2]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB84_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vpminud 4096(%rdi,%rax), %xmm0, %xmm1
; CHECK-NEXT:    vmovdqu %xmm1, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $16, %rax
; CHECK-NEXT:    jne .LBB84_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds i32, ptr %arg, i64 %tmp
  %tmp4 = load <4 x i32>, ptr %tmp2, align 4
  %tmp5 = icmp ult <4 x i32> %tmp4, <i32 2, i32 2, i32 2, i32 2>
  %tmp6 = select <4 x i1> %tmp5, <4 x i32> %tmp4, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
  store <4 x i32> %tmp6, ptr %tmp2, align 4
  %tmp8 = add i64 %tmp, 4
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}

define void @bcast_unfold_umin_v8i32(ptr %arg) {
; CHECK-LABEL: bcast_unfold_umin_v8i32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    vpbroadcastd {{.*#+}} ymm0 = [2,2,2,2,2,2,2,2]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB85_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vpminud 4096(%rdi,%rax), %ymm0, %ymm1
; CHECK-NEXT:    vmovdqu %ymm1, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $32, %rax
; CHECK-NEXT:    jne .LBB85_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds i32, ptr %arg, i64 %tmp
  %tmp4 = load <8 x i32>, ptr %tmp2, align 4
  %tmp5 = icmp ult <8 x i32> %tmp4, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
  %tmp6 = select <8 x i1> %tmp5, <8 x i32> %tmp4, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
  store <8 x i32> %tmp6, ptr %tmp2, align 4
  %tmp8 = add i64 %tmp, 8
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}

define void @bcast_unfold_umin_v16i32(ptr %arg) {
; CHECK-LABEL: bcast_unfold_umin_v16i32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    vpbroadcastd {{.*#+}} zmm0 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB86_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vpminud 4096(%rdi,%rax), %zmm0, %zmm1
; CHECK-NEXT:    vmovdqu64 %zmm1, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $64, %rax
; CHECK-NEXT:    jne .LBB86_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds i32, ptr %arg, i64 %tmp
  %tmp4 = load <16 x i32>, ptr %tmp2, align 4
  %tmp5 = icmp ult <16 x i32> %tmp4, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
  %tmp6 = select <16 x i1> %tmp5, <16 x i32> %tmp4, <16 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
  store <16 x i32> %tmp6, ptr %tmp2, align 4
  %tmp8 = add i64 %tmp, 16
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}

define void @bcast_unfold_umin_v2i64(ptr %arg) {
; CHECK-LABEL: bcast_unfold_umin_v2i64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT:    vpbroadcastq {{.*#+}} xmm0 = [2,2]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB87_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vpminuq 8192(%rdi,%rax), %xmm0, %xmm1
; CHECK-NEXT:    vmovdqu %xmm1, 8192(%rdi,%rax)
; CHECK-NEXT:    addq $16, %rax
; CHECK-NEXT:    jne .LBB87_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds i64, ptr %arg, i64 %tmp
  %tmp4 = load <2 x i64>, ptr %tmp2, align 8
  %tmp5 = icmp ult <2 x i64> %tmp4, <i64 2, i64 2>
  %tmp6 = select <2 x i1> %tmp5, <2 x i64> %tmp4, <2 x i64> <i64 2, i64 2>
  store <2 x i64> %tmp6, ptr %tmp2, align 8
  %tmp8 = add i64 %tmp, 2
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}

define void @bcast_unfold_umin_v4i64(ptr %arg) {
; CHECK-LABEL: bcast_unfold_umin_v4i64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT:    vpbroadcastq {{.*#+}} ymm0 = [2,2,2,2]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB88_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vpminuq 8192(%rdi,%rax), %ymm0, %ymm1
; CHECK-NEXT:    vmovdqu %ymm1, 8192(%rdi,%rax)
; CHECK-NEXT:    addq $32, %rax
; CHECK-NEXT:    jne .LBB88_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds i64, ptr %arg, i64 %tmp
  %tmp4 = load <4 x i64>, ptr %tmp2, align 8
  %tmp5 = icmp ult <4 x i64> %tmp4, <i64 2, i64 2, i64 2, i64 2>
  %tmp6 = select <4 x i1> %tmp5, <4 x i64> %tmp4, <4 x i64> <i64 2, i64 2, i64 2, i64 2>
  store <4 x i64> %tmp6, ptr %tmp2, align 8
  %tmp8 = add i64 %tmp, 4
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}

define void @bcast_unfold_umin_v8i64(ptr %arg) {
; CHECK-LABEL: bcast_unfold_umin_v8i64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT:    vpbroadcastq {{.*#+}} zmm0 = [2,2,2,2,2,2,2,2]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB89_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vpminuq 8192(%rdi,%rax), %zmm0, %zmm1
; CHECK-NEXT:    vmovdqu64 %zmm1, 8192(%rdi,%rax)
; CHECK-NEXT:    addq $64, %rax
; CHECK-NEXT:    jne .LBB89_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds i64, ptr %arg, i64 %tmp
  %tmp4 = load <8 x i64>, ptr %tmp2, align 8
  %tmp5 = icmp ult <8 x i64> %tmp4, <i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2>
  %tmp6 = select <8 x i1> %tmp5, <8 x i64> %tmp4, <8 x i64> <i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2>
  store <8 x i64> %tmp6, ptr %tmp2, align 8
  %tmp8 = add i64 %tmp, 8
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}

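; Unsigned-max patterns: icmp ugt + select against a splat of 2 should lower to
; vpmaxud/vpmaxuq with the broadcast constant hoisted into a register outside the loop.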
define void @bcast_unfold_umax_v4i32(ptr %arg) {
; CHECK-LABEL: bcast_unfold_umax_v4i32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    vpbroadcastd {{.*#+}} xmm0 = [2,2,2,2]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB90_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vpmaxud 4096(%rdi,%rax), %xmm0, %xmm1
; CHECK-NEXT:    vmovdqu %xmm1, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $16, %rax
; CHECK-NEXT:    jne .LBB90_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds i32, ptr %arg, i64 %tmp
  %tmp4 = load <4 x i32>, ptr %tmp2, align 4
  %tmp5 = icmp ugt <4 x i32> %tmp4, <i32 2, i32 2, i32 2, i32 2>
  %tmp6 = select <4 x i1> %tmp5, <4 x i32> %tmp4, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
  store <4 x i32> %tmp6, ptr %tmp2, align 4
  %tmp8 = add i64 %tmp, 4
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}

define void @bcast_unfold_umax_v8i32(ptr %arg) {
; CHECK-LABEL: bcast_unfold_umax_v8i32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    vpbroadcastd {{.*#+}} ymm0 = [2,2,2,2,2,2,2,2]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB91_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vpmaxud 4096(%rdi,%rax), %ymm0, %ymm1
; CHECK-NEXT:    vmovdqu %ymm1, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $32, %rax
; CHECK-NEXT:    jne .LBB91_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds i32, ptr %arg, i64 %tmp
  %tmp4 = load <8 x i32>, ptr %tmp2, align 4
  %tmp5 = icmp ugt <8 x i32> %tmp4, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
  %tmp6 = select <8 x i1> %tmp5, <8 x i32> %tmp4, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
  store <8 x i32> %tmp6, ptr %tmp2, align 4
  %tmp8 = add i64 %tmp, 8
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}

define void @bcast_unfold_umax_v16i32(ptr %arg) {
; CHECK-LABEL: bcast_unfold_umax_v16i32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    vpbroadcastd {{.*#+}} zmm0 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB92_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vpmaxud 4096(%rdi,%rax), %zmm0, %zmm1
; CHECK-NEXT:    vmovdqu64 %zmm1, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $64, %rax
; CHECK-NEXT:    jne .LBB92_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds i32, ptr %arg, i64 %tmp
  %tmp4 = load <16 x i32>, ptr %tmp2, align 4
  %tmp5 = icmp ugt <16 x i32> %tmp4, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
  %tmp6 = select <16 x i1> %tmp5, <16 x i32> %tmp4, <16 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
  store <16 x i32> %tmp6, ptr %tmp2, align 4
  %tmp8 = add i64 %tmp, 16
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}

define void @bcast_unfold_umax_v2i64(ptr %arg) {
; CHECK-LABEL: bcast_unfold_umax_v2i64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT:    vpbroadcastq {{.*#+}} xmm0 = [2,2]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB93_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vpmaxuq 8192(%rdi,%rax), %xmm0, %xmm1
; CHECK-NEXT:    vmovdqu %xmm1, 8192(%rdi,%rax)
; CHECK-NEXT:    addq $16, %rax
; CHECK-NEXT:    jne .LBB93_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds i64, ptr %arg, i64 %tmp
  %tmp4 = load <2 x i64>, ptr %tmp2, align 8
  %tmp5 = icmp ugt <2 x i64> %tmp4, <i64 2, i64 2>
  %tmp6 = select <2 x i1> %tmp5, <2 x i64> %tmp4, <2 x i64> <i64 2, i64 2>
  store <2 x i64> %tmp6, ptr %tmp2, align 8
  %tmp8 = add i64 %tmp, 2
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}

define void @bcast_unfold_umax_v4i64(ptr %arg) {
; CHECK-LABEL: bcast_unfold_umax_v4i64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT:    vpbroadcastq {{.*#+}} ymm0 = [2,2,2,2]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB94_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vpmaxuq 8192(%rdi,%rax), %ymm0, %ymm1
; CHECK-NEXT:    vmovdqu %ymm1, 8192(%rdi,%rax)
; CHECK-NEXT:    addq $32, %rax
; CHECK-NEXT:    jne .LBB94_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds i64, ptr %arg, i64 %tmp
  %tmp4 = load <4 x i64>, ptr %tmp2, align 8
  %tmp5 = icmp ugt <4 x i64> %tmp4, <i64 2, i64 2, i64 2, i64 2>
  %tmp6 = select <4 x i1> %tmp5, <4 x i64> %tmp4, <4 x i64> <i64 2, i64 2, i64 2, i64 2>
  store <4 x i64> %tmp6, ptr %tmp2, align 8
  %tmp8 = add i64 %tmp, 4
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}

define void @bcast_unfold_umax_v8i64(ptr %arg) {
; CHECK-LABEL: bcast_unfold_umax_v8i64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT:    vpbroadcastq {{.*#+}} zmm0 = [2,2,2,2,2,2,2,2]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB95_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vpmaxuq 8192(%rdi,%rax), %zmm0, %zmm1
; CHECK-NEXT:    vmovdqu64 %zmm1, 8192(%rdi,%rax)
; CHECK-NEXT:    addq $64, %rax
; CHECK-NEXT:    jne .LBB95_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds i64, ptr %arg, i64 %tmp
  %tmp4 = load <8 x i64>, ptr %tmp2, align 8
  %tmp5 = icmp ugt <8 x i64> %tmp4, <i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2>
  %tmp6 = select <8 x i1> %tmp5, <8 x i64> %tmp4, <8 x i64> <i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2>
  store <8 x i64> %tmp6, ptr %tmp2, align 8
  %tmp8 = add i64 %tmp, 8
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}

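; Signed greater-than compare + select of a splat constant: expect vpcmpgt{d,q} into a
; mask register followed by a masked broadcast of the replacement value 3.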
define void @bcast_unfold_pcmpgt_v4i32(ptr %arg) {
; CHECK-LABEL: bcast_unfold_pcmpgt_v4i32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    vpbroadcastd {{.*#+}} xmm0 = [1,1,1,1]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB96_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovdqu 4096(%rdi,%rax), %xmm1
; CHECK-NEXT:    vpcmpgtd %xmm0, %xmm1, %k1
; CHECK-NEXT:    vpbroadcastd {{.*#+}} xmm1 {%k1} = [3,3,3,3]
; CHECK-NEXT:    vmovdqu %xmm1, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $16, %rax
; CHECK-NEXT:    jne .LBB96_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds i32, ptr %arg, i64 %tmp
  %tmp4 = load <4 x i32>, ptr %tmp2, align 4
  %tmp5 = icmp sgt <4 x i32> %tmp4, <i32 1, i32 1, i32 1, i32 1>
  %tmp6 = select <4 x i1> %tmp5, <4 x i32> <i32 3, i32 3, i32 3, i32 3>, <4 x i32> %tmp4
  store <4 x i32> %tmp6, ptr %tmp2, align 4
  %tmp8 = add i64 %tmp, 4
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}

define void @bcast_unfold_pcmpgt_v8i32(ptr %arg) {
; CHECK-LABEL: bcast_unfold_pcmpgt_v8i32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    vpbroadcastd {{.*#+}} ymm0 = [1,1,1,1,1,1,1,1]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB97_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovdqu 4096(%rdi,%rax), %ymm1
; CHECK-NEXT:    vpcmpgtd %ymm0, %ymm1, %k1
; CHECK-NEXT:    vpbroadcastd {{.*#+}} ymm1 {%k1} = [3,3,3,3,3,3,3,3]
; CHECK-NEXT:    vmovdqu %ymm1, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $32, %rax
; CHECK-NEXT:    jne .LBB97_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds i32, ptr %arg, i64 %tmp
  %tmp4 = load <8 x i32>, ptr %tmp2, align 4
  %tmp5 = icmp sgt <8 x i32> %tmp4, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %tmp6 = select <8 x i1> %tmp5, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>, <8 x i32> %tmp4
  store <8 x i32> %tmp6, ptr %tmp2, align 4
  %tmp8 = add i64 %tmp, 8
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}

define void @bcast_unfold_pcmpgt_v16i32(ptr %arg) {
; CHECK-LABEL: bcast_unfold_pcmpgt_v16i32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    vpbroadcastd {{.*#+}} zmm0 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB98_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovdqu64 4096(%rdi,%rax), %zmm1
; CHECK-NEXT:    vpcmpgtd %zmm0, %zmm1, %k1
; CHECK-NEXT:    vpbroadcastd {{.*#+}} zmm1 {%k1} = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
; CHECK-NEXT:    vmovdqu64 %zmm1, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $64, %rax
; CHECK-NEXT:    jne .LBB98_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds i32, ptr %arg, i64 %tmp
  %tmp4 = load <16 x i32>, ptr %tmp2, align 4
  %tmp5 = icmp sgt <16 x i32> %tmp4, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %tmp6 = select <16 x i1> %tmp5, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>, <16 x i32> %tmp4
  store <16 x i32> %tmp6, ptr %tmp2, align 4
  %tmp8 = add i64 %tmp, 16
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}

define void @bcast_unfold_pcmpgt_v2i64(ptr %arg) {
; CHECK-LABEL: bcast_unfold_pcmpgt_v2i64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT:    vpbroadcastq {{.*#+}} xmm0 = [1,1]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB99_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovdqu 8192(%rdi,%rax), %xmm1
; CHECK-NEXT:    vpcmpgtq %xmm0, %xmm1, %k1
; CHECK-NEXT:    vpbroadcastq {{.*#+}} xmm1 {%k1} = [3,3]
; CHECK-NEXT:    vmovdqu %xmm1, 8192(%rdi,%rax)
; CHECK-NEXT:    addq $16, %rax
; CHECK-NEXT:    jne .LBB99_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds i64, ptr %arg, i64 %tmp
  %tmp4 = load <2 x i64>, ptr %tmp2, align 4
  %tmp5 = icmp sgt <2 x i64> %tmp4, <i64 1, i64 1>
  %tmp6 = select <2 x i1> %tmp5, <2 x i64> <i64 3, i64 3>, <2 x i64> %tmp4
  store <2 x i64> %tmp6, ptr %tmp2, align 4
  %tmp8 = add i64 %tmp, 2
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}

define void @bcast_unfold_pcmpgt_v4i64(ptr %arg) {
; CHECK-LABEL: bcast_unfold_pcmpgt_v4i64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT:    vpbroadcastq {{.*#+}} ymm0 = [1,1,1,1]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB100_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovdqu 8192(%rdi,%rax), %ymm1
; CHECK-NEXT:    vpcmpgtq %ymm0, %ymm1, %k1
; CHECK-NEXT:    vpbroadcastq {{.*#+}} ymm1 {%k1} = [3,3,3,3]
; CHECK-NEXT:    vmovdqu %ymm1, 8192(%rdi,%rax)
; CHECK-NEXT:    addq $32, %rax
; CHECK-NEXT:    jne .LBB100_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds i64, ptr %arg, i64 %tmp
  %tmp4 = load <4 x i64>, ptr %tmp2, align 4
  %tmp5 = icmp sgt <4 x i64> %tmp4, <i64 1, i64 1, i64 1, i64 1>
  %tmp6 = select <4 x i1> %tmp5, <4 x i64> <i64 3, i64 3, i64 3, i64 3>, <4 x i64> %tmp4
  store <4 x i64> %tmp6, ptr %tmp2, align 4
  %tmp8 = add i64 %tmp, 4
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}

define void @bcast_unfold_pcmpgt_v8i64(ptr %arg) {
; CHECK-LABEL: bcast_unfold_pcmpgt_v8i64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT:    vpbroadcastq {{.*#+}} zmm0 = [1,1,1,1,1,1,1,1]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB101_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovdqu64 8192(%rdi,%rax), %zmm1
; CHECK-NEXT:    vpcmpgtq %zmm0, %zmm1, %k1
; CHECK-NEXT:    vpbroadcastq {{.*#+}} zmm1 {%k1} = [3,3,3,3,3,3,3,3]
; CHECK-NEXT:    vmovdqu64 %zmm1, 8192(%rdi,%rax)
; CHECK-NEXT:    addq $64, %rax
; CHECK-NEXT:    jne .LBB101_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds i64, ptr %arg, i64 %tmp
  %tmp4 = load <8 x i64>, ptr %tmp2, align 4
  %tmp5 = icmp sgt <8 x i64> %tmp4, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
  %tmp6 = select <8 x i1> %tmp5, <8 x i64> <i64 3, i64 3, i64 3, i64 3, i64 3, i64 3, i64 3, i64 3>, <8 x i64> %tmp4
  store <8 x i64> %tmp6, ptr %tmp2, align 4
  %tmp8 = add i64 %tmp, 8
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}

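; Equality compare + select of a splat constant: expect vpcmpeq{d,q} into a mask
; register followed by a masked broadcast of 3.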
define void @bcast_unfold_pcmpeq_v4i32(ptr %arg) {
; CHECK-LABEL: bcast_unfold_pcmpeq_v4i32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    vpbroadcastd {{.*#+}} xmm0 = [1,1,1,1]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB102_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovdqu 4096(%rdi,%rax), %xmm1
; CHECK-NEXT:    vpcmpeqd %xmm0, %xmm1, %k1
; CHECK-NEXT:    vpbroadcastd {{.*#+}} xmm1 {%k1} = [3,3,3,3]
; CHECK-NEXT:    vmovdqu %xmm1, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $16, %rax
; CHECK-NEXT:    jne .LBB102_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds i32, ptr %arg, i64 %tmp
  %tmp4 = load <4 x i32>, ptr %tmp2, align 4
  %tmp5 = icmp eq <4 x i32> %tmp4, <i32 1, i32 1, i32 1, i32 1>
  %tmp6 = select <4 x i1> %tmp5, <4 x i32> <i32 3, i32 3, i32 3, i32 3>, <4 x i32> %tmp4
  store <4 x i32> %tmp6, ptr %tmp2, align 4
  %tmp8 = add i64 %tmp, 4
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}

define void @bcast_unfold_pcmpeq_v8i32(ptr %arg) {
; CHECK-LABEL: bcast_unfold_pcmpeq_v8i32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    vpbroadcastd {{.*#+}} ymm0 = [1,1,1,1,1,1,1,1]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB103_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovdqu 4096(%rdi,%rax), %ymm1
; CHECK-NEXT:    vpcmpeqd %ymm0, %ymm1, %k1
; CHECK-NEXT:    vpbroadcastd {{.*#+}} ymm1 {%k1} = [3,3,3,3,3,3,3,3]
; CHECK-NEXT:    vmovdqu %ymm1, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $32, %rax
; CHECK-NEXT:    jne .LBB103_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds i32, ptr %arg, i64 %tmp
  %tmp4 = load <8 x i32>, ptr %tmp2, align 4
  %tmp5 = icmp eq <8 x i32> %tmp4, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %tmp6 = select <8 x i1> %tmp5, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>, <8 x i32> %tmp4
  store <8 x i32> %tmp6, ptr %tmp2, align 4
  %tmp8 = add i64 %tmp, 8
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}

define void @bcast_unfold_pcmpeq_v16i32(ptr %arg) {
; CHECK-LABEL: bcast_unfold_pcmpeq_v16i32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    vpbroadcastd {{.*#+}} zmm0 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB104_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovdqu64 4096(%rdi,%rax), %zmm1
; CHECK-NEXT:    vpcmpeqd %zmm0, %zmm1, %k1
; CHECK-NEXT:    vpbroadcastd {{.*#+}} zmm1 {%k1} = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
; CHECK-NEXT:    vmovdqu64 %zmm1, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $64, %rax
; CHECK-NEXT:    jne .LBB104_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds i32, ptr %arg, i64 %tmp
  %tmp4 = load <16 x i32>, ptr %tmp2, align 4
  %tmp5 = icmp eq <16 x i32> %tmp4, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %tmp6 = select <16 x i1> %tmp5, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>, <16 x i32> %tmp4
  store <16 x i32> %tmp6, ptr %tmp2, align 4
  %tmp8 = add i64 %tmp, 16
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}

define void @bcast_unfold_pcmpeq_v2i64(ptr %arg) {
; CHECK-LABEL: bcast_unfold_pcmpeq_v2i64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT:    vpbroadcastq {{.*#+}} xmm0 = [1,1]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB105_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovdqu 8192(%rdi,%rax), %xmm1
; CHECK-NEXT:    vpcmpeqq %xmm0, %xmm1, %k1
; CHECK-NEXT:    vpbroadcastq {{.*#+}} xmm1 {%k1} = [3,3]
; CHECK-NEXT:    vmovdqu %xmm1, 8192(%rdi,%rax)
; CHECK-NEXT:    addq $16, %rax
; CHECK-NEXT:    jne .LBB105_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds i64, ptr %arg, i64 %tmp
  %tmp4 = load <2 x i64>, ptr %tmp2, align 4
  %tmp5 = icmp eq <2 x i64> %tmp4, <i64 1, i64 1>
  %tmp6 = select <2 x i1> %tmp5, <2 x i64> <i64 3, i64 3>, <2 x i64> %tmp4
  store <2 x i64> %tmp6, ptr %tmp2, align 4
  %tmp8 = add i64 %tmp, 2
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}

define void @bcast_unfold_pcmpeq_v4i64(ptr %arg) {
; CHECK-LABEL: bcast_unfold_pcmpeq_v4i64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT:    vpbroadcastq {{.*#+}} ymm0 = [1,1,1,1]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB106_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovdqu 8192(%rdi,%rax), %ymm1
; CHECK-NEXT:    vpcmpeqq %ymm0, %ymm1, %k1
; CHECK-NEXT:    vpbroadcastq {{.*#+}} ymm1 {%k1} = [3,3,3,3]
; CHECK-NEXT:    vmovdqu %ymm1, 8192(%rdi,%rax)
; CHECK-NEXT:    addq $32, %rax
; CHECK-NEXT:    jne .LBB106_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds i64, ptr %arg, i64 %tmp
  %tmp4 = load <4 x i64>, ptr %tmp2, align 4
  %tmp5 = icmp eq <4 x i64> %tmp4, <i64 1, i64 1, i64 1, i64 1>
  %tmp6 = select <4 x i1> %tmp5, <4 x i64> <i64 3, i64 3, i64 3, i64 3>, <4 x i64> %tmp4
  store <4 x i64> %tmp6, ptr %tmp2, align 4
  %tmp8 = add i64 %tmp, 4
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}

define void @bcast_unfold_pcmpeq_v8i64(ptr %arg) {
; CHECK-LABEL: bcast_unfold_pcmpeq_v8i64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT:    vpbroadcastq {{.*#+}} zmm0 = [1,1,1,1,1,1,1,1]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB107_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovdqu64 8192(%rdi,%rax), %zmm1
; CHECK-NEXT:    vpcmpeqq %zmm0, %zmm1, %k1
; CHECK-NEXT:    vpbroadcastq {{.*#+}} zmm1 {%k1} = [3,3,3,3,3,3,3,3]
; CHECK-NEXT:    vmovdqu64 %zmm1, 8192(%rdi,%rax)
; CHECK-NEXT:    addq $64, %rax
; CHECK-NEXT:    jne .LBB107_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds i64, ptr %arg, i64 %tmp
  %tmp4 = load <8 x i64>, ptr %tmp2, align 4
  %tmp5 = icmp eq <8 x i64> %tmp4, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
  %tmp6 = select <8 x i1> %tmp5, <8 x i64> <i64 3, i64 3, i64 3, i64 3, i64 3, i64 3, i64 3, i64 3>, <8 x i64> %tmp4
  store <8 x i64> %tmp6, ptr %tmp2, align 4
  %tmp8 = add i64 %tmp, 8
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}

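; Signed less-than compares against a splat of 1. These loops use a forward, zero-based
; index, so the bound is checked with cmpq/jg instead of the negative-offset idiom above.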
define void @bcast_unfold_pcmp_v4i32(ptr %arg) {
; CHECK-LABEL: bcast_unfold_pcmp_v4i32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    vpbroadcastd {{.*#+}} xmm0 = [1,1,1,1]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB108_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovdqu (%rdi,%rax,4), %xmm1
; CHECK-NEXT:    vpcmpltd %xmm0, %xmm1, %k1
; CHECK-NEXT:    vpbroadcastd {{.*#+}} xmm1 {%k1} = [3,3,3,3]
; CHECK-NEXT:    vmovdqu %xmm1, (%rdi,%rax,4)
; CHECK-NEXT:    addq $4, %rax
; CHECK-NEXT:    cmpq $1023, %rax # imm = 0x3FF
; CHECK-NEXT:    jg .LBB108_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds i32, ptr %arg, i64 %tmp
  %tmp4 = load <4 x i32>, ptr %tmp2, align 4
  %tmp5 = icmp slt <4 x i32> %tmp4, <i32 1, i32 1, i32 1, i32 1>
  %tmp6 = select <4 x i1> %tmp5, <4 x i32> <i32 3, i32 3, i32 3, i32 3>, <4 x i32> %tmp4
  store <4 x i32> %tmp6, ptr %tmp2, align 4
  %tmp8 = add i64 %tmp, 4
  %tmp9 = icmp slt i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}

define void @bcast_unfold_pcmp_v8i32(ptr %arg) {
; CHECK-LABEL: bcast_unfold_pcmp_v8i32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    vpbroadcastd {{.*#+}} ymm0 = [1,1,1,1,1,1,1,1]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB109_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovdqu (%rdi,%rax,4), %ymm1
; CHECK-NEXT:    vpcmpltd %ymm0, %ymm1, %k1
; CHECK-NEXT:    vpbroadcastd {{.*#+}} ymm1 {%k1} = [3,3,3,3,3,3,3,3]
; CHECK-NEXT:    vmovdqu %ymm1, (%rdi,%rax,4)
; CHECK-NEXT:    addq $8, %rax
; CHECK-NEXT:    cmpq $1023, %rax # imm = 0x3FF
; CHECK-NEXT:    jg .LBB109_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds i32, ptr %arg, i64 %tmp
  %tmp4 = load <8 x i32>, ptr %tmp2, align 4
  %tmp5 = icmp slt <8 x i32> %tmp4, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %tmp6 = select <8 x i1> %tmp5, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>, <8 x i32> %tmp4
  store <8 x i32> %tmp6, ptr %tmp2, align 4
  %tmp8 = add i64 %tmp, 8
  %tmp9 = icmp slt i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}

define void @bcast_unfold_pcmp_v16i32(ptr %arg) {
; CHECK-LABEL: bcast_unfold_pcmp_v16i32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    vpbroadcastd {{.*#+}} zmm0 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB110_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovdqu64 (%rdi,%rax,4), %zmm1
; CHECK-NEXT:    vpcmpltd %zmm0, %zmm1, %k1
; CHECK-NEXT:    vpbroadcastd {{.*#+}} zmm1 {%k1} = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
; CHECK-NEXT:    vmovdqu64 %zmm1, (%rdi,%rax,4)
; CHECK-NEXT:    addq $16, %rax
; CHECK-NEXT:    cmpq $1023, %rax # imm = 0x3FF
; CHECK-NEXT:    jg .LBB110_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds i32, ptr %arg, i64 %tmp
  %tmp4 = load <16 x i32>, ptr %tmp2, align 4
  %tmp5 = icmp slt <16 x i32> %tmp4, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %tmp6 = select <16 x i1> %tmp5, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>, <16 x i32> %tmp4
  store <16 x i32> %tmp6, ptr %tmp2, align 4
  %tmp8 = add i64 %tmp, 16
  %tmp9 = icmp slt i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}

define void @bcast_unfold_pcmp_v2i64(ptr %arg) {
; CHECK-LABEL: bcast_unfold_pcmp_v2i64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    vpbroadcastq {{.*#+}} xmm0 = [1,1]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB111_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovdqu (%rdi,%rax,8), %xmm1
; CHECK-NEXT:    vpcmpltq %xmm0, %xmm1, %k1
; CHECK-NEXT:    vpbroadcastq {{.*#+}} xmm1 {%k1} = [3,3]
; CHECK-NEXT:    vmovdqu %xmm1, (%rdi,%rax,8)
; CHECK-NEXT:    addq $2, %rax
; CHECK-NEXT:    cmpq $1023, %rax # imm = 0x3FF
; CHECK-NEXT:    jg .LBB111_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds i64, ptr %arg, i64 %tmp
  %tmp4 = load <2 x i64>, ptr %tmp2, align 4
  %tmp5 = icmp slt <2 x i64> %tmp4, <i64 1, i64 1>
  %tmp6 = select <2 x i1> %tmp5, <2 x i64> <i64 3, i64 3>, <2 x i64> %tmp4
  store <2 x i64> %tmp6, ptr %tmp2, align 4
  %tmp8 = add i64 %tmp, 2
  %tmp9 = icmp slt i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}

define void @bcast_unfold_pcmp_v4i64(ptr %arg) {
; CHECK-LABEL: bcast_unfold_pcmp_v4i64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    vpbroadcastq {{.*#+}} ymm0 = [1,1,1,1]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB112_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovdqu (%rdi,%rax,8), %ymm1
; CHECK-NEXT:    vpcmpltq %ymm0, %ymm1, %k1
; CHECK-NEXT:    vpbroadcastq {{.*#+}} ymm1 {%k1} = [3,3,3,3]
; CHECK-NEXT:    vmovdqu %ymm1, (%rdi,%rax,8)
; CHECK-NEXT:    addq $4, %rax
; CHECK-NEXT:    cmpq $1023, %rax # imm = 0x3FF
; CHECK-NEXT:    jg .LBB112_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds i64, ptr %arg, i64 %tmp
  %tmp4 = load <4 x i64>, ptr %tmp2, align 4
  %tmp5 = icmp slt <4 x i64> %tmp4, <i64 1, i64 1, i64 1, i64 1>
  %tmp6 = select <4 x i1> %tmp5, <4 x i64> <i64 3, i64 3, i64 3, i64 3>, <4 x i64> %tmp4
  store <4 x i64> %tmp6, ptr %tmp2, align 4
  %tmp8 = add i64 %tmp, 4
  %tmp9 = icmp slt i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}

define void @bcast_unfold_pcmp_v8i64(ptr %arg) {
; CHECK-LABEL: bcast_unfold_pcmp_v8i64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    vpbroadcastq {{.*#+}} zmm0 = [1,1,1,1,1,1,1,1]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB113_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovdqu64 (%rdi,%rax,8), %zmm1
; CHECK-NEXT:    vpcmpltq %zmm0, %zmm1, %k1
; CHECK-NEXT:    vpbroadcastq {{.*#+}} zmm1 {%k1} = [3,3,3,3,3,3,3,3]
; CHECK-NEXT:    vmovdqu64 %zmm1, (%rdi,%rax,8)
; CHECK-NEXT:    addq $8, %rax
; CHECK-NEXT:    cmpq $1023, %rax # imm = 0x3FF
; CHECK-NEXT:    jg .LBB113_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds i64, ptr %arg, i64 %tmp
  %tmp4 = load <8 x i64>, ptr %tmp2, align 4
  %tmp5 = icmp slt <8 x i64> %tmp4, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
  %tmp6 = select <8 x i1> %tmp5, <8 x i64> <i64 3, i64 3, i64 3, i64 3, i64 3, i64 3, i64 3, i64 3>, <8 x i64> %tmp4
  store <8 x i64> %tmp6, ptr %tmp2, align 4
  %tmp8 = add i64 %tmp, 8
  %tmp9 = icmp slt i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}

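; Same pattern with unsigned less-than compares: expect vpcmpltud/vpcmpltuq mask
; compares and cmpq/ja loop bounds.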
define void @bcast_unfold_pcmpu_v4i32(ptr %arg) {
; CHECK-LABEL: bcast_unfold_pcmpu_v4i32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    vpbroadcastd {{.*#+}} xmm0 = [2,2,2,2]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB114_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovdqu (%rdi,%rax,4), %xmm1
; CHECK-NEXT:    vpcmpltud %xmm0, %xmm1, %k1
; CHECK-NEXT:    vpbroadcastd {{.*#+}} xmm1 {%k1} = [3,3,3,3]
; CHECK-NEXT:    vmovdqu %xmm1, (%rdi,%rax,4)
; CHECK-NEXT:    addq $4, %rax
; CHECK-NEXT:    cmpq $1023, %rax # imm = 0x3FF
; CHECK-NEXT:    ja .LBB114_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds i32, ptr %arg, i64 %tmp
  %tmp4 = load <4 x i32>, ptr %tmp2, align 4
  %tmp5 = icmp ult <4 x i32> %tmp4, <i32 2, i32 2, i32 2, i32 2>
  %tmp6 = select <4 x i1> %tmp5, <4 x i32> <i32 3, i32 3, i32 3, i32 3>, <4 x i32> %tmp4
  store <4 x i32> %tmp6, ptr %tmp2, align 4
  %tmp8 = add i64 %tmp, 4
  %tmp9 = icmp ult i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}

define void @bcast_unfold_pcmpu_v8i32(ptr %arg) {
; CHECK-LABEL: bcast_unfold_pcmpu_v8i32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    vpbroadcastd {{.*#+}} ymm0 = [2,2,2,2,2,2,2,2]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB115_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovdqu (%rdi,%rax,4), %ymm1
; CHECK-NEXT:    vpcmpltud %ymm0, %ymm1, %k1
; CHECK-NEXT:    vpbroadcastd {{.*#+}} ymm1 {%k1} = [3,3,3,3,3,3,3,3]
; CHECK-NEXT:    vmovdqu %ymm1, (%rdi,%rax,4)
; CHECK-NEXT:    addq $8, %rax
; CHECK-NEXT:    cmpq $1023, %rax # imm = 0x3FF
; CHECK-NEXT:    ja .LBB115_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds i32, ptr %arg, i64 %tmp
  %tmp4 = load <8 x i32>, ptr %tmp2, align 4
  %tmp5 = icmp ult <8 x i32> %tmp4, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
  %tmp6 = select <8 x i1> %tmp5, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>, <8 x i32> %tmp4
  store <8 x i32> %tmp6, ptr %tmp2, align 4
  %tmp8 = add i64 %tmp, 8
  %tmp9 = icmp ult i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}

define void @bcast_unfold_pcmpu_v16i32(ptr %arg) {
; CHECK-LABEL: bcast_unfold_pcmpu_v16i32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    vpbroadcastd {{.*#+}} zmm0 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB116_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovdqu64 (%rdi,%rax,4), %zmm1
; CHECK-NEXT:    vpcmpltud %zmm0, %zmm1, %k1
; CHECK-NEXT:    vpbroadcastd {{.*#+}} zmm1 {%k1} = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
; CHECK-NEXT:    vmovdqu64 %zmm1, (%rdi,%rax,4)
; CHECK-NEXT:    addq $16, %rax
; CHECK-NEXT:    cmpq $1023, %rax # imm = 0x3FF
; CHECK-NEXT:    ja .LBB116_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds i32, ptr %arg, i64 %tmp
  %tmp4 = load <16 x i32>, ptr %tmp2, align 4
  %tmp5 = icmp ult <16 x i32> %tmp4, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
  %tmp6 = select <16 x i1> %tmp5, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>, <16 x i32> %tmp4
  store <16 x i32> %tmp6, ptr %tmp2, align 4
  %tmp8 = add i64 %tmp, 16
  %tmp9 = icmp ult i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}

define void @bcast_unfold_pcmpu_v2i64(ptr %arg) {
; CHECK-LABEL: bcast_unfold_pcmpu_v2i64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    vpbroadcastq {{.*#+}} xmm0 = [2,2]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB117_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovdqu (%rdi,%rax,8), %xmm1
; CHECK-NEXT:    vpcmpltuq %xmm0, %xmm1, %k1
; CHECK-NEXT:    vpbroadcastq {{.*#+}} xmm1 {%k1} = [3,3]
; CHECK-NEXT:    vmovdqu %xmm1, (%rdi,%rax,8)
; CHECK-NEXT:    addq $2, %rax
; CHECK-NEXT:    cmpq $1023, %rax # imm = 0x3FF
; CHECK-NEXT:    ja .LBB117_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds i64, ptr %arg, i64 %tmp
  %tmp4 = load <2 x i64>, ptr %tmp2, align 4
  %tmp5 = icmp ult <2 x i64> %tmp4, <i64 2, i64 2>
  %tmp6 = select <2 x i1> %tmp5, <2 x i64> <i64 3, i64 3>, <2 x i64> %tmp4
  store <2 x i64> %tmp6, ptr %tmp2, align 4
  %tmp8 = add i64 %tmp, 2
  %tmp9 = icmp ult i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}

define void @bcast_unfold_pcmpu_v4i64(ptr %arg) {
; CHECK-LABEL: bcast_unfold_pcmpu_v4i64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    vpbroadcastq {{.*#+}} ymm0 = [2,2,2,2]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB118_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovdqu (%rdi,%rax,8), %ymm1
; CHECK-NEXT:    vpcmpltuq %ymm0, %ymm1, %k1
; CHECK-NEXT:    vpbroadcastq {{.*#+}} ymm1 {%k1} = [3,3,3,3]
; CHECK-NEXT:    vmovdqu %ymm1, (%rdi,%rax,8)
; CHECK-NEXT:    addq $4, %rax
; CHECK-NEXT:    cmpq $1023, %rax # imm = 0x3FF
; CHECK-NEXT:    ja .LBB118_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds i64, ptr %arg, i64 %tmp
  %tmp4 = load <4 x i64>, ptr %tmp2, align 4
  %tmp5 = icmp ult <4 x i64> %tmp4, <i64 2, i64 2, i64 2, i64 2>
  %tmp6 = select <4 x i1> %tmp5, <4 x i64> <i64 3, i64 3, i64 3, i64 3>, <4 x i64> %tmp4
  store <4 x i64> %tmp6, ptr %tmp2, align 4
  %tmp8 = add i64 %tmp, 4
  %tmp9 = icmp ult i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}

define void @bcast_unfold_pcmpu_v8i64(ptr %arg) {
; CHECK-LABEL: bcast_unfold_pcmpu_v8i64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    vpbroadcastq {{.*#+}} zmm0 = [2,2,2,2,2,2,2,2]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB119_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovdqu64 (%rdi,%rax,8), %zmm1
; CHECK-NEXT:    vpcmpltuq %zmm0, %zmm1, %k1
; CHECK-NEXT:    vpbroadcastq {{.*#+}} zmm1 {%k1} = [3,3,3,3,3,3,3,3]
; CHECK-NEXT:    vmovdqu64 %zmm1, (%rdi,%rax,8)
; CHECK-NEXT:    addq $8, %rax
; CHECK-NEXT:    cmpq $1023, %rax # imm = 0x3FF
; CHECK-NEXT:    ja .LBB119_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds i64, ptr %arg, i64 %tmp
  %tmp4 = load <8 x i64>, ptr %tmp2, align 4
  %tmp5 = icmp ult <8 x i64> %tmp4, <i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2>
  %tmp6 = select <8 x i1> %tmp5, <8 x i64> <i64 3, i64 3, i64 3, i64 3, i64 3, i64 3, i64 3, i64 3>, <8 x i64> %tmp4
  store <8 x i64> %tmp6, ptr %tmp2, align 4
  %tmp8 = add i64 %tmp, 8
  %tmp9 = icmp ult i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}

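; Floating-point compares: fcmp olt against a splat of 2.0 with a select against a splat
; of 3.0 should produce vcmpltps/vcmpltpd into a mask and a vblendm of the two broadcast
; registers.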
define void @bcast_unfold_cmp_v4f32(ptr %arg) {
; CHECK-LABEL: bcast_unfold_cmp_v4f32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    vbroadcastss {{.*#+}} xmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0]
; CHECK-NEXT:    vbroadcastss {{.*#+}} xmm1 = [3.0E+0,3.0E+0,3.0E+0,3.0E+0]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB120_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovups 4096(%rdi,%rax), %xmm2
; CHECK-NEXT:    vcmpltps %xmm0, %xmm2, %k1
; CHECK-NEXT:    vblendmps %xmm2, %xmm1, %xmm2 {%k1}
; CHECK-NEXT:    vmovups %xmm2, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $16, %rax
; CHECK-NEXT:    jne .LBB120_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds float, ptr %arg, i64 %tmp
  %tmp4 = load <4 x float>, ptr %tmp2, align 4
  %tmp5 = fcmp olt <4 x float> %tmp4, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
  %tmp6 = select <4 x i1> %tmp5, <4 x float> %tmp4, <4 x float> <float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00>
  store <4 x float> %tmp6, ptr %tmp2, align 4
  %tmp8 = add i64 %tmp, 4
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}

define void @bcast_unfold_cmp_v8f32(ptr %arg) {
; CHECK-LABEL: bcast_unfold_cmp_v8f32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    vbroadcastss {{.*#+}} ymm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
; CHECK-NEXT:    vbroadcastss {{.*#+}} ymm1 = [3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB121_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovups 4096(%rdi,%rax), %ymm2
; CHECK-NEXT:    vcmpltps %ymm0, %ymm2, %k1
; CHECK-NEXT:    vblendmps %ymm2, %ymm1, %ymm2 {%k1}
; CHECK-NEXT:    vmovups %ymm2, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $32, %rax
; CHECK-NEXT:    jne .LBB121_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds float, ptr %arg, i64 %tmp
  %tmp4 = load <8 x float>, ptr %tmp2, align 4
  %tmp5 = fcmp olt <8 x float> %tmp4, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
  %tmp6 = select <8 x i1> %tmp5, <8 x float> %tmp4, <8 x float> <float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00>
  store <8 x float> %tmp6, ptr %tmp2, align 4
  %tmp8 = add i64 %tmp, 8
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}

define void @bcast_unfold_cmp_v16f32(ptr %arg) {
; CHECK-LABEL: bcast_unfold_cmp_v16f32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    vbroadcastss {{.*#+}} zmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
; CHECK-NEXT:    vbroadcastss {{.*#+}} zmm1 = [3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB122_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovups 4096(%rdi,%rax), %zmm2
; CHECK-NEXT:    vcmpltps %zmm0, %zmm2, %k1
; CHECK-NEXT:    vblendmps %zmm2, %zmm1, %zmm2 {%k1}
; CHECK-NEXT:    vmovups %zmm2, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $64, %rax
; CHECK-NEXT:    jne .LBB122_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds float, ptr %arg, i64 %tmp
  %tmp4 = load <16 x float>, ptr %tmp2, align 4
  %tmp5 = fcmp olt <16 x float> %tmp4, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
  %tmp6 = select <16 x i1> %tmp5, <16 x float> %tmp4, <16 x float> <float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00>
  store <16 x float> %tmp6, ptr %tmp2, align 4
  %tmp8 = add i64 %tmp, 16
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}

define void @bcast_unfold_cmp_v2f64(ptr %arg) {
; CHECK-LABEL: bcast_unfold_cmp_v2f64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT:    vmovddup {{.*#+}} xmm0 = [2.0E+0,2.0E+0]
; CHECK-NEXT:    # xmm0 = mem[0,0]
; CHECK-NEXT:    vmovddup {{.*#+}} xmm1 = [3.0E+0,3.0E+0]
; CHECK-NEXT:    # xmm1 = mem[0,0]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB123_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovupd 8192(%rdi,%rax), %xmm2
; CHECK-NEXT:    vcmpltpd %xmm0, %xmm2, %k1
; CHECK-NEXT:    vblendmpd %xmm2, %xmm1, %xmm2 {%k1}
; CHECK-NEXT:    vmovupd %xmm2, 8192(%rdi,%rax)
; CHECK-NEXT:    addq $16, %rax
; CHECK-NEXT:    jne .LBB123_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds double, ptr %arg, i64 %tmp
  %tmp4 = load <2 x double>, ptr %tmp2, align 8
  %tmp5 = fcmp olt <2 x double> %tmp4, <double 2.000000e+00, double 2.000000e+00>
  %tmp6 = select <2 x i1> %tmp5, <2 x double> %tmp4, <2 x double> <double 3.000000e+00, double 3.000000e+00>
  store <2 x double> %tmp6, ptr %tmp2, align 8
  %tmp8 = add i64 %tmp, 2
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}

define void @bcast_unfold_cmp_v4f64(ptr %arg) {
; CHECK-LABEL: bcast_unfold_cmp_v4f64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT:    vbroadcastsd {{.*#+}} ymm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0]
; CHECK-NEXT:    vbroadcastsd {{.*#+}} ymm1 = [3.0E+0,3.0E+0,3.0E+0,3.0E+0]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB124_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovupd 8192(%rdi,%rax), %ymm2
; CHECK-NEXT:    vcmpltpd %ymm0, %ymm2, %k1
; CHECK-NEXT:    vblendmpd %ymm2, %ymm1, %ymm2 {%k1}
; CHECK-NEXT:    vmovupd %ymm2, 8192(%rdi,%rax)
; CHECK-NEXT:    addq $32, %rax
; CHECK-NEXT:    jne .LBB124_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds double, ptr %arg, i64 %tmp
  %tmp4 = load <4 x double>, ptr %tmp2, align 8
  %tmp5 = fcmp olt <4 x double> %tmp4, <double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00>
  %tmp6 = select <4 x i1> %tmp5, <4 x double> %tmp4, <4 x double> <double 3.000000e+00, double 3.000000e+00, double 3.000000e+00, double 3.000000e+00>
  store <4 x double> %tmp6, ptr %tmp2, align 8
  %tmp8 = add i64 %tmp, 4
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}

define void @bcast_unfold_cmp_v8f64(ptr %arg) {
; CHECK-LABEL: bcast_unfold_cmp_v8f64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT:    vbroadcastsd {{.*#+}} zmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
; CHECK-NEXT:    vbroadcastsd {{.*#+}} zmm1 = [3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB125_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovupd 8192(%rdi,%rax), %zmm2
; CHECK-NEXT:    vcmpltpd %zmm0, %zmm2, %k1
; CHECK-NEXT:    vblendmpd %zmm2, %zmm1, %zmm2 {%k1}
; CHECK-NEXT:    vmovupd %zmm2, 8192(%rdi,%rax)
; CHECK-NEXT:    addq $64, %rax
; CHECK-NEXT:    jne .LBB125_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds double, ptr %arg, i64 %tmp
  %tmp4 = load <8 x double>, ptr %tmp2, align 8
  %tmp5 = fcmp olt <8 x double> %tmp4, <double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00>
  %tmp6 = select <8 x i1> %tmp5, <8 x double> %tmp4, <8 x double> <double 3.000000e+00, double 3.000000e+00, double 3.000000e+00, double 3.000000e+00, double 3.000000e+00, double 3.000000e+00, double 3.000000e+00, double 3.000000e+00>
  store <8 x double> %tmp6, ptr %tmp2, align 8
  %tmp8 = add i64 %tmp, 8
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}

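; Both select operands are splat constants here, so one broadcast (the 4.0 splat) can stay
; folded into the vblendmps memory operand as a {1to8} load while the compare constant is
; still hoisted into a register.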
define void @bcast_unfold_cmp_v8f32_refold(ptr nocapture %0) {
; CHECK-LABEL: bcast_unfold_cmp_v8f32_refold:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    vbroadcastss {{.*#+}} ymm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
; CHECK-NEXT:    vbroadcastss {{.*#+}} ymm1 = [3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB126_1: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vcmpgtps 4096(%rdi,%rax), %ymm0, %k1
; CHECK-NEXT:    vblendmps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm2 {%k1}
; CHECK-NEXT:    vmovups %ymm2, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $32, %rax
; CHECK-NEXT:    jne .LBB126_1
; CHECK-NEXT:  # %bb.2:
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  br label %2

2:                                                ; preds = %2, %1
  %3 = phi i64 [ 0, %1 ], [ %8, %2 ]
  %4 = getelementptr inbounds float, ptr %0, i64 %3
  %5 = load <8 x float>, ptr %4, align 4
  %6 = fcmp olt <8 x float> %5, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
  %7 = select <8 x i1> %6, <8 x float> <float 4.000000e+00, float 4.000000e+00, float 4.000000e+00, float 4.000000e+00, float 4.000000e+00, float 4.000000e+00, float 4.000000e+00, float 4.000000e+00>, <8 x float> <float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00>
  store <8 x float> %7, ptr %4, align 4
  %8 = add i64 %3, 8
  %9 = icmp eq i64 %8, 1024
  br i1 %9, label %10, label %2

10:                                               ; preds = %2
  ret void
}

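; Test-under-mask patterns: and with a splat of 2 followed by a compare against zero
; should select vptestm{d,q}/vptestnm{d,q} with the splat kept in a register.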
define void @bcast_unfold_ptestm_v4i32(ptr %arg) {
; CHECK-LABEL: bcast_unfold_ptestm_v4i32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    vpbroadcastd {{.*#+}} xmm0 = [2,2,2,2]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB127_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovdqu 4096(%rdi,%rax), %xmm1
; CHECK-NEXT:    vptestmd %xmm0, %xmm1, %k1
; CHECK-NEXT:    vpbroadcastd {{.*#+}} xmm1 {%k1} = [3,3,3,3]
; CHECK-NEXT:    vmovdqu %xmm1, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $16, %rax
; CHECK-NEXT:    jne .LBB127_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds i32, ptr %arg, i64 %tmp
  %tmp4 = load <4 x i32>, ptr %tmp2, align 4
  %tmp4b = and <4 x i32> %tmp4, <i32 2, i32 2, i32 2, i32 2>
  %tmp5 = icmp ne <4 x i32> %tmp4b, zeroinitializer
  %tmp6 = select <4 x i1> %tmp5, <4 x i32> <i32 3, i32 3, i32 3, i32 3>, <4 x i32> %tmp4
  store <4 x i32> %tmp6, ptr %tmp2, align 4
  %tmp8 = add i64 %tmp, 4
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}

define void @bcast_unfold_ptestnm_v4i32(ptr %arg) {
; CHECK-LABEL: bcast_unfold_ptestnm_v4i32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    vpbroadcastd {{.*#+}} xmm0 = [2,2,2,2]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB128_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovdqu 4096(%rdi,%rax), %xmm1
; CHECK-NEXT:    vptestnmd %xmm0, %xmm1, %k1
; CHECK-NEXT:    vpbroadcastd {{.*#+}} xmm1 {%k1} = [3,3,3,3]
; CHECK-NEXT:    vmovdqu %xmm1, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $16, %rax
; CHECK-NEXT:    jne .LBB128_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds i32, ptr %arg, i64 %tmp
  %tmp4 = load <4 x i32>, ptr %tmp2, align 4
  %tmp4b = and <4 x i32> %tmp4, <i32 2, i32 2, i32 2, i32 2>
  %tmp5 = icmp eq <4 x i32> %tmp4b, zeroinitializer
  %tmp6 = select <4 x i1> %tmp5, <4 x i32> <i32 3, i32 3, i32 3, i32 3>, <4 x i32> %tmp4
  store <4 x i32> %tmp6, ptr %tmp2, align 4
  %tmp8 = add i64 %tmp, 4
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}

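; Same pattern with <4 x i64> elements, exercising VPTESTMQ/VPTESTNMQ on ymm
; registers with the constant again kept in a register.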
define void @bcast_unfold_ptestm_v4i64(ptr %arg) {
; CHECK-LABEL: bcast_unfold_ptestm_v4i64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT:    vpbroadcastq {{.*#+}} ymm0 = [2,2,2,2]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB129_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovdqu 8192(%rdi,%rax), %ymm1
; CHECK-NEXT:    vptestmq %ymm0, %ymm1, %k1
; CHECK-NEXT:    vpbroadcastq {{.*#+}} ymm1 {%k1} = [3,3,3,3]
; CHECK-NEXT:    vmovdqu %ymm1, 8192(%rdi,%rax)
; CHECK-NEXT:    addq $32, %rax
; CHECK-NEXT:    jne .LBB129_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds i64, ptr %arg, i64 %tmp
  %tmp4 = load <4 x i64>, ptr %tmp2, align 8
  %tmp4b = and <4 x i64> %tmp4, <i64 2, i64 2, i64 2, i64 2>
  %tmp5 = icmp ne <4 x i64> %tmp4b, zeroinitializer
  %tmp6 = select <4 x i1> %tmp5, <4 x i64> <i64 3, i64 3, i64 3, i64 3>, <4 x i64> %tmp4
  store <4 x i64> %tmp6, ptr %tmp2, align 8
  %tmp8 = add i64 %tmp, 4
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}

define void @bcast_unfold_ptestnm_v4i64(ptr %arg) {
; CHECK-LABEL: bcast_unfold_ptestnm_v4i64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT:    vpbroadcastq {{.*#+}} ymm0 = [2,2,2,2]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB130_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovdqu 8192(%rdi,%rax), %ymm1
; CHECK-NEXT:    vptestnmq %ymm0, %ymm1, %k1
; CHECK-NEXT:    vpbroadcastq {{.*#+}} ymm1 {%k1} = [3,3,3,3]
; CHECK-NEXT:    vmovdqu %ymm1, 8192(%rdi,%rax)
; CHECK-NEXT:    addq $32, %rax
; CHECK-NEXT:    jne .LBB130_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds i64, ptr %arg, i64 %tmp
  %tmp4 = load <4 x i64>, ptr %tmp2, align 8
  %tmp4b = and <4 x i64> %tmp4, <i64 2, i64 2, i64 2, i64 2>
  %tmp5 = icmp eq <4 x i64> %tmp4b, zeroinitializer
  %tmp6 = select <4 x i1> %tmp5, <4 x i64> <i64 3, i64 3, i64 3, i64 3>, <4 x i64> %tmp4
  store <4 x i64> %tmp6, ptr %tmp2, align 8
  %tmp8 = add i64 %tmp, 4
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}

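; The vpternlog test checks that the bit-select pattern
; (x & 32767) | (y & -32768) is matched to VPTERNLOGD with the 32767 splat
; unfolded into a register instead of folded as a broadcast memory operand.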
define void @bcast_unfold_vpternlog_v16i32(ptr %arg, ptr %arg1) {
; CHECK-LABEL: bcast_unfold_vpternlog_v16i32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    vpbroadcastd {{.*#+}} zmm0 = [32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB131_1: # %bb2
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovdqu64 4096(%rdi,%rax), %zmm1
; CHECK-NEXT:    vmovdqu64 4096(%rsi,%rax), %zmm2
; CHECK-NEXT:    vpmulld %zmm2, %zmm1, %zmm3
; CHECK-NEXT:    vpternlogd $216, %zmm0, %zmm1, %zmm2
; CHECK-NEXT:    vpmulld %zmm3, %zmm2, %zmm1
; CHECK-NEXT:    vmovdqu64 %zmm1, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $64, %rax
; CHECK-NEXT:    jne .LBB131_1
; CHECK-NEXT:  # %bb.2: # %bb20
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb2

bb2:                                              ; preds = %bb2, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp18, %bb2 ]
  %tmp3 = getelementptr inbounds i32, ptr %arg, i64 %tmp
  %tmp5 = load <16 x i32>, ptr %tmp3, align 4
  %tmp6 = getelementptr inbounds i32, ptr %arg1, i64 %tmp
  %tmp11 = load <16 x i32>, ptr %tmp6, align 4
  %tmp12 = and <16 x i32> %tmp5, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
  %tmp13 = and <16 x i32> %tmp11, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
  %tmp14 = or <16 x i32> %tmp12, %tmp13
  %tmp15 = mul <16 x i32> %tmp14, %tmp5
  %tmp16 = mul <16 x i32> %tmp15, %tmp11
  store <16 x i32> %tmp16, ptr %tmp3, align 4
  %tmp18 = add i64 %tmp, 16
  %tmp19 = icmp eq i64 %tmp18, 1024
  br i1 %tmp19, label %bb20, label %bb2

bb20:                                             ; preds = %bb2
  ret void
}

attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }