; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -passes=loop-vectorize -mcpu=corei7-avx -S | FileCheck %s -check-prefix=AVX -check-prefix=AVX1
; RUN: opt < %s -passes=loop-vectorize -mcpu=core-avx2 -S | FileCheck %s -check-prefix=AVX -check-prefix=AVX2
; RUN: opt < %s -passes=loop-vectorize -mcpu=knl -S | FileCheck %s -check-prefix=AVX512
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-pc_linux"
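
; Summary of the expected vectorization, as derived from the CHECK lines
; below: AVX1 vectorizes the loops with VF=8 and no interleaving, AVX2 keeps
; VF=8 but interleaves by 4 (32 elements per vector iteration), and AVX512
; uses VF=16 interleaved by 4 (64 elements per iteration) plus a VF=8 vector
; epilogue loop.
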
; The source code:
;
;void foo1(int *A, int *B, int *trigger) {
;
;  for (int i=0; i<10000; i++) {
;    if (trigger[i] < 100) {
;      A[i] = B[i] + trigger[i];
;    }
;  }
;}
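;
; Because both the load of B[i] and the store to A[i] are guarded by the
; trigger comparison, the vectorizer must emit them as @llvm.masked.load /
; @llvm.masked.store calls, using the <VF x i1> compare result as the mask.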
define void @foo1(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture readonly %trigger) local_unnamed_addr #0 {
; AVX1-LABEL: @foo1(
; AVX1-NEXT: entry:
; AVX1-NEXT: [[B3:%.*]] = ptrtoint ptr [[B:%.*]] to i64
; AVX1-NEXT: [[TRIGGER2:%.*]] = ptrtoint ptr [[TRIGGER:%.*]] to i64
; AVX1-NEXT: [[A1:%.*]] = ptrtoint ptr [[A:%.*]] to i64
; AVX1-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
; AVX1: vector.memcheck:
; AVX1-NEXT: [[TMP0:%.*]] = sub i64 [[A1]], [[TRIGGER2]]
; AVX1-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP0]], 32
; AVX1-NEXT: [[TMP1:%.*]] = sub i64 [[A1]], [[B3]]
; AVX1-NEXT: [[DIFF_CHECK4:%.*]] = icmp ult i64 [[TMP1]], 32
; AVX1-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[DIFF_CHECK]], [[DIFF_CHECK4]]
; AVX1-NEXT: br i1 [[CONFLICT_RDX]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
; AVX1: vector.ph:
; AVX1-NEXT: br label [[VECTOR_BODY:%.*]]
; AVX1: vector.body:
; AVX1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; AVX1-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0
; AVX1-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[TMP2]]
; AVX1-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 0
; AVX1-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, ptr [[TMP4]], align 4
; AVX1-NEXT: [[TMP5:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
; AVX1-NEXT: [[TMP6:%.*]] = getelementptr i32, ptr [[B]], i64 [[TMP2]]
; AVX1-NEXT: [[TMP7:%.*]] = getelementptr i32, ptr [[TMP6]], i32 0
; AVX1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr [[TMP7]], i32 4, <8 x i1> [[TMP5]], <8 x i32> poison)
; AVX1-NEXT: [[TMP8:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]]
; AVX1-NEXT: [[TMP9:%.*]] = getelementptr i32, ptr [[A]], i64 [[TMP2]]
; AVX1-NEXT: [[TMP10:%.*]] = getelementptr i32, ptr [[TMP9]], i32 0
; AVX1-NEXT: call void @llvm.masked.store.v8i32.p0(<8 x i32> [[TMP8]], ptr [[TMP10]], i32 4, <8 x i1> [[TMP5]])
; AVX1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; AVX1-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000
; AVX1-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; AVX1: middle.block:
; AVX1-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
; AVX1: scalar.ph:
; AVX1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 10000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
; AVX1-NEXT: br label [[FOR_BODY:%.*]]
; AVX1: for.body:
; AVX1-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
; AVX1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[INDVARS_IV]]
; AVX1-NEXT: [[TMP12:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
; AVX1-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP12]], 100
; AVX1-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
; AVX1: if.then:
; AVX1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDVARS_IV]]
; AVX1-NEXT: [[TMP13:%.*]] = load i32, ptr [[ARRAYIDX3]], align 4
; AVX1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], [[TMP12]]
; AVX1-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]]
; AVX1-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX7]], align 4
; AVX1-NEXT: br label [[FOR_INC]]
; AVX1: for.inc:
; AVX1-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; AVX1-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 10000
; AVX1-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
; AVX1: for.end:
; AVX1-NEXT: ret void
;
; AVX2-LABEL: @foo1(
; AVX2-NEXT: entry:
; AVX2-NEXT: [[B3:%.*]] = ptrtoint ptr [[B:%.*]] to i64
; AVX2-NEXT: [[TRIGGER2:%.*]] = ptrtoint ptr [[TRIGGER:%.*]] to i64
; AVX2-NEXT: [[A1:%.*]] = ptrtoint ptr [[A:%.*]] to i64
; AVX2-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
; AVX2: vector.memcheck:
; AVX2-NEXT: [[TMP0:%.*]] = sub i64 [[A1]], [[TRIGGER2]]
; AVX2-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP0]], 128
; AVX2-NEXT: [[TMP1:%.*]] = sub i64 [[A1]], [[B3]]
; AVX2-NEXT: [[DIFF_CHECK4:%.*]] = icmp ult i64 [[TMP1]], 128
; AVX2-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[DIFF_CHECK]], [[DIFF_CHECK4]]
; AVX2-NEXT: br i1 [[CONFLICT_RDX]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
; AVX2: vector.ph:
; AVX2-NEXT: br label [[VECTOR_BODY:%.*]]
; AVX2: vector.body:
; AVX2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; AVX2-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0
; AVX2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[TMP2]]
; AVX2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 0
; AVX2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 8
; AVX2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 16
; AVX2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 24
; AVX2-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, ptr [[TMP4]], align 4
; AVX2-NEXT: [[WIDE_LOAD5:%.*]] = load <8 x i32>, ptr [[TMP5]], align 4
; AVX2-NEXT: [[WIDE_LOAD6:%.*]] = load <8 x i32>, ptr [[TMP6]], align 4
; AVX2-NEXT: [[WIDE_LOAD7:%.*]] = load <8 x i32>, ptr [[TMP7]], align 4
; AVX2-NEXT: [[TMP8:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
; AVX2-NEXT: [[TMP9:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD5]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
; AVX2-NEXT: [[TMP10:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD6]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
; AVX2-NEXT: [[TMP11:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD7]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
; AVX2-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[B]], i64 [[TMP2]]
; AVX2-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr [[TMP12]], i32 0
; AVX2-NEXT: [[TMP14:%.*]] = getelementptr i32, ptr [[TMP12]], i32 8
; AVX2-NEXT: [[TMP15:%.*]] = getelementptr i32, ptr [[TMP12]], i32 16
; AVX2-NEXT: [[TMP16:%.*]] = getelementptr i32, ptr [[TMP12]], i32 24
; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr [[TMP13]], i32 4, <8 x i1> [[TMP8]], <8 x i32> poison)
; AVX2-NEXT: [[WIDE_MASKED_LOAD8:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr [[TMP14]], i32 4, <8 x i1> [[TMP9]], <8 x i32> poison)
; AVX2-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr [[TMP15]], i32 4, <8 x i1> [[TMP10]], <8 x i32> poison)
; AVX2-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr [[TMP16]], i32 4, <8 x i1> [[TMP11]], <8 x i32> poison)
; AVX2-NEXT: [[TMP17:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]]
; AVX2-NEXT: [[TMP18:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD8]], [[WIDE_LOAD5]]
; AVX2-NEXT: [[TMP19:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD9]], [[WIDE_LOAD6]]
; AVX2-NEXT: [[TMP20:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD10]], [[WIDE_LOAD7]]
; AVX2-NEXT: [[TMP21:%.*]] = getelementptr i32, ptr [[A]], i64 [[TMP2]]
; AVX2-NEXT: [[TMP22:%.*]] = getelementptr i32, ptr [[TMP21]], i32 0
; AVX2-NEXT: [[TMP23:%.*]] = getelementptr i32, ptr [[TMP21]], i32 8
; AVX2-NEXT: [[TMP24:%.*]] = getelementptr i32, ptr [[TMP21]], i32 16
; AVX2-NEXT: [[TMP25:%.*]] = getelementptr i32, ptr [[TMP21]], i32 24
; AVX2-NEXT: call void @llvm.masked.store.v8i32.p0(<8 x i32> [[TMP17]], ptr [[TMP22]], i32 4, <8 x i1> [[TMP8]])
; AVX2-NEXT: call void @llvm.masked.store.v8i32.p0(<8 x i32> [[TMP18]], ptr [[TMP23]], i32 4, <8 x i1> [[TMP9]])
; AVX2-NEXT: call void @llvm.masked.store.v8i32.p0(<8 x i32> [[TMP19]], ptr [[TMP24]], i32 4, <8 x i1> [[TMP10]])
; AVX2-NEXT: call void @llvm.masked.store.v8i32.p0(<8 x i32> [[TMP20]], ptr [[TMP25]], i32 4, <8 x i1> [[TMP11]])
; AVX2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
; AVX2-NEXT: [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984
; AVX2-NEXT: br i1 [[TMP26]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; AVX2: middle.block:
; AVX2-NEXT: br i1 false, label [[FOR_END:%.*]], label [[SCALAR_PH]]
; AVX2: scalar.ph:
; AVX2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 9984, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
; AVX2-NEXT: br label [[FOR_BODY:%.*]]
; AVX2: for.body:
; AVX2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
; AVX2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[INDVARS_IV]]
; AVX2-NEXT: [[TMP27:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
; AVX2-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP27]], 100
; AVX2-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
; AVX2: if.then:
; AVX2-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDVARS_IV]]
; AVX2-NEXT: [[TMP28:%.*]] = load i32, ptr [[ARRAYIDX3]], align 4
; AVX2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP28]], [[TMP27]]
; AVX2-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]]
; AVX2-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX7]], align 4
; AVX2-NEXT: br label [[FOR_INC]]
; AVX2: for.inc:
; AVX2-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; AVX2-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 10000
; AVX2-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
; AVX2: for.end:
; AVX2-NEXT: ret void
;
; AVX512-LABEL: @foo1(
; AVX512-NEXT: iter.check:
; AVX512-NEXT: [[B3:%.*]] = ptrtoint ptr [[B:%.*]] to i64
; AVX512-NEXT: [[TRIGGER2:%.*]] = ptrtoint ptr [[TRIGGER:%.*]] to i64
; AVX512-NEXT: [[A1:%.*]] = ptrtoint ptr [[A:%.*]] to i64
; AVX512-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
; AVX512: vector.memcheck:
; AVX512-NEXT: [[TMP0:%.*]] = sub i64 [[A1]], [[TRIGGER2]]
; AVX512-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP0]], 256
; AVX512-NEXT: [[TMP1:%.*]] = sub i64 [[A1]], [[B3]]
; AVX512-NEXT: [[DIFF_CHECK4:%.*]] = icmp ult i64 [[TMP1]], 256
; AVX512-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[DIFF_CHECK]], [[DIFF_CHECK4]]
; AVX512-NEXT: br i1 [[CONFLICT_RDX]], label [[VEC_EPILOG_SCALAR_PH]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
; AVX512: vector.main.loop.iter.check:
; AVX512-NEXT: br i1 false, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
; AVX512: vector.ph:
; AVX512-NEXT: br label [[VECTOR_BODY:%.*]]
; AVX512: vector.body:
; AVX512-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; AVX512-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0
; AVX512-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[TMP2]]
; AVX512-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 0
; AVX512-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 16
; AVX512-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 32
; AVX512-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 48
; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i32>, ptr [[TMP4]], align 4
; AVX512-NEXT: [[WIDE_LOAD5:%.*]] = load <16 x i32>, ptr [[TMP5]], align 4
; AVX512-NEXT: [[WIDE_LOAD6:%.*]] = load <16 x i32>, ptr [[TMP6]], align 4
; AVX512-NEXT: [[WIDE_LOAD7:%.*]] = load <16 x i32>, ptr [[TMP7]], align 4
; AVX512-NEXT: [[TMP8:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
; AVX512-NEXT: [[TMP9:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD5]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
; AVX512-NEXT: [[TMP10:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD6]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
; AVX512-NEXT: [[TMP11:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD7]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
; AVX512-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[B]], i64 [[TMP2]]
; AVX512-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr [[TMP12]], i32 0
; AVX512-NEXT: [[TMP14:%.*]] = getelementptr i32, ptr [[TMP12]], i32 16
; AVX512-NEXT: [[TMP15:%.*]] = getelementptr i32, ptr [[TMP12]], i32 32
; AVX512-NEXT: [[TMP16:%.*]] = getelementptr i32, ptr [[TMP12]], i32 48
; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[TMP13]], i32 4, <16 x i1> [[TMP8]], <16 x i32> poison)
; AVX512-NEXT: [[WIDE_MASKED_LOAD8:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[TMP14]], i32 4, <16 x i1> [[TMP9]], <16 x i32> poison)
; AVX512-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[TMP15]], i32 4, <16 x i1> [[TMP10]], <16 x i32> poison)
; AVX512-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[TMP16]], i32 4, <16 x i1> [[TMP11]], <16 x i32> poison)
; AVX512-NEXT: [[TMP17:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]]
; AVX512-NEXT: [[TMP18:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD8]], [[WIDE_LOAD5]]
; AVX512-NEXT: [[TMP19:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD9]], [[WIDE_LOAD6]]
; AVX512-NEXT: [[TMP20:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD10]], [[WIDE_LOAD7]]
; AVX512-NEXT: [[TMP21:%.*]] = getelementptr i32, ptr [[A]], i64 [[TMP2]]
; AVX512-NEXT: [[TMP22:%.*]] = getelementptr i32, ptr [[TMP21]], i32 0
; AVX512-NEXT: [[TMP23:%.*]] = getelementptr i32, ptr [[TMP21]], i32 16
; AVX512-NEXT: [[TMP24:%.*]] = getelementptr i32, ptr [[TMP21]], i32 32
; AVX512-NEXT: [[TMP25:%.*]] = getelementptr i32, ptr [[TMP21]], i32 48
; AVX512-NEXT: call void @llvm.masked.store.v16i32.p0(<16 x i32> [[TMP17]], ptr [[TMP22]], i32 4, <16 x i1> [[TMP8]])
; AVX512-NEXT: call void @llvm.masked.store.v16i32.p0(<16 x i32> [[TMP18]], ptr [[TMP23]], i32 4, <16 x i1> [[TMP9]])
; AVX512-NEXT: call void @llvm.masked.store.v16i32.p0(<16 x i32> [[TMP19]], ptr [[TMP24]], i32 4, <16 x i1> [[TMP10]])
; AVX512-NEXT: call void @llvm.masked.store.v16i32.p0(<16 x i32> [[TMP20]], ptr [[TMP25]], i32 4, <16 x i1> [[TMP11]])
; AVX512-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 64
; AVX512-NEXT: [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984
; AVX512-NEXT: br i1 [[TMP26]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; AVX512: middle.block:
; AVX512-NEXT: br i1 false, label [[FOR_END:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
; AVX512: vec.epilog.iter.check:
; AVX512-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
; AVX512: vec.epilog.ph:
; AVX512-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 9984, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
; AVX512-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
; AVX512: vec.epilog.vector.body:
; AVX512-NEXT: [[INDEX11:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT14:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
; AVX512-NEXT: [[TMP27:%.*]] = add i64 [[INDEX11]], 0
; AVX512-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[TMP27]]
; AVX512-NEXT: [[TMP29:%.*]] = getelementptr inbounds i32, ptr [[TMP28]], i32 0
; AVX512-NEXT: [[WIDE_LOAD12:%.*]] = load <8 x i32>, ptr [[TMP29]], align 4
; AVX512-NEXT: [[TMP30:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD12]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
; AVX512-NEXT: [[TMP31:%.*]] = getelementptr i32, ptr [[B]], i64 [[TMP27]]
; AVX512-NEXT: [[TMP32:%.*]] = getelementptr i32, ptr [[TMP31]], i32 0
; AVX512-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr [[TMP32]], i32 4, <8 x i1> [[TMP30]], <8 x i32> poison)
; AVX512-NEXT: [[TMP33:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD13]], [[WIDE_LOAD12]]
; AVX512-NEXT: [[TMP34:%.*]] = getelementptr i32, ptr [[A]], i64 [[TMP27]]
; AVX512-NEXT: [[TMP35:%.*]] = getelementptr i32, ptr [[TMP34]], i32 0
; AVX512-NEXT: call void @llvm.masked.store.v8i32.p0(<8 x i32> [[TMP33]], ptr [[TMP35]], i32 4, <8 x i1> [[TMP30]])
; AVX512-NEXT: [[INDEX_NEXT14]] = add nuw i64 [[INDEX11]], 8
; AVX512-NEXT: [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT14]], 10000
; AVX512-NEXT: br i1 [[TMP36]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
; AVX512: vec.epilog.middle.block:
; AVX512-NEXT: br i1 true, label [[FOR_END]], label [[VEC_EPILOG_SCALAR_PH]]
; AVX512: vec.epilog.scalar.ph:
; AVX512-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 10000, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 9984, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
; AVX512-NEXT: br label [[FOR_BODY:%.*]]
; AVX512: for.body:
; AVX512-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
; AVX512-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[INDVARS_IV]]
; AVX512-NEXT: [[TMP37:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
; AVX512-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP37]], 100
; AVX512-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
; AVX512: if.then:
; AVX512-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDVARS_IV]]
; AVX512-NEXT: [[TMP38:%.*]] = load i32, ptr [[ARRAYIDX3]], align 4
; AVX512-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP38]], [[TMP37]]
; AVX512-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]]
; AVX512-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX7]], align 4
; AVX512-NEXT: br label [[FOR_INC]]
; AVX512: for.inc:
; AVX512-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; AVX512-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 10000
; AVX512-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; AVX512: for.end:
; AVX512-NEXT: ret void
;
entry:
  br label %for.body

for.body:                                         ; preds = %for.inc, %entry
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.inc ]
  %arrayidx = getelementptr inbounds i32, ptr %trigger, i64 %indvars.iv
  %0 = load i32, ptr %arrayidx, align 4
  %cmp1 = icmp slt i32 %0, 100
  br i1 %cmp1, label %if.then, label %for.inc

if.then:                                          ; preds = %for.body
  %arrayidx3 = getelementptr inbounds i32, ptr %B, i64 %indvars.iv
  %1 = load i32, ptr %arrayidx3, align 4
  %add = add nsw i32 %1, %0
  %arrayidx7 = getelementptr inbounds i32, ptr %A, i64 %indvars.iv
  store i32 %add, ptr %arrayidx7, align 4
  br label %for.inc

for.inc:                                          ; preds = %for.body, %if.then
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond = icmp eq i64 %indvars.iv.next, 10000
  br i1 %exitcond, label %for.end, label %for.body

for.end:                                          ; preds = %for.inc
  ret void
}

; The same as @foo1 but all the pointers are address space 1 pointers.
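; The only expected difference is in the intrinsic name mangling: the masked
; load/store intrinsics are mangled with the pointer's address space
; (e.g. @llvm.masked.load.v8i32.p1 instead of @llvm.masked.load.v8i32.p0).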
define void @foo1_addrspace1(ptr addrspace(1) nocapture %A, ptr addrspace(1) nocapture readonly %B, ptr addrspace(1) nocapture readonly %trigger) local_unnamed_addr #0 {
; AVX1-LABEL: @foo1_addrspace1(
; AVX1-NEXT: entry:
; AVX1-NEXT: [[B3:%.*]] = ptrtoint ptr addrspace(1) [[B:%.*]] to i64
; AVX1-NEXT: [[TRIGGER2:%.*]] = ptrtoint ptr addrspace(1) [[TRIGGER:%.*]] to i64
; AVX1-NEXT: [[A1:%.*]] = ptrtoint ptr addrspace(1) [[A:%.*]] to i64
; AVX1-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
; AVX1: vector.memcheck:
; AVX1-NEXT: [[TMP0:%.*]] = sub i64 [[A1]], [[TRIGGER2]]
; AVX1-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP0]], 32
; AVX1-NEXT: [[TMP1:%.*]] = sub i64 [[A1]], [[B3]]
; AVX1-NEXT: [[DIFF_CHECK4:%.*]] = icmp ult i64 [[TMP1]], 32
; AVX1-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[DIFF_CHECK]], [[DIFF_CHECK4]]
; AVX1-NEXT: br i1 [[CONFLICT_RDX]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
; AVX1: vector.ph:
; AVX1-NEXT: br label [[VECTOR_BODY:%.*]]
; AVX1: vector.body:
; AVX1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; AVX1-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0
; AVX1-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[TRIGGER]], i64 [[TMP2]]
; AVX1-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[TMP3]], i32 0
; AVX1-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, ptr addrspace(1) [[TMP4]], align 4
; AVX1-NEXT: [[TMP5:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
; AVX1-NEXT: [[TMP6:%.*]] = getelementptr i32, ptr addrspace(1) [[B]], i64 [[TMP2]]
; AVX1-NEXT: [[TMP7:%.*]] = getelementptr i32, ptr addrspace(1) [[TMP6]], i32 0
; AVX1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1(ptr addrspace(1) [[TMP7]], i32 4, <8 x i1> [[TMP5]], <8 x i32> poison)
; AVX1-NEXT: [[TMP8:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]]
; AVX1-NEXT: [[TMP9:%.*]] = getelementptr i32, ptr addrspace(1) [[A]], i64 [[TMP2]]
; AVX1-NEXT: [[TMP10:%.*]] = getelementptr i32, ptr addrspace(1) [[TMP9]], i32 0
; AVX1-NEXT: call void @llvm.masked.store.v8i32.p1(<8 x i32> [[TMP8]], ptr addrspace(1) [[TMP10]], i32 4, <8 x i1> [[TMP5]])
; AVX1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; AVX1-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000
; AVX1-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; AVX1: middle.block:
; AVX1-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
; AVX1: scalar.ph:
; AVX1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 10000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
; AVX1-NEXT: br label [[FOR_BODY:%.*]]
; AVX1: for.body:
; AVX1-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
; AVX1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[TRIGGER]], i64 [[INDVARS_IV]]
; AVX1-NEXT: [[TMP12:%.*]] = load i32, ptr addrspace(1) [[ARRAYIDX]], align 4
; AVX1-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP12]], 100
; AVX1-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
; AVX1: if.then:
; AVX1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[B]], i64 [[INDVARS_IV]]
; AVX1-NEXT: [[TMP13:%.*]] = load i32, ptr addrspace(1) [[ARRAYIDX3]], align 4
; AVX1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], [[TMP12]]
; AVX1-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[A]], i64 [[INDVARS_IV]]
; AVX1-NEXT: store i32 [[ADD]], ptr addrspace(1) [[ARRAYIDX7]], align 4
; AVX1-NEXT: br label [[FOR_INC]]
; AVX1: for.inc:
; AVX1-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; AVX1-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 10000
; AVX1-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
; AVX1: for.end:
; AVX1-NEXT: ret void
;
; AVX2-LABEL: @foo1_addrspace1(
; AVX2-NEXT: entry:
; AVX2-NEXT: [[B3:%.*]] = ptrtoint ptr addrspace(1) [[B:%.*]] to i64
; AVX2-NEXT: [[TRIGGER2:%.*]] = ptrtoint ptr addrspace(1) [[TRIGGER:%.*]] to i64
; AVX2-NEXT: [[A1:%.*]] = ptrtoint ptr addrspace(1) [[A:%.*]] to i64
; AVX2-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
; AVX2: vector.memcheck:
; AVX2-NEXT: [[TMP0:%.*]] = sub i64 [[A1]], [[TRIGGER2]]
; AVX2-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP0]], 128
; AVX2-NEXT: [[TMP1:%.*]] = sub i64 [[A1]], [[B3]]
; AVX2-NEXT: [[DIFF_CHECK4:%.*]] = icmp ult i64 [[TMP1]], 128
; AVX2-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[DIFF_CHECK]], [[DIFF_CHECK4]]
; AVX2-NEXT: br i1 [[CONFLICT_RDX]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
; AVX2: vector.ph:
; AVX2-NEXT: br label [[VECTOR_BODY:%.*]]
; AVX2: vector.body:
; AVX2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; AVX2-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0
; AVX2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[TRIGGER]], i64 [[TMP2]]
; AVX2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[TMP3]], i32 0
; AVX2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[TMP3]], i32 8
; AVX2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[TMP3]], i32 16
; AVX2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[TMP3]], i32 24
; AVX2-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, ptr addrspace(1) [[TMP4]], align 4
; AVX2-NEXT: [[WIDE_LOAD5:%.*]] = load <8 x i32>, ptr addrspace(1) [[TMP5]], align 4
; AVX2-NEXT: [[WIDE_LOAD6:%.*]] = load <8 x i32>, ptr addrspace(1) [[TMP6]], align 4
; AVX2-NEXT: [[WIDE_LOAD7:%.*]] = load <8 x i32>, ptr addrspace(1) [[TMP7]], align 4
; AVX2-NEXT: [[TMP8:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
; AVX2-NEXT: [[TMP9:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD5]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
; AVX2-NEXT: [[TMP10:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD6]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
; AVX2-NEXT: [[TMP11:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD7]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
; AVX2-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr addrspace(1) [[B]], i64 [[TMP2]]
; AVX2-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr addrspace(1) [[TMP12]], i32 0
; AVX2-NEXT: [[TMP14:%.*]] = getelementptr i32, ptr addrspace(1) [[TMP12]], i32 8
; AVX2-NEXT: [[TMP15:%.*]] = getelementptr i32, ptr addrspace(1) [[TMP12]], i32 16
; AVX2-NEXT: [[TMP16:%.*]] = getelementptr i32, ptr addrspace(1) [[TMP12]], i32 24
; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1(ptr addrspace(1) [[TMP13]], i32 4, <8 x i1> [[TMP8]], <8 x i32> poison)
; AVX2-NEXT: [[WIDE_MASKED_LOAD8:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1(ptr addrspace(1) [[TMP14]], i32 4, <8 x i1> [[TMP9]], <8 x i32> poison)
; AVX2-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1(ptr addrspace(1) [[TMP15]], i32 4, <8 x i1> [[TMP10]], <8 x i32> poison)
; AVX2-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1(ptr addrspace(1) [[TMP16]], i32 4, <8 x i1> [[TMP11]], <8 x i32> poison)
; AVX2-NEXT: [[TMP17:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]]
; AVX2-NEXT: [[TMP18:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD8]], [[WIDE_LOAD5]]
; AVX2-NEXT: [[TMP19:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD9]], [[WIDE_LOAD6]]
; AVX2-NEXT: [[TMP20:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD10]], [[WIDE_LOAD7]]
; AVX2-NEXT: [[TMP21:%.*]] = getelementptr i32, ptr addrspace(1) [[A]], i64 [[TMP2]]
; AVX2-NEXT: [[TMP22:%.*]] = getelementptr i32, ptr addrspace(1) [[TMP21]], i32 0
; AVX2-NEXT: [[TMP23:%.*]] = getelementptr i32, ptr addrspace(1) [[TMP21]], i32 8
; AVX2-NEXT: [[TMP24:%.*]] = getelementptr i32, ptr addrspace(1) [[TMP21]], i32 16
; AVX2-NEXT: [[TMP25:%.*]] = getelementptr i32, ptr addrspace(1) [[TMP21]], i32 24
; AVX2-NEXT: call void @llvm.masked.store.v8i32.p1(<8 x i32> [[TMP17]], ptr addrspace(1) [[TMP22]], i32 4, <8 x i1> [[TMP8]])
; AVX2-NEXT: call void @llvm.masked.store.v8i32.p1(<8 x i32> [[TMP18]], ptr addrspace(1) [[TMP23]], i32 4, <8 x i1> [[TMP9]])
; AVX2-NEXT: call void @llvm.masked.store.v8i32.p1(<8 x i32> [[TMP19]], ptr addrspace(1) [[TMP24]], i32 4, <8 x i1> [[TMP10]])
; AVX2-NEXT: call void @llvm.masked.store.v8i32.p1(<8 x i32> [[TMP20]], ptr addrspace(1) [[TMP25]], i32 4, <8 x i1> [[TMP11]])
; AVX2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
; AVX2-NEXT: [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984
; AVX2-NEXT: br i1 [[TMP26]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; AVX2: middle.block:
; AVX2-NEXT: br i1 false, label [[FOR_END:%.*]], label [[SCALAR_PH]]
; AVX2: scalar.ph:
; AVX2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 9984, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
; AVX2-NEXT: br label [[FOR_BODY:%.*]]
; AVX2: for.body:
; AVX2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
; AVX2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[TRIGGER]], i64 [[INDVARS_IV]]
; AVX2-NEXT: [[TMP27:%.*]] = load i32, ptr addrspace(1) [[ARRAYIDX]], align 4
; AVX2-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP27]], 100
; AVX2-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
; AVX2: if.then:
; AVX2-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[B]], i64 [[INDVARS_IV]]
; AVX2-NEXT: [[TMP28:%.*]] = load i32, ptr addrspace(1) [[ARRAYIDX3]], align 4
; AVX2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP28]], [[TMP27]]
; AVX2-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[A]], i64 [[INDVARS_IV]]
; AVX2-NEXT: store i32 [[ADD]], ptr addrspace(1) [[ARRAYIDX7]], align 4
; AVX2-NEXT: br label [[FOR_INC]]
; AVX2: for.inc:
; AVX2-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; AVX2-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 10000
; AVX2-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
; AVX2: for.end:
; AVX2-NEXT: ret void
;
; AVX512-LABEL: @foo1_addrspace1(
; AVX512-NEXT: iter.check:
; AVX512-NEXT: [[B3:%.*]] = ptrtoint ptr addrspace(1) [[B:%.*]] to i64
; AVX512-NEXT: [[TRIGGER2:%.*]] = ptrtoint ptr addrspace(1) [[TRIGGER:%.*]] to i64
; AVX512-NEXT: [[A1:%.*]] = ptrtoint ptr addrspace(1) [[A:%.*]] to i64
; AVX512-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
; AVX512: vector.memcheck:
; AVX512-NEXT: [[TMP0:%.*]] = sub i64 [[A1]], [[TRIGGER2]]
; AVX512-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP0]], 256
; AVX512-NEXT: [[TMP1:%.*]] = sub i64 [[A1]], [[B3]]
; AVX512-NEXT: [[DIFF_CHECK4:%.*]] = icmp ult i64 [[TMP1]], 256
; AVX512-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[DIFF_CHECK]], [[DIFF_CHECK4]]
; AVX512-NEXT: br i1 [[CONFLICT_RDX]], label [[VEC_EPILOG_SCALAR_PH]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
; AVX512: vector.main.loop.iter.check:
; AVX512-NEXT: br i1 false, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
; AVX512: vector.ph:
; AVX512-NEXT: br label [[VECTOR_BODY:%.*]]
; AVX512: vector.body:
; AVX512-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; AVX512-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0
; AVX512-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[TRIGGER]], i64 [[TMP2]]
; AVX512-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[TMP3]], i32 0
; AVX512-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[TMP3]], i32 16
; AVX512-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[TMP3]], i32 32
; AVX512-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[TMP3]], i32 48
; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i32>, ptr addrspace(1) [[TMP4]], align 4
; AVX512-NEXT: [[WIDE_LOAD5:%.*]] = load <16 x i32>, ptr addrspace(1) [[TMP5]], align 4
; AVX512-NEXT: [[WIDE_LOAD6:%.*]] = load <16 x i32>, ptr addrspace(1) [[TMP6]], align 4
; AVX512-NEXT: [[WIDE_LOAD7:%.*]] = load <16 x i32>, ptr addrspace(1) [[TMP7]], align 4
; AVX512-NEXT: [[TMP8:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
; AVX512-NEXT: [[TMP9:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD5]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
; AVX512-NEXT: [[TMP10:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD6]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
; AVX512-NEXT: [[TMP11:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD7]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
; AVX512-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr addrspace(1) [[B]], i64 [[TMP2]]
; AVX512-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr addrspace(1) [[TMP12]], i32 0
; AVX512-NEXT: [[TMP14:%.*]] = getelementptr i32, ptr addrspace(1) [[TMP12]], i32 16
; AVX512-NEXT: [[TMP15:%.*]] = getelementptr i32, ptr addrspace(1) [[TMP12]], i32 32
; AVX512-NEXT: [[TMP16:%.*]] = getelementptr i32, ptr addrspace(1) [[TMP12]], i32 48
; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1(ptr addrspace(1) [[TMP13]], i32 4, <16 x i1> [[TMP8]], <16 x i32> poison)
; AVX512-NEXT: [[WIDE_MASKED_LOAD8:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1(ptr addrspace(1) [[TMP14]], i32 4, <16 x i1> [[TMP9]], <16 x i32> poison)
; AVX512-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1(ptr addrspace(1) [[TMP15]], i32 4, <16 x i1> [[TMP10]], <16 x i32> poison)
; AVX512-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1(ptr addrspace(1) [[TMP16]], i32 4, <16 x i1> [[TMP11]], <16 x i32> poison)
; AVX512-NEXT: [[TMP17:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]]
; AVX512-NEXT: [[TMP18:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD8]], [[WIDE_LOAD5]]
; AVX512-NEXT: [[TMP19:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD9]], [[WIDE_LOAD6]]
; AVX512-NEXT: [[TMP20:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD10]], [[WIDE_LOAD7]]
; AVX512-NEXT: [[TMP21:%.*]] = getelementptr i32, ptr addrspace(1) [[A]], i64 [[TMP2]]
; AVX512-NEXT: [[TMP22:%.*]] = getelementptr i32, ptr addrspace(1) [[TMP21]], i32 0
; AVX512-NEXT: [[TMP23:%.*]] = getelementptr i32, ptr addrspace(1) [[TMP21]], i32 16
; AVX512-NEXT: [[TMP24:%.*]] = getelementptr i32, ptr addrspace(1) [[TMP21]], i32 32
; AVX512-NEXT: [[TMP25:%.*]] = getelementptr i32, ptr addrspace(1) [[TMP21]], i32 48
; AVX512-NEXT: call void @llvm.masked.store.v16i32.p1(<16 x i32> [[TMP17]], ptr addrspace(1) [[TMP22]], i32 4, <16 x i1> [[TMP8]])
; AVX512-NEXT: call void @llvm.masked.store.v16i32.p1(<16 x i32> [[TMP18]], ptr addrspace(1) [[TMP23]], i32 4, <16 x i1> [[TMP9]])
; AVX512-NEXT: call void @llvm.masked.store.v16i32.p1(<16 x i32> [[TMP19]], ptr addrspace(1) [[TMP24]], i32 4, <16 x i1> [[TMP10]])
; AVX512-NEXT: call void @llvm.masked.store.v16i32.p1(<16 x i32> [[TMP20]], ptr addrspace(1) [[TMP25]], i32 4, <16 x i1> [[TMP11]])
; AVX512-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 64
; AVX512-NEXT: [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984
; AVX512-NEXT: br i1 [[TMP26]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
; AVX512: middle.block:
; AVX512-NEXT: br i1 false, label [[FOR_END:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
; AVX512: vec.epilog.iter.check:
; AVX512-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
; AVX512: vec.epilog.ph:
; AVX512-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 9984, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
; AVX512-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
; AVX512: vec.epilog.vector.body:
; AVX512-NEXT: [[INDEX11:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT14:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
; AVX512-NEXT: [[TMP27:%.*]] = add i64 [[INDEX11]], 0
; AVX512-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[TRIGGER]], i64 [[TMP27]]
; AVX512-NEXT: [[TMP29:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[TMP28]], i32 0
; AVX512-NEXT: [[WIDE_LOAD12:%.*]] = load <8 x i32>, ptr addrspace(1) [[TMP29]], align 4
; AVX512-NEXT: [[TMP30:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD12]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
; AVX512-NEXT: [[TMP31:%.*]] = getelementptr i32, ptr addrspace(1) [[B]], i64 [[TMP27]]
; AVX512-NEXT: [[TMP32:%.*]] = getelementptr i32, ptr addrspace(1) [[TMP31]], i32 0
; AVX512-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1(ptr addrspace(1) [[TMP32]], i32 4, <8 x i1> [[TMP30]], <8 x i32> poison)
; AVX512-NEXT: [[TMP33:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD13]], [[WIDE_LOAD12]]
; AVX512-NEXT: [[TMP34:%.*]] = getelementptr i32, ptr addrspace(1) [[A]], i64 [[TMP27]]
; AVX512-NEXT: [[TMP35:%.*]] = getelementptr i32, ptr addrspace(1) [[TMP34]], i32 0
; AVX512-NEXT: call void @llvm.masked.store.v8i32.p1(<8 x i32> [[TMP33]], ptr addrspace(1) [[TMP35]], i32 4, <8 x i1> [[TMP30]])
; AVX512-NEXT: [[INDEX_NEXT14]] = add nuw i64 [[INDEX11]], 8
; AVX512-NEXT: [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT14]], 10000
; AVX512-NEXT: br i1 [[TMP36]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; AVX512: vec.epilog.middle.block:
; AVX512-NEXT: br i1 true, label [[FOR_END]], label [[VEC_EPILOG_SCALAR_PH]]
; AVX512: vec.epilog.scalar.ph:
; AVX512-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 10000, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 9984, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
; AVX512-NEXT: br label [[FOR_BODY:%.*]]
; AVX512: for.body:
; AVX512-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
; AVX512-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[TRIGGER]], i64 [[INDVARS_IV]]
; AVX512-NEXT: [[TMP37:%.*]] = load i32, ptr addrspace(1) [[ARRAYIDX]], align 4
; AVX512-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP37]], 100
; AVX512-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
; AVX512: if.then:
; AVX512-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[B]], i64 [[INDVARS_IV]]
; AVX512-NEXT: [[TMP38:%.*]] = load i32, ptr addrspace(1) [[ARRAYIDX3]], align 4
; AVX512-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP38]], [[TMP37]]
; AVX512-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[A]], i64 [[INDVARS_IV]]
; AVX512-NEXT: store i32 [[ADD]], ptr addrspace(1) [[ARRAYIDX7]], align 4
; AVX512-NEXT: br label [[FOR_INC]]
; AVX512: for.inc:
; AVX512-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; AVX512-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 10000
; AVX512-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
; AVX512: for.end:
; AVX512-NEXT: ret void
;
entry:
  br label %for.body

for.body:                                         ; preds = %for.inc, %entry
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.inc ]
  %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %trigger, i64 %indvars.iv
  %0 = load i32, ptr addrspace(1) %arrayidx, align 4
  %cmp1 = icmp slt i32 %0, 100
  br i1 %cmp1, label %if.then, label %for.inc

if.then:                                          ; preds = %for.body
  %arrayidx3 = getelementptr inbounds i32, ptr addrspace(1) %B, i64 %indvars.iv
  %1 = load i32, ptr addrspace(1) %arrayidx3, align 4
  %add = add nsw i32 %1, %0
  %arrayidx7 = getelementptr inbounds i32, ptr addrspace(1) %A, i64 %indvars.iv
  store i32 %add, ptr addrspace(1) %arrayidx7, align 4
  br label %for.inc

for.inc:                                          ; preds = %for.body, %if.then
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond = icmp eq i64 %indvars.iv.next, 10000
  br i1 %exitcond, label %for.end, label %for.body

for.end:                                          ; preds = %for.inc
  ret void
}

; The source code:
;
;void foo2(float *A, float *B, int *trigger) {
;
;  for (int i=0; i<10000; i++) {
;    if (trigger[i] < 100) {
;      A[i] = B[i] + trigger[i];
;    }
;  }
;}
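;
; Here A[i] and B[i] are floats, so trigger[i] is first converted with
; sitofp and the masked operations use floating-point vector types
; (e.g. @llvm.masked.load.v8f32 / @llvm.masked.store.v8f32).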
define void @foo2(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture readonly %trigger) local_unnamed_addr #0 {
; AVX1-LABEL: @foo2(
; AVX1-NEXT: entry:
; AVX1-NEXT: [[B3:%.*]] = ptrtoint ptr [[B:%.*]] to i64
; AVX1-NEXT: [[TRIGGER2:%.*]] = ptrtoint ptr [[TRIGGER:%.*]] to i64
; AVX1-NEXT: [[A1:%.*]] = ptrtoint ptr [[A:%.*]] to i64
; AVX1-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
; AVX1: vector.memcheck:
; AVX1-NEXT: [[TMP0:%.*]] = sub i64 [[A1]], [[TRIGGER2]]
; AVX1-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP0]], 32
; AVX1-NEXT: [[TMP1:%.*]] = sub i64 [[A1]], [[B3]]
; AVX1-NEXT: [[DIFF_CHECK4:%.*]] = icmp ult i64 [[TMP1]], 32
; AVX1-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[DIFF_CHECK]], [[DIFF_CHECK4]]
; AVX1-NEXT: br i1 [[CONFLICT_RDX]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
; AVX1: vector.ph:
; AVX1-NEXT: br label [[VECTOR_BODY:%.*]]
; AVX1: vector.body:
; AVX1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; AVX1-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0
; AVX1-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[TMP2]]
; AVX1-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 0
; AVX1-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, ptr [[TMP4]], align 4
; AVX1-NEXT: [[TMP5:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
; AVX1-NEXT: [[TMP6:%.*]] = getelementptr float, ptr [[B]], i64 [[TMP2]]
; AVX1-NEXT: [[TMP7:%.*]] = getelementptr float, ptr [[TMP6]], i32 0
; AVX1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0(ptr [[TMP7]], i32 4, <8 x i1> [[TMP5]], <8 x float> poison)
; AVX1-NEXT: [[TMP8:%.*]] = sitofp <8 x i32> [[WIDE_LOAD]] to <8 x float>
; AVX1-NEXT: [[TMP9:%.*]] = fadd <8 x float> [[WIDE_MASKED_LOAD]], [[TMP8]]
; AVX1-NEXT: [[TMP10:%.*]] = getelementptr float, ptr [[A]], i64 [[TMP2]]
; AVX1-NEXT: [[TMP11:%.*]] = getelementptr float, ptr [[TMP10]], i32 0
; AVX1-NEXT: call void @llvm.masked.store.v8f32.p0(<8 x float> [[TMP9]], ptr [[TMP11]], i32 4, <8 x i1> [[TMP5]])
; AVX1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; AVX1-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000
; AVX1-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; AVX1: middle.block:
; AVX1-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
; AVX1: scalar.ph:
; AVX1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 10000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
; AVX1-NEXT: br label [[FOR_BODY:%.*]]
; AVX1: for.body:
; AVX1-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
; AVX1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[INDVARS_IV]]
; AVX1-NEXT: [[TMP13:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
; AVX1-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP13]], 100
; AVX1-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
; AVX1: if.then:
; AVX1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[INDVARS_IV]]
; AVX1-NEXT: [[TMP14:%.*]] = load float, ptr [[ARRAYIDX3]], align 4
; AVX1-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP13]] to float
; AVX1-NEXT: [[ADD:%.*]] = fadd float [[TMP14]], [[CONV]]
; AVX1-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDVARS_IV]]
; AVX1-NEXT: store float [[ADD]], ptr [[ARRAYIDX7]], align 4
; AVX1-NEXT: br label [[FOR_INC]]
; AVX1: for.inc:
; AVX1-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; AVX1-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 10000
; AVX1-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
; AVX1: for.end:
; AVX1-NEXT: ret void
;
; AVX2-LABEL: @foo2(
; AVX2-NEXT: entry:
; AVX2-NEXT: [[B3:%.*]] = ptrtoint ptr [[B:%.*]] to i64
; AVX2-NEXT: [[TRIGGER2:%.*]] = ptrtoint ptr [[TRIGGER:%.*]] to i64
; AVX2-NEXT: [[A1:%.*]] = ptrtoint ptr [[A:%.*]] to i64
; AVX2-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
; AVX2: vector.memcheck:
; AVX2-NEXT: [[TMP0:%.*]] = sub i64 [[A1]], [[TRIGGER2]]
; AVX2-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP0]], 128
; AVX2-NEXT: [[TMP1:%.*]] = sub i64 [[A1]], [[B3]]
; AVX2-NEXT: [[DIFF_CHECK4:%.*]] = icmp ult i64 [[TMP1]], 128
; AVX2-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[DIFF_CHECK]], [[DIFF_CHECK4]]
; AVX2-NEXT: br i1 [[CONFLICT_RDX]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
; AVX2: vector.ph:
; AVX2-NEXT: br label [[VECTOR_BODY:%.*]]
; AVX2: vector.body:
; AVX2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; AVX2-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0
; AVX2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[TMP2]]
; AVX2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 0
; AVX2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 8
; AVX2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 16
; AVX2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 24
; AVX2-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, ptr [[TMP4]], align 4
; AVX2-NEXT: [[WIDE_LOAD5:%.*]] = load <8 x i32>, ptr [[TMP5]], align 4
; AVX2-NEXT: [[WIDE_LOAD6:%.*]] = load <8 x i32>, ptr [[TMP6]], align 4
; AVX2-NEXT: [[WIDE_LOAD7:%.*]] = load <8 x i32>, ptr [[TMP7]], align 4
; AVX2-NEXT: [[TMP8:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
; AVX2-NEXT: [[TMP9:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD5]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
; AVX2-NEXT: [[TMP10:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD6]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
; AVX2-NEXT: [[TMP11:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD7]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
; AVX2-NEXT: [[TMP12:%.*]] = getelementptr float, ptr [[B]], i64 [[TMP2]]
; AVX2-NEXT: [[TMP13:%.*]] = getelementptr float, ptr [[TMP12]], i32 0
; AVX2-NEXT: [[TMP14:%.*]] = getelementptr float, ptr [[TMP12]], i32 8
; AVX2-NEXT: [[TMP15:%.*]] = getelementptr float, ptr [[TMP12]], i32 16
; AVX2-NEXT: [[TMP16:%.*]] = getelementptr float, ptr [[TMP12]], i32 24
; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0(ptr [[TMP13]], i32 4, <8 x i1> [[TMP8]], <8 x float> poison)
; AVX2-NEXT: [[WIDE_MASKED_LOAD8:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0(ptr [[TMP14]], i32 4, <8 x i1> [[TMP9]], <8 x float> poison)
; AVX2-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0(ptr [[TMP15]], i32 4, <8 x i1> [[TMP10]], <8 x float> poison)
; AVX2-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0(ptr [[TMP16]], i32 4, <8 x i1> [[TMP11]], <8 x float> poison)
; AVX2-NEXT: [[TMP17:%.*]] = sitofp <8 x i32> [[WIDE_LOAD]] to <8 x float>
; AVX2-NEXT: [[TMP18:%.*]] = sitofp <8 x i32> [[WIDE_LOAD5]] to <8 x float>
; AVX2-NEXT: [[TMP19:%.*]] = sitofp <8 x i32> [[WIDE_LOAD6]] to <8 x float>
; AVX2-NEXT: [[TMP20:%.*]] = sitofp <8 x i32> [[WIDE_LOAD7]] to <8 x float>
; AVX2-NEXT: [[TMP21:%.*]] = fadd <8 x float> [[WIDE_MASKED_LOAD]], [[TMP17]]
; AVX2-NEXT: [[TMP22:%.*]] = fadd <8 x float> [[WIDE_MASKED_LOAD8]], [[TMP18]]
; AVX2-NEXT: [[TMP23:%.*]] = fadd <8 x float> [[WIDE_MASKED_LOAD9]], [[TMP19]]
; AVX2-NEXT: [[TMP24:%.*]] = fadd <8 x float> [[WIDE_MASKED_LOAD10]], [[TMP20]]
; AVX2-NEXT: [[TMP25:%.*]] = getelementptr float, ptr [[A]], i64 [[TMP2]]
; AVX2-NEXT: [[TMP26:%.*]] = getelementptr float, ptr [[TMP25]], i32 0
; AVX2-NEXT: [[TMP27:%.*]] = getelementptr float, ptr [[TMP25]], i32 8
; AVX2-NEXT: [[TMP28:%.*]] = getelementptr float, ptr [[TMP25]], i32 16
; AVX2-NEXT: [[TMP29:%.*]] = getelementptr float, ptr [[TMP25]], i32 24
; AVX2-NEXT: call void @llvm.masked.store.v8f32.p0(<8 x float> [[TMP21]], ptr [[TMP26]], i32 4, <8 x i1> [[TMP8]])
; AVX2-NEXT: call void @llvm.masked.store.v8f32.p0(<8 x float> [[TMP22]], ptr [[TMP27]], i32 4, <8 x i1> [[TMP9]])
; AVX2-NEXT: call void @llvm.masked.store.v8f32.p0(<8 x float> [[TMP23]], ptr [[TMP28]], i32 4, <8 x i1> [[TMP10]])
; AVX2-NEXT: call void @llvm.masked.store.v8f32.p0(<8 x float> [[TMP24]], ptr [[TMP29]], i32 4, <8 x i1> [[TMP11]])
; AVX2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
; AVX2-NEXT: [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984
; AVX2-NEXT: br i1 [[TMP30]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; AVX2: middle.block:
; AVX2-NEXT: br i1 false, label [[FOR_END:%.*]], label [[SCALAR_PH]]
; AVX2: scalar.ph:
; AVX2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 9984, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
; AVX2-NEXT: br label [[FOR_BODY:%.*]]
; AVX2: for.body:
; AVX2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
; AVX2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[INDVARS_IV]]
; AVX2-NEXT: [[TMP31:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
; AVX2-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP31]], 100
; AVX2-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
; AVX2: if.then:
; AVX2-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[INDVARS_IV]]
; AVX2-NEXT: [[TMP32:%.*]] = load float, ptr [[ARRAYIDX3]], align 4
; AVX2-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP31]] to float
; AVX2-NEXT: [[ADD:%.*]] = fadd float [[TMP32]], [[CONV]]
; AVX2-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDVARS_IV]]
; AVX2-NEXT: store float [[ADD]], ptr [[ARRAYIDX7]], align 4
; AVX2-NEXT: br label [[FOR_INC]]
; AVX2: for.inc:
; AVX2-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; AVX2-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 10000
; AVX2-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
; AVX2: for.end:
; AVX2-NEXT: ret void
;
; AVX512-LABEL: @foo2(
; AVX512-NEXT: iter.check:
; AVX512-NEXT: [[B3:%.*]] = ptrtoint ptr [[B:%.*]] to i64
; AVX512-NEXT: [[TRIGGER2:%.*]] = ptrtoint ptr [[TRIGGER:%.*]] to i64
; AVX512-NEXT: [[A1:%.*]] = ptrtoint ptr [[A:%.*]] to i64
; AVX512-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
; AVX512: vector.memcheck:
; AVX512-NEXT: [[TMP0:%.*]] = sub i64 [[A1]], [[TRIGGER2]]
; AVX512-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP0]], 256
; AVX512-NEXT: [[TMP1:%.*]] = sub i64 [[A1]], [[B3]]
; AVX512-NEXT: [[DIFF_CHECK4:%.*]] = icmp ult i64 [[TMP1]], 256
; AVX512-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[DIFF_CHECK]], [[DIFF_CHECK4]]
; AVX512-NEXT: br i1 [[CONFLICT_RDX]], label [[VEC_EPILOG_SCALAR_PH]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
; AVX512: vector.main.loop.iter.check:
; AVX512-NEXT: br i1 false, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
; AVX512: vector.ph:
; AVX512-NEXT: br label [[VECTOR_BODY:%.*]]
; AVX512: vector.body:
; AVX512-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; AVX512-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0
; AVX512-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[TMP2]]
; AVX512-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 0
; AVX512-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 16
; AVX512-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 32
; AVX512-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 48
; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i32>, ptr [[TMP4]], align 4
; AVX512-NEXT: [[WIDE_LOAD5:%.*]] = load <16 x i32>, ptr [[TMP5]], align 4
; AVX512-NEXT: [[WIDE_LOAD6:%.*]] = load <16 x i32>, ptr [[TMP6]], align 4
; AVX512-NEXT: [[WIDE_LOAD7:%.*]] = load <16 x i32>, ptr [[TMP7]], align 4
; AVX512-NEXT: [[TMP8:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
; AVX512-NEXT: [[TMP9:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD5]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
; AVX512-NEXT: [[TMP10:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD6]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
; AVX512-NEXT: [[TMP11:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD7]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
; AVX512-NEXT: [[TMP12:%.*]] = getelementptr float, ptr [[B]], i64 [[TMP2]]
; AVX512-NEXT: [[TMP13:%.*]] = getelementptr float, ptr [[TMP12]], i32 0
; AVX512-NEXT: [[TMP14:%.*]] = getelementptr float, ptr [[TMP12]], i32 16
; AVX512-NEXT: [[TMP15:%.*]] = getelementptr float, ptr [[TMP12]], i32 32
; AVX512-NEXT: [[TMP16:%.*]] = getelementptr float, ptr [[TMP12]], i32 48
; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0(ptr [[TMP13]], i32 4, <16 x i1> [[TMP8]], <16 x float> poison)
; AVX512-NEXT: [[WIDE_MASKED_LOAD8:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0(ptr [[TMP14]], i32 4, <16 x i1> [[TMP9]], <16 x float> poison)
; AVX512-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0(ptr [[TMP15]], i32 4, <16 x i1> [[TMP10]], <16 x float> poison)
; AVX512-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0(ptr [[TMP16]], i32 4, <16 x i1> [[TMP11]], <16 x float> poison)
; AVX512-NEXT: [[TMP17:%.*]] = sitofp <16 x i32> [[WIDE_LOAD]] to <16 x float>
; AVX512-NEXT: [[TMP18:%.*]] = sitofp <16 x i32> [[WIDE_LOAD5]] to <16 x float>
; AVX512-NEXT: [[TMP19:%.*]] = sitofp <16 x i32> [[WIDE_LOAD6]] to <16 x float>
; AVX512-NEXT: [[TMP20:%.*]] = sitofp <16 x i32> [[WIDE_LOAD7]] to <16 x float>
; AVX512-NEXT: [[TMP21:%.*]] = fadd <16 x float> [[WIDE_MASKED_LOAD]], [[TMP17]]
; AVX512-NEXT: [[TMP22:%.*]] = fadd <16 x float> [[WIDE_MASKED_LOAD8]], [[TMP18]]
; AVX512-NEXT: [[TMP23:%.*]] = fadd <16 x float> [[WIDE_MASKED_LOAD9]], [[TMP19]]
; AVX512-NEXT: [[TMP24:%.*]] = fadd <16 x float> [[WIDE_MASKED_LOAD10]], [[TMP20]]
; AVX512-NEXT: [[TMP25:%.*]] = getelementptr float, ptr [[A]], i64 [[TMP2]]
; AVX512-NEXT: [[TMP26:%.*]] = getelementptr float, ptr [[TMP25]], i32 0
; AVX512-NEXT: [[TMP27:%.*]] = getelementptr float, ptr [[TMP25]], i32 16
; AVX512-NEXT: [[TMP28:%.*]] = getelementptr float, ptr [[TMP25]], i32 32
; AVX512-NEXT: [[TMP29:%.*]] = getelementptr float, ptr [[TMP25]], i32 48
; AVX512-NEXT: call void @llvm.masked.store.v16f32.p0(<16 x float> [[TMP21]], ptr [[TMP26]], i32 4, <16 x i1> [[TMP8]])
; AVX512-NEXT: call void @llvm.masked.store.v16f32.p0(<16 x float> [[TMP22]], ptr [[TMP27]], i32 4, <16 x i1> [[TMP9]])
; AVX512-NEXT: call void @llvm.masked.store.v16f32.p0(<16 x float> [[TMP23]], ptr [[TMP28]], i32 4, <16 x i1> [[TMP10]])
; AVX512-NEXT: call void @llvm.masked.store.v16f32.p0(<16 x float> [[TMP24]], ptr [[TMP29]], i32 4, <16 x i1> [[TMP11]])
; AVX512-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 64
; AVX512-NEXT: [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984
; AVX512-NEXT: br i1 [[TMP30]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; AVX512: middle.block:
; AVX512-NEXT: br i1 false, label [[FOR_END:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
; AVX512: vec.epilog.iter.check:
; AVX512-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
; AVX512: vec.epilog.ph:
; AVX512-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 9984, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
; AVX512-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
; AVX512: vec.epilog.vector.body:
; AVX512-NEXT: [[INDEX11:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT14:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
; AVX512-NEXT: [[TMP31:%.*]] = add i64 [[INDEX11]], 0
; AVX512-NEXT: [[TMP32:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[TMP31]]
; AVX512-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, ptr [[TMP32]], i32 0
; AVX512-NEXT: [[WIDE_LOAD12:%.*]] = load <8 x i32>, ptr [[TMP33]], align 4
; AVX512-NEXT: [[TMP34:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD12]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
; AVX512-NEXT: [[TMP35:%.*]] = getelementptr float, ptr [[B]], i64 [[TMP31]]
; AVX512-NEXT: [[TMP36:%.*]] = getelementptr float, ptr [[TMP35]], i32 0
; AVX512-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0(ptr [[TMP36]], i32 4, <8 x i1> [[TMP34]], <8 x float> poison)
; AVX512-NEXT: [[TMP37:%.*]] = sitofp <8 x i32> [[WIDE_LOAD12]] to <8 x float>
; AVX512-NEXT: [[TMP38:%.*]] = fadd <8 x float> [[WIDE_MASKED_LOAD13]], [[TMP37]]
; AVX512-NEXT: [[TMP39:%.*]] = getelementptr float, ptr [[A]], i64 [[TMP31]]
; AVX512-NEXT: [[TMP40:%.*]] = getelementptr float, ptr [[TMP39]], i32 0
; AVX512-NEXT: call void @llvm.masked.store.v8f32.p0(<8 x float> [[TMP38]], ptr [[TMP40]], i32 4, <8 x i1> [[TMP34]])
; AVX512-NEXT: [[INDEX_NEXT14]] = add nuw i64 [[INDEX11]], 8
; AVX512-NEXT: [[TMP41:%.*]] = icmp eq i64 [[INDEX_NEXT14]], 10000
; AVX512-NEXT: br i1 [[TMP41]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
; AVX512: vec.epilog.middle.block:
; AVX512-NEXT: br i1 true, label [[FOR_END]], label [[VEC_EPILOG_SCALAR_PH]]
; AVX512: vec.epilog.scalar.ph:
; AVX512-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 10000, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 9984, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
; AVX512-NEXT: br label [[FOR_BODY:%.*]]
; AVX512: for.body:
; AVX512-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
; AVX512-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[INDVARS_IV]]
; AVX512-NEXT: [[TMP42:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
; AVX512-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP42]], 100
; AVX512-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
; AVX512: if.then:
; AVX512-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[INDVARS_IV]]
; AVX512-NEXT: [[TMP43:%.*]] = load float, ptr [[ARRAYIDX3]], align 4
; AVX512-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP42]] to float
; AVX512-NEXT: [[ADD:%.*]] = fadd float [[TMP43]], [[CONV]]
; AVX512-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDVARS_IV]]
; AVX512-NEXT: store float [[ADD]], ptr [[ARRAYIDX7]], align 4
; AVX512-NEXT: br label [[FOR_INC]]
; AVX512: for.inc:
; AVX512-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; AVX512-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 10000
; AVX512-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; AVX512: for.end:
; AVX512-NEXT: ret void
;
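; Note how the AVX512 checks above split the trip count: the main loop runs at
; VF 16 unrolled by 4 (64 lanes per iteration) up to 9984, and a VF 8 epilogue
; loop covers the rest, so the scalar tail is never reached. A minimal sketch
; of the arithmetic (plain C, illustrative names, not part of the test):
;   unsigned n = 10000;
;   unsigned main_width = 64;              /* VF 16 x UF 4                    */
;   unsigned main_n = n - n % main_width;  /* = 9984, main-loop bound above   */
;   unsigned epi_iters = (n - main_n) / 8; /* = 2 epilogue iterations of VF 8 */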
entry:
br label %for.body
for.body: ; preds = %for.inc, %entry
%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.inc ]
%arrayidx = getelementptr inbounds i32, ptr %trigger, i64 %indvars.iv
%0 = load i32, ptr %arrayidx, align 4
%cmp1 = icmp slt i32 %0, 100
br i1 %cmp1, label %if.then, label %for.inc
if.then: ; preds = %for.body
%arrayidx3 = getelementptr inbounds float, ptr %B, i64 %indvars.iv
%1 = load float, ptr %arrayidx3, align 4
%conv = sitofp i32 %0 to float
%add = fadd float %1, %conv
%arrayidx7 = getelementptr inbounds float, ptr %A, i64 %indvars.iv
store float %add, ptr %arrayidx7, align 4
br label %for.inc
for.inc: ; preds = %for.body, %if.then
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond = icmp eq i64 %indvars.iv.next, 10000
br i1 %exitcond, label %for.end, label %for.body
for.end: ; preds = %for.inc
ret void
}
; The source code:
;
;void foo3(double *A, double *B, int *trigger) {
;
; for (int i=0; i<10000; i++) {
; if (trigger[i] < 100) {
; A[i] = B[i] + trigger[i];
; }
; }
;}
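; In the checks below the if-guarded body is flattened into
; @llvm.masked.load/@llvm.masked.store calls, so inactive lanes neither read
; nor write memory. A minimal scalar sketch of the masked-store semantics
; (hypothetical helper, not part of the test):
;   static void masked_store_f64(double *p, const double *v,
;                                const _Bool *m, int n) {
;     for (int i = 0; i < n; i++)
;       if (m[i])          /* lanes with a false mask leave memory untouched */
;         p[i] = v[i];
;   }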
define void @foo3(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture readonly %trigger) local_unnamed_addr #0 {
; AVX-LABEL: @foo3(
; AVX-NEXT: entry:
; AVX-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
; AVX: vector.memcheck:
; AVX-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 80000
; AVX-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[TRIGGER:%.*]], i64 40000
; AVX-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 80000
; AVX-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[A]], [[SCEVGEP1]]
; AVX-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[TRIGGER]], [[SCEVGEP]]
; AVX-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
; AVX-NEXT: [[BOUND03:%.*]] = icmp ult ptr [[A]], [[SCEVGEP2]]
; AVX-NEXT: [[BOUND14:%.*]] = icmp ult ptr [[B]], [[SCEVGEP]]
; AVX-NEXT: [[FOUND_CONFLICT5:%.*]] = and i1 [[BOUND03]], [[BOUND14]]
; AVX-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT5]]
; AVX-NEXT: br i1 [[CONFLICT_RDX]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
; AVX: vector.ph:
; AVX-NEXT: br label [[VECTOR_BODY:%.*]]
; AVX: vector.body:
; AVX-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; AVX-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
; AVX-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[TMP0]]
; AVX-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
; AVX-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 4
; AVX-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 8
; AVX-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 12
; AVX-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4, !alias.scope [[META8:![0-9]+]]
; AVX-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i32>, ptr [[TMP3]], align 4, !alias.scope [[META8]]
; AVX-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x i32>, ptr [[TMP4]], align 4, !alias.scope [[META8]]
; AVX-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x i32>, ptr [[TMP5]], align 4, !alias.scope [[META8]]
; AVX-NEXT: [[TMP6:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD]], <i32 100, i32 100, i32 100, i32 100>
; AVX-NEXT: [[TMP7:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD6]], <i32 100, i32 100, i32 100, i32 100>
; AVX-NEXT: [[TMP8:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD7]], <i32 100, i32 100, i32 100, i32 100>
; AVX-NEXT: [[TMP9:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD8]], <i32 100, i32 100, i32 100, i32 100>
; AVX-NEXT: [[TMP10:%.*]] = getelementptr double, ptr [[B]], i64 [[TMP0]]
; AVX-NEXT: [[TMP11:%.*]] = getelementptr double, ptr [[TMP10]], i32 0
; AVX-NEXT: [[TMP12:%.*]] = getelementptr double, ptr [[TMP10]], i32 4
; AVX-NEXT: [[TMP13:%.*]] = getelementptr double, ptr [[TMP10]], i32 8
; AVX-NEXT: [[TMP14:%.*]] = getelementptr double, ptr [[TMP10]], i32 12
; AVX-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr [[TMP11]], i32 8, <4 x i1> [[TMP6]], <4 x double> poison), !alias.scope [[META11:![0-9]+]]
; AVX-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr [[TMP12]], i32 8, <4 x i1> [[TMP7]], <4 x double> poison), !alias.scope [[META11]]
; AVX-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr [[TMP13]], i32 8, <4 x i1> [[TMP8]], <4 x double> poison), !alias.scope [[META11]]
; AVX-NEXT: [[WIDE_MASKED_LOAD11:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr [[TMP14]], i32 8, <4 x i1> [[TMP9]], <4 x double> poison), !alias.scope [[META11]]
; AVX-NEXT: [[TMP15:%.*]] = sitofp <4 x i32> [[WIDE_LOAD]] to <4 x double>
; AVX-NEXT: [[TMP16:%.*]] = sitofp <4 x i32> [[WIDE_LOAD6]] to <4 x double>
; AVX-NEXT: [[TMP17:%.*]] = sitofp <4 x i32> [[WIDE_LOAD7]] to <4 x double>
; AVX-NEXT: [[TMP18:%.*]] = sitofp <4 x i32> [[WIDE_LOAD8]] to <4 x double>
; AVX-NEXT: [[TMP19:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD]], [[TMP15]]
; AVX-NEXT: [[TMP20:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD9]], [[TMP16]]
; AVX-NEXT: [[TMP21:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD10]], [[TMP17]]
; AVX-NEXT: [[TMP22:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD11]], [[TMP18]]
; AVX-NEXT: [[TMP23:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP0]]
; AVX-NEXT: [[TMP24:%.*]] = getelementptr double, ptr [[TMP23]], i32 0
; AVX-NEXT: [[TMP25:%.*]] = getelementptr double, ptr [[TMP23]], i32 4
; AVX-NEXT: [[TMP26:%.*]] = getelementptr double, ptr [[TMP23]], i32 8
; AVX-NEXT: [[TMP27:%.*]] = getelementptr double, ptr [[TMP23]], i32 12
; AVX-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> [[TMP19]], ptr [[TMP24]], i32 8, <4 x i1> [[TMP6]]), !alias.scope [[META13:![0-9]+]], !noalias [[META15:![0-9]+]]
; AVX-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> [[TMP20]], ptr [[TMP25]], i32 8, <4 x i1> [[TMP7]]), !alias.scope [[META13]], !noalias [[META15]]
; AVX-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> [[TMP21]], ptr [[TMP26]], i32 8, <4 x i1> [[TMP8]]), !alias.scope [[META13]], !noalias [[META15]]
; AVX-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> [[TMP22]], ptr [[TMP27]], i32 8, <4 x i1> [[TMP9]]), !alias.scope [[META13]], !noalias [[META15]]
; AVX-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; AVX-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000
; AVX-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
; AVX: middle.block:
; AVX-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
; AVX: scalar.ph:
; AVX-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 10000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
; AVX-NEXT: br label [[FOR_BODY:%.*]]
; AVX: for.body:
; AVX-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
; AVX-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[INDVARS_IV]]
; AVX-NEXT: [[TMP29:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
; AVX-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP29]], 100
; AVX-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
; AVX: if.then:
; AVX-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, ptr [[B]], i64 [[INDVARS_IV]]
; AVX-NEXT: [[TMP30:%.*]] = load double, ptr [[ARRAYIDX3]], align 8
; AVX-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP29]] to double
; AVX-NEXT: [[ADD:%.*]] = fadd double [[TMP30]], [[CONV]]
; AVX-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[INDVARS_IV]]
; AVX-NEXT: store double [[ADD]], ptr [[ARRAYIDX7]], align 8
; AVX-NEXT: br label [[FOR_INC]]
; AVX: for.inc:
; AVX-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; AVX-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 10000
; AVX-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
; AVX: for.end:
; AVX-NEXT: ret void
;
; AVX512-LABEL: @foo3(
; AVX512-NEXT: entry:
; AVX512-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
; AVX512: vector.memcheck:
; AVX512-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 80000
; AVX512-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[TRIGGER:%.*]], i64 40000
; AVX512-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 80000
; AVX512-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[A]], [[SCEVGEP1]]
; AVX512-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[TRIGGER]], [[SCEVGEP]]
; AVX512-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
; AVX512-NEXT: [[BOUND03:%.*]] = icmp ult ptr [[A]], [[SCEVGEP2]]
; AVX512-NEXT: [[BOUND14:%.*]] = icmp ult ptr [[B]], [[SCEVGEP]]
; AVX512-NEXT: [[FOUND_CONFLICT5:%.*]] = and i1 [[BOUND03]], [[BOUND14]]
; AVX512-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT5]]
; AVX512-NEXT: br i1 [[CONFLICT_RDX]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
; AVX512: vector.ph:
; AVX512-NEXT: br label [[VECTOR_BODY:%.*]]
; AVX512: vector.body:
; AVX512-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; AVX512-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
; AVX512-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[TMP0]]
; AVX512-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
; AVX512-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 8
; AVX512-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 16
; AVX512-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 24
; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, ptr [[TMP2]], align 4, !alias.scope [[META11:![0-9]+]]
; AVX512-NEXT: [[WIDE_LOAD6:%.*]] = load <8 x i32>, ptr [[TMP3]], align 4, !alias.scope [[META11]]
; AVX512-NEXT: [[WIDE_LOAD7:%.*]] = load <8 x i32>, ptr [[TMP4]], align 4, !alias.scope [[META11]]
; AVX512-NEXT: [[WIDE_LOAD8:%.*]] = load <8 x i32>, ptr [[TMP5]], align 4, !alias.scope [[META11]]
; AVX512-NEXT: [[TMP6:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
; AVX512-NEXT: [[TMP7:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD6]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
; AVX512-NEXT: [[TMP8:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD7]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
; AVX512-NEXT: [[TMP9:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD8]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
; AVX512-NEXT: [[TMP10:%.*]] = getelementptr double, ptr [[B]], i64 [[TMP0]]
; AVX512-NEXT: [[TMP11:%.*]] = getelementptr double, ptr [[TMP10]], i32 0
; AVX512-NEXT: [[TMP12:%.*]] = getelementptr double, ptr [[TMP10]], i32 8
; AVX512-NEXT: [[TMP13:%.*]] = getelementptr double, ptr [[TMP10]], i32 16
; AVX512-NEXT: [[TMP14:%.*]] = getelementptr double, ptr [[TMP10]], i32 24
; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0(ptr [[TMP11]], i32 8, <8 x i1> [[TMP6]], <8 x double> poison), !alias.scope [[META14:![0-9]+]]
; AVX512-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0(ptr [[TMP12]], i32 8, <8 x i1> [[TMP7]], <8 x double> poison), !alias.scope [[META14]]
; AVX512-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0(ptr [[TMP13]], i32 8, <8 x i1> [[TMP8]], <8 x double> poison), !alias.scope [[META14]]
; AVX512-NEXT: [[WIDE_MASKED_LOAD11:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0(ptr [[TMP14]], i32 8, <8 x i1> [[TMP9]], <8 x double> poison), !alias.scope [[META14]]
; AVX512-NEXT: [[TMP15:%.*]] = sitofp <8 x i32> [[WIDE_LOAD]] to <8 x double>
; AVX512-NEXT: [[TMP16:%.*]] = sitofp <8 x i32> [[WIDE_LOAD6]] to <8 x double>
; AVX512-NEXT: [[TMP17:%.*]] = sitofp <8 x i32> [[WIDE_LOAD7]] to <8 x double>
; AVX512-NEXT: [[TMP18:%.*]] = sitofp <8 x i32> [[WIDE_LOAD8]] to <8 x double>
; AVX512-NEXT: [[TMP19:%.*]] = fadd <8 x double> [[WIDE_MASKED_LOAD]], [[TMP15]]
; AVX512-NEXT: [[TMP20:%.*]] = fadd <8 x double> [[WIDE_MASKED_LOAD9]], [[TMP16]]
; AVX512-NEXT: [[TMP21:%.*]] = fadd <8 x double> [[WIDE_MASKED_LOAD10]], [[TMP17]]
; AVX512-NEXT: [[TMP22:%.*]] = fadd <8 x double> [[WIDE_MASKED_LOAD11]], [[TMP18]]
; AVX512-NEXT: [[TMP23:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP0]]
; AVX512-NEXT: [[TMP24:%.*]] = getelementptr double, ptr [[TMP23]], i32 0
; AVX512-NEXT: [[TMP25:%.*]] = getelementptr double, ptr [[TMP23]], i32 8
; AVX512-NEXT: [[TMP26:%.*]] = getelementptr double, ptr [[TMP23]], i32 16
; AVX512-NEXT: [[TMP27:%.*]] = getelementptr double, ptr [[TMP23]], i32 24
; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> [[TMP19]], ptr [[TMP24]], i32 8, <8 x i1> [[TMP6]]), !alias.scope [[META16:![0-9]+]], !noalias [[META18:![0-9]+]]
; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> [[TMP20]], ptr [[TMP25]], i32 8, <8 x i1> [[TMP7]]), !alias.scope [[META16]], !noalias [[META18]]
; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> [[TMP21]], ptr [[TMP26]], i32 8, <8 x i1> [[TMP8]]), !alias.scope [[META16]], !noalias [[META18]]
; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> [[TMP22]], ptr [[TMP27]], i32 8, <8 x i1> [[TMP9]]), !alias.scope [[META16]], !noalias [[META18]]
; AVX512-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
; AVX512-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984
; AVX512-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
; AVX512: middle.block:
; AVX512-NEXT: br i1 false, label [[FOR_END:%.*]], label [[SCALAR_PH]]
; AVX512: scalar.ph:
; AVX512-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 9984, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
; AVX512-NEXT: br label [[FOR_BODY:%.*]]
; AVX512: for.body:
; AVX512-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
; AVX512-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[INDVARS_IV]]
; AVX512-NEXT: [[TMP29:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
; AVX512-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP29]], 100
; AVX512-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
; AVX512: if.then:
; AVX512-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, ptr [[B]], i64 [[INDVARS_IV]]
; AVX512-NEXT: [[TMP30:%.*]] = load double, ptr [[ARRAYIDX3]], align 8
; AVX512-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP29]] to double
; AVX512-NEXT: [[ADD:%.*]] = fadd double [[TMP30]], [[CONV]]
; AVX512-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[INDVARS_IV]]
; AVX512-NEXT: store double [[ADD]], ptr [[ARRAYIDX7]], align 8
; AVX512-NEXT: br label [[FOR_INC]]
; AVX512: for.inc:
; AVX512-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; AVX512-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 10000
; AVX512-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
; AVX512: for.end:
; AVX512-NEXT: ret void
;
entry:
br label %for.body
for.body: ; preds = %for.inc, %entry
%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.inc ]
%arrayidx = getelementptr inbounds i32, ptr %trigger, i64 %indvars.iv
%0 = load i32, ptr %arrayidx, align 4
%cmp1 = icmp slt i32 %0, 100
br i1 %cmp1, label %if.then, label %for.inc
if.then: ; preds = %for.body
%arrayidx3 = getelementptr inbounds double, ptr %B, i64 %indvars.iv
%1 = load double, ptr %arrayidx3, align 8
%conv = sitofp i32 %0 to double
%add = fadd double %1, %conv
%arrayidx7 = getelementptr inbounds double, ptr %A, i64 %indvars.iv
store double %add, ptr %arrayidx7, align 8
br label %for.inc
for.inc: ; preds = %for.body, %if.then
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond = icmp eq i64 %indvars.iv.next, 10000
br i1 %exitcond, label %for.end, label %for.body
for.end: ; preds = %for.inc
ret void
}
; The source code:
;
;void foo4(double *A, double *B, int *trigger) {
;
; for (int i=0; i<10000; i += 16) {
; if (trigger[i] < 100) {
; A[i] = B[i*2] + trigger[i]; << non-consecutive access
; }
; }
;}
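; With the stride-16 induction variable and the B[i*2] index, no access is
; consecutive, so only the AVX512 run vectorizes this loop, using
; @llvm.masked.gather/@llvm.masked.scatter (see the checks below); the AVX
; runs keep it scalar. A minimal scalar sketch of the gather semantics
; (hypothetical helper, not part of the test):
;   static void masked_gather_f64(double *dst, const double *base,
;                                 const long *idx, const _Bool *m, int n) {
;     for (int i = 0; i < n; i++)
;       if (m[i])
;         dst[i] = base[idx[i]]; /* one independent load per active lane */
;   }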
define void @foo4(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture readonly %trigger) local_unnamed_addr #0 {
; AVX-LABEL: @foo4(
; AVX-NEXT: entry:
; AVX-NEXT: br label [[FOR_BODY:%.*]]
; AVX: for.body:
; AVX-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
; AVX-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER:%.*]], i64 [[INDVARS_IV]]
; AVX-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
; AVX-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP0]], 100
; AVX-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
; AVX: if.then:
; AVX-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[INDVARS_IV]], 1
; AVX-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, ptr [[B:%.*]], i64 [[TMP1]]
; AVX-NEXT: [[TMP2:%.*]] = load double, ptr [[ARRAYIDX3]], align 8
; AVX-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP0]] to double
; AVX-NEXT: [[ADD:%.*]] = fadd double [[TMP2]], [[CONV]]
; AVX-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds double, ptr [[A:%.*]], i64 [[INDVARS_IV]]
; AVX-NEXT: store double [[ADD]], ptr [[ARRAYIDX7]], align 8
; AVX-NEXT: br label [[FOR_INC]]
; AVX: for.inc:
; AVX-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 16
; AVX-NEXT: [[CMP:%.*]] = icmp ult i64 [[INDVARS_IV_NEXT]], 10000
; AVX-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END:%.*]]
; AVX: for.end:
; AVX-NEXT: ret void
;
; AVX512-LABEL: @foo4(
; AVX512-NEXT: entry:
; AVX512-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
; AVX512: vector.memcheck:
; AVX512-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 79880
; AVX512-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[TRIGGER:%.*]], i64 39940
; AVX512-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 159752
; AVX512-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[A]], [[SCEVGEP1]]
; AVX512-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[TRIGGER]], [[SCEVGEP]]
; AVX512-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
; AVX512-NEXT: [[BOUND03:%.*]] = icmp ult ptr [[A]], [[SCEVGEP2]]
; AVX512-NEXT: [[BOUND14:%.*]] = icmp ult ptr [[B]], [[SCEVGEP]]
; AVX512-NEXT: [[FOUND_CONFLICT5:%.*]] = and i1 [[BOUND03]], [[BOUND14]]
; AVX512-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT5]]
; AVX512-NEXT: br i1 [[CONFLICT_RDX]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
; AVX512: vector.ph:
; AVX512-NEXT: br label [[VECTOR_BODY:%.*]]
; AVX512: vector.body:
; AVX512-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; AVX512-NEXT: [[VEC_IND:%.*]] = phi <8 x i64> [ <i64 0, i64 16, i64 32, i64 48, i64 64, i64 80, i64 96, i64 112>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; AVX512-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], <8 x i64> [[VEC_IND]]
; AVX512-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> [[TMP0]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> poison), !alias.scope [[META21:![0-9]+]]
; AVX512-NEXT: [[TMP1:%.*]] = icmp slt <8 x i32> [[WIDE_MASKED_GATHER]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
; AVX512-NEXT: [[TMP2:%.*]] = shl nuw nsw <8 x i64> [[VEC_IND]], <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
; AVX512-NEXT: [[TMP3:%.*]] = getelementptr inbounds double, ptr [[B]], <8 x i64> [[TMP2]]
; AVX512-NEXT: [[WIDE_MASKED_GATHER6:%.*]] = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> [[TMP3]], i32 8, <8 x i1> [[TMP1]], <8 x double> poison), !alias.scope [[META24:![0-9]+]]
; AVX512-NEXT: [[TMP4:%.*]] = sitofp <8 x i32> [[WIDE_MASKED_GATHER]] to <8 x double>
; AVX512-NEXT: [[TMP5:%.*]] = fadd <8 x double> [[WIDE_MASKED_GATHER6]], [[TMP4]]
; AVX512-NEXT: [[TMP6:%.*]] = getelementptr inbounds double, ptr [[A]], <8 x i64> [[VEC_IND]]
; AVX512-NEXT: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> [[TMP5]], <8 x ptr> [[TMP6]], i32 8, <8 x i1> [[TMP1]]), !alias.scope [[META26:![0-9]+]], !noalias [[META28:![0-9]+]]
; AVX512-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; AVX512-NEXT: [[VEC_IND_NEXT]] = add <8 x i64> [[VEC_IND]], <i64 128, i64 128, i64 128, i64 128, i64 128, i64 128, i64 128, i64 128>
; AVX512-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 624
; AVX512-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP29:![0-9]+]]
; AVX512: middle.block:
; AVX512-NEXT: br i1 false, label [[FOR_END:%.*]], label [[SCALAR_PH]]
; AVX512: scalar.ph:
; AVX512-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 9984, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
; AVX512-NEXT: br label [[FOR_BODY:%.*]]
; AVX512: for.body:
; AVX512-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
; AVX512-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[INDVARS_IV]]
; AVX512-NEXT: [[TMP8:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
; AVX512-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP8]], 100
; AVX512-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
; AVX512: if.then:
; AVX512-NEXT: [[TMP9:%.*]] = shl nuw nsw i64 [[INDVARS_IV]], 1
; AVX512-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, ptr [[B]], i64 [[TMP9]]
; AVX512-NEXT: [[TMP10:%.*]] = load double, ptr [[ARRAYIDX3]], align 8
; AVX512-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP8]] to double
; AVX512-NEXT: [[ADD:%.*]] = fadd double [[TMP10]], [[CONV]]
; AVX512-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[INDVARS_IV]]
; AVX512-NEXT: store double [[ADD]], ptr [[ARRAYIDX7]], align 8
; AVX512-NEXT: br label [[FOR_INC]]
; AVX512: for.inc:
; AVX512-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 16
; AVX512-NEXT: [[CMP:%.*]] = icmp ult i64 [[INDVARS_IV_NEXT]], 10000
; AVX512-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP30:![0-9]+]]
; AVX512: for.end:
; AVX512-NEXT: ret void
;
entry:
br label %for.body
for.body: ; preds = %entry, %for.inc
%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.inc ]
%arrayidx = getelementptr inbounds i32, ptr %trigger, i64 %indvars.iv
%0 = load i32, ptr %arrayidx, align 4
%cmp1 = icmp slt i32 %0, 100
br i1 %cmp1, label %if.then, label %for.inc
if.then: ; preds = %for.body
%1 = shl nuw nsw i64 %indvars.iv, 1
%arrayidx3 = getelementptr inbounds double, ptr %B, i64 %1
%2 = load double, ptr %arrayidx3, align 8
%conv = sitofp i32 %0 to double
%add = fadd double %2, %conv
%arrayidx7 = getelementptr inbounds double, ptr %A, i64 %indvars.iv
store double %add, ptr %arrayidx7, align 8
br label %for.inc
for.inc: ; preds = %for.body, %if.then
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 16
%cmp = icmp ult i64 %indvars.iv.next, 10000
br i1 %cmp, label %for.body, label %for.end
for.end: ; preds = %for.inc
ret void
}
@a = common global [1 x ptr] zeroinitializer, align 8
@c = common global ptr null, align 8
; Reverse loop
;void foo6(double *in, double *out, unsigned size, int *trigger) {
;
; for (int i=SIZE-1; i>=0; i--) {
; if (trigger[i] > 0) {
; out[i] = in[i] + (double) 0.5;
; }
; }
;}
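; SIZE is 4096 in this test (the induction variable starts at 4095). Because
; the accesses are consecutive but descending, the vectorizer loads each block
; forward at a negative offset and then reverses the lanes (the shufflevector
; instructions in the checks below); the mask vectors are reversed the same
; way. A scalar sketch of one reversed block of width W (illustrative only):
;   static void reversed_block_load(double *lane, const double *in,
;                                   long i, int W) {
;     for (int j = 0; j < W; j++)
;       lane[j] = in[i - (W - 1) + (W - 1 - j)]; /* lane j holds in[i - j] */
;   }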
define void @foo6(ptr nocapture readonly %in, ptr nocapture %out, i32 %size, ptr nocapture readonly %trigger) local_unnamed_addr #0 {
; AVX1-LABEL: @foo6(
; AVX1-NEXT: entry:
; AVX1-NEXT: br label [[FOR_BODY:%.*]]
; AVX1: for.body:
; AVX1-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 4095, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
; AVX1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER:%.*]], i64 [[INDVARS_IV]]
; AVX1-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
; AVX1-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP0]], 0
; AVX1-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
; AVX1: if.then:
; AVX1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, ptr [[IN:%.*]], i64 [[INDVARS_IV]]
; AVX1-NEXT: [[TMP1:%.*]] = load double, ptr [[ARRAYIDX3]], align 8
; AVX1-NEXT: [[ADD:%.*]] = fadd double [[TMP1]], 5.000000e-01
; AVX1-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds double, ptr [[OUT:%.*]], i64 [[INDVARS_IV]]
; AVX1-NEXT: store double [[ADD]], ptr [[ARRAYIDX5]], align 8
; AVX1-NEXT: br label [[FOR_INC]]
; AVX1: for.inc:
; AVX1-NEXT: [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], -1
; AVX1-NEXT: [[CMP:%.*]] = icmp eq i64 [[INDVARS_IV]], 0
; AVX1-NEXT: br i1 [[CMP]], label [[FOR_END:%.*]], label [[FOR_BODY]]
; AVX1: for.end:
; AVX1-NEXT: ret void
;
; AVX2-LABEL: @foo6(
; AVX2-NEXT: entry:
; AVX2-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
; AVX2: vector.memcheck:
; AVX2-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[OUT:%.*]], i64 32768
; AVX2-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[TRIGGER:%.*]], i64 16384
; AVX2-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[IN:%.*]], i64 32768
; AVX2-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[OUT]], [[SCEVGEP1]]
; AVX2-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[TRIGGER]], [[SCEVGEP]]
; AVX2-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
; AVX2-NEXT: [[BOUND03:%.*]] = icmp ult ptr [[OUT]], [[SCEVGEP2]]
; AVX2-NEXT: [[BOUND14:%.*]] = icmp ult ptr [[IN]], [[SCEVGEP]]
; AVX2-NEXT: [[FOUND_CONFLICT5:%.*]] = and i1 [[BOUND03]], [[BOUND14]]
; AVX2-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT5]]
; AVX2-NEXT: br i1 [[CONFLICT_RDX]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
; AVX2: vector.ph:
; AVX2-NEXT: br label [[VECTOR_BODY:%.*]]
; AVX2: vector.body:
; AVX2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; AVX2-NEXT: [[OFFSET_IDX:%.*]] = sub i64 4095, [[INDEX]]
; AVX2-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0
; AVX2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[TMP0]]
; AVX2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
; AVX2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 -3
; AVX2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 -4
; AVX2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 -3
; AVX2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 -8
; AVX2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 -3
; AVX2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 -12
; AVX2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 -3
; AVX2-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP3]], align 4, !alias.scope [[META18:![0-9]+]]
; AVX2-NEXT: [[REVERSE:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; AVX2-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i32>, ptr [[TMP5]], align 4, !alias.scope [[META18]]
; AVX2-NEXT: [[REVERSE7:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD6]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; AVX2-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x i32>, ptr [[TMP7]], align 4, !alias.scope [[META18]]
; AVX2-NEXT: [[REVERSE9:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD8]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; AVX2-NEXT: [[WIDE_LOAD10:%.*]] = load <4 x i32>, ptr [[TMP9]], align 4, !alias.scope [[META18]]
; AVX2-NEXT: [[REVERSE11:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD10]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; AVX2-NEXT: [[TMP10:%.*]] = icmp sgt <4 x i32> [[REVERSE]], zeroinitializer
; AVX2-NEXT: [[TMP11:%.*]] = icmp sgt <4 x i32> [[REVERSE7]], zeroinitializer
; AVX2-NEXT: [[TMP12:%.*]] = icmp sgt <4 x i32> [[REVERSE9]], zeroinitializer
; AVX2-NEXT: [[TMP13:%.*]] = icmp sgt <4 x i32> [[REVERSE11]], zeroinitializer
; AVX2-NEXT: [[TMP14:%.*]] = getelementptr double, ptr [[IN]], i64 [[TMP0]]
; AVX2-NEXT: [[TMP15:%.*]] = getelementptr double, ptr [[TMP14]], i32 0
; AVX2-NEXT: [[TMP16:%.*]] = getelementptr double, ptr [[TMP15]], i32 -3
; AVX2-NEXT: [[TMP17:%.*]] = getelementptr double, ptr [[TMP14]], i32 -4
; AVX2-NEXT: [[TMP18:%.*]] = getelementptr double, ptr [[TMP17]], i32 -3
; AVX2-NEXT: [[TMP19:%.*]] = getelementptr double, ptr [[TMP14]], i32 -8
; AVX2-NEXT: [[TMP20:%.*]] = getelementptr double, ptr [[TMP19]], i32 -3
; AVX2-NEXT: [[TMP21:%.*]] = getelementptr double, ptr [[TMP14]], i32 -12
; AVX2-NEXT: [[TMP22:%.*]] = getelementptr double, ptr [[TMP21]], i32 -3
; AVX2-NEXT: [[REVERSE12:%.*]] = shufflevector <4 x i1> [[TMP10]], <4 x i1> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr [[TMP16]], i32 8, <4 x i1> [[REVERSE12]], <4 x double> poison), !alias.scope [[META21:![0-9]+]]
; AVX2-NEXT: [[REVERSE13:%.*]] = shufflevector <4 x double> [[WIDE_MASKED_LOAD]], <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; AVX2-NEXT: [[REVERSE14:%.*]] = shufflevector <4 x i1> [[TMP11]], <4 x i1> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; AVX2-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr [[TMP18]], i32 8, <4 x i1> [[REVERSE14]], <4 x double> poison), !alias.scope [[META21]]
; AVX2-NEXT: [[REVERSE16:%.*]] = shufflevector <4 x double> [[WIDE_MASKED_LOAD15]], <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; AVX2-NEXT: [[REVERSE17:%.*]] = shufflevector <4 x i1> [[TMP12]], <4 x i1> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; AVX2-NEXT: [[WIDE_MASKED_LOAD18:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr [[TMP20]], i32 8, <4 x i1> [[REVERSE17]], <4 x double> poison), !alias.scope [[META21]]
; AVX2-NEXT: [[REVERSE19:%.*]] = shufflevector <4 x double> [[WIDE_MASKED_LOAD18]], <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; AVX2-NEXT: [[REVERSE20:%.*]] = shufflevector <4 x i1> [[TMP13]], <4 x i1> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; AVX2-NEXT: [[WIDE_MASKED_LOAD21:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr [[TMP22]], i32 8, <4 x i1> [[REVERSE20]], <4 x double> poison), !alias.scope [[META21]]
; AVX2-NEXT: [[REVERSE22:%.*]] = shufflevector <4 x double> [[WIDE_MASKED_LOAD21]], <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; AVX2-NEXT: [[TMP23:%.*]] = fadd <4 x double> [[REVERSE13]], <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>
; AVX2-NEXT: [[TMP24:%.*]] = fadd <4 x double> [[REVERSE16]], <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>
; AVX2-NEXT: [[TMP25:%.*]] = fadd <4 x double> [[REVERSE19]], <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>
; AVX2-NEXT: [[TMP26:%.*]] = fadd <4 x double> [[REVERSE22]], <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>
; AVX2-NEXT: [[TMP27:%.*]] = getelementptr double, ptr [[OUT]], i64 [[TMP0]]
; AVX2-NEXT: [[TMP28:%.*]] = getelementptr double, ptr [[TMP27]], i32 0
; AVX2-NEXT: [[TMP29:%.*]] = getelementptr double, ptr [[TMP28]], i32 -3
; AVX2-NEXT: [[TMP30:%.*]] = getelementptr double, ptr [[TMP27]], i32 -4
; AVX2-NEXT: [[TMP31:%.*]] = getelementptr double, ptr [[TMP30]], i32 -3
; AVX2-NEXT: [[TMP32:%.*]] = getelementptr double, ptr [[TMP27]], i32 -8
; AVX2-NEXT: [[TMP33:%.*]] = getelementptr double, ptr [[TMP32]], i32 -3
; AVX2-NEXT: [[TMP34:%.*]] = getelementptr double, ptr [[TMP27]], i32 -12
; AVX2-NEXT: [[TMP35:%.*]] = getelementptr double, ptr [[TMP34]], i32 -3
; AVX2-NEXT: [[REVERSE24:%.*]] = shufflevector <4 x double> [[TMP23]], <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> [[REVERSE24]], ptr [[TMP29]], i32 8, <4 x i1> [[REVERSE12]]), !alias.scope [[META23:![0-9]+]], !noalias [[META25:![0-9]+]]
; AVX2-NEXT: [[REVERSE26:%.*]] = shufflevector <4 x double> [[TMP24]], <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> [[REVERSE26]], ptr [[TMP31]], i32 8, <4 x i1> [[REVERSE14]]), !alias.scope [[META23]], !noalias [[META25]]
; AVX2-NEXT: [[REVERSE28:%.*]] = shufflevector <4 x double> [[TMP25]], <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> [[REVERSE28]], ptr [[TMP33]], i32 8, <4 x i1> [[REVERSE17]]), !alias.scope [[META23]], !noalias [[META25]]
; AVX2-NEXT: [[REVERSE30:%.*]] = shufflevector <4 x double> [[TMP26]], <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> [[REVERSE30]], ptr [[TMP35]], i32 8, <4 x i1> [[REVERSE20]]), !alias.scope [[META23]], !noalias [[META25]]
; AVX2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; AVX2-NEXT: [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
; AVX2-NEXT: br i1 [[TMP36]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]]
; AVX2: middle.block:
; AVX2-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
; AVX2: scalar.ph:
; AVX2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ -1, [[MIDDLE_BLOCK]] ], [ 4095, [[ENTRY:%.*]] ], [ 4095, [[VECTOR_MEMCHECK]] ]
; AVX2-NEXT: br label [[FOR_BODY:%.*]]
; AVX2: for.body:
; AVX2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
; AVX2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[INDVARS_IV]]
; AVX2-NEXT: [[TMP37:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
; AVX2-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP37]], 0
; AVX2-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
; AVX2: if.then:
; AVX2-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, ptr [[IN]], i64 [[INDVARS_IV]]
; AVX2-NEXT: [[TMP38:%.*]] = load double, ptr [[ARRAYIDX3]], align 8
; AVX2-NEXT: [[ADD:%.*]] = fadd double [[TMP38]], 5.000000e-01
; AVX2-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds double, ptr [[OUT]], i64 [[INDVARS_IV]]
; AVX2-NEXT: store double [[ADD]], ptr [[ARRAYIDX5]], align 8
; AVX2-NEXT: br label [[FOR_INC]]
; AVX2: for.inc:
; AVX2-NEXT: [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], -1
; AVX2-NEXT: [[CMP:%.*]] = icmp eq i64 [[INDVARS_IV]], 0
; AVX2-NEXT: br i1 [[CMP]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP27:![0-9]+]]
; AVX2: for.end:
; AVX2-NEXT: ret void
;
; AVX512-LABEL: @foo6(
; AVX512-NEXT: entry:
; AVX512-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
; AVX512: vector.memcheck:
; AVX512-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[OUT:%.*]], i64 32768
; AVX512-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[TRIGGER:%.*]], i64 16384
; AVX512-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[IN:%.*]], i64 32768
; AVX512-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[OUT]], [[SCEVGEP1]]
; AVX512-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[TRIGGER]], [[SCEVGEP]]
; AVX512-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
; AVX512-NEXT: [[BOUND03:%.*]] = icmp ult ptr [[OUT]], [[SCEVGEP2]]
; AVX512-NEXT: [[BOUND14:%.*]] = icmp ult ptr [[IN]], [[SCEVGEP]]
; AVX512-NEXT: [[FOUND_CONFLICT5:%.*]] = and i1 [[BOUND03]], [[BOUND14]]
; AVX512-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT5]]
; AVX512-NEXT: br i1 [[CONFLICT_RDX]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
; AVX512: vector.ph:
; AVX512-NEXT: br label [[VECTOR_BODY:%.*]]
; AVX512: vector.body:
; AVX512-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; AVX512-NEXT: [[OFFSET_IDX:%.*]] = sub i64 4095, [[INDEX]]
; AVX512-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0
; AVX512-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[TMP0]]
; AVX512-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
; AVX512-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 -7
; AVX512-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 -8
; AVX512-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 -7
; AVX512-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 -16
; AVX512-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 -7
; AVX512-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 -24
; AVX512-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 -7
; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, ptr [[TMP3]], align 4, !alias.scope [[META31:![0-9]+]]
; AVX512-NEXT: [[REVERSE:%.*]] = shufflevector <8 x i32> [[WIDE_LOAD]], <8 x i32> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
; AVX512-NEXT: [[WIDE_LOAD6:%.*]] = load <8 x i32>, ptr [[TMP5]], align 4, !alias.scope [[META31]]
; AVX512-NEXT: [[REVERSE7:%.*]] = shufflevector <8 x i32> [[WIDE_LOAD6]], <8 x i32> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
; AVX512-NEXT: [[WIDE_LOAD8:%.*]] = load <8 x i32>, ptr [[TMP7]], align 4, !alias.scope [[META31]]
; AVX512-NEXT: [[REVERSE9:%.*]] = shufflevector <8 x i32> [[WIDE_LOAD8]], <8 x i32> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
; AVX512-NEXT: [[WIDE_LOAD10:%.*]] = load <8 x i32>, ptr [[TMP9]], align 4, !alias.scope [[META31]]
; AVX512-NEXT: [[REVERSE11:%.*]] = shufflevector <8 x i32> [[WIDE_LOAD10]], <8 x i32> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
; AVX512-NEXT: [[TMP10:%.*]] = icmp sgt <8 x i32> [[REVERSE]], zeroinitializer
; AVX512-NEXT: [[TMP11:%.*]] = icmp sgt <8 x i32> [[REVERSE7]], zeroinitializer
; AVX512-NEXT: [[TMP12:%.*]] = icmp sgt <8 x i32> [[REVERSE9]], zeroinitializer
; AVX512-NEXT: [[TMP13:%.*]] = icmp sgt <8 x i32> [[REVERSE11]], zeroinitializer
; AVX512-NEXT: [[TMP14:%.*]] = getelementptr double, ptr [[IN]], i64 [[TMP0]]
; AVX512-NEXT: [[TMP15:%.*]] = getelementptr double, ptr [[TMP14]], i32 0
; AVX512-NEXT: [[TMP16:%.*]] = getelementptr double, ptr [[TMP15]], i32 -7
; AVX512-NEXT: [[TMP17:%.*]] = getelementptr double, ptr [[TMP14]], i32 -8
; AVX512-NEXT: [[TMP18:%.*]] = getelementptr double, ptr [[TMP17]], i32 -7
; AVX512-NEXT: [[TMP19:%.*]] = getelementptr double, ptr [[TMP14]], i32 -16
; AVX512-NEXT: [[TMP20:%.*]] = getelementptr double, ptr [[TMP19]], i32 -7
; AVX512-NEXT: [[TMP21:%.*]] = getelementptr double, ptr [[TMP14]], i32 -24
; AVX512-NEXT: [[TMP22:%.*]] = getelementptr double, ptr [[TMP21]], i32 -7
; AVX512-NEXT: [[REVERSE12:%.*]] = shufflevector <8 x i1> [[TMP10]], <8 x i1> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0(ptr [[TMP16]], i32 8, <8 x i1> [[REVERSE12]], <8 x double> poison), !alias.scope [[META34:![0-9]+]]
; AVX512-NEXT: [[REVERSE13:%.*]] = shufflevector <8 x double> [[WIDE_MASKED_LOAD]], <8 x double> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
; AVX512-NEXT: [[REVERSE14:%.*]] = shufflevector <8 x i1> [[TMP11]], <8 x i1> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
; AVX512-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0(ptr [[TMP18]], i32 8, <8 x i1> [[REVERSE14]], <8 x double> poison), !alias.scope [[META34]]
; AVX512-NEXT: [[REVERSE16:%.*]] = shufflevector <8 x double> [[WIDE_MASKED_LOAD15]], <8 x double> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
; AVX512-NEXT: [[REVERSE17:%.*]] = shufflevector <8 x i1> [[TMP12]], <8 x i1> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
; AVX512-NEXT: [[WIDE_MASKED_LOAD18:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0(ptr [[TMP20]], i32 8, <8 x i1> [[REVERSE17]], <8 x double> poison), !alias.scope [[META34]]
; AVX512-NEXT: [[REVERSE19:%.*]] = shufflevector <8 x double> [[WIDE_MASKED_LOAD18]], <8 x double> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
; AVX512-NEXT: [[REVERSE20:%.*]] = shufflevector <8 x i1> [[TMP13]], <8 x i1> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
; AVX512-NEXT: [[WIDE_MASKED_LOAD21:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0(ptr [[TMP22]], i32 8, <8 x i1> [[REVERSE20]], <8 x double> poison), !alias.scope [[META34]]
; AVX512-NEXT: [[REVERSE22:%.*]] = shufflevector <8 x double> [[WIDE_MASKED_LOAD21]], <8 x double> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
; AVX512-NEXT: [[TMP23:%.*]] = fadd <8 x double> [[REVERSE13]], <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>
; AVX512-NEXT: [[TMP24:%.*]] = fadd <8 x double> [[REVERSE16]], <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>
; AVX512-NEXT: [[TMP25:%.*]] = fadd <8 x double> [[REVERSE19]], <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>
; AVX512-NEXT: [[TMP26:%.*]] = fadd <8 x double> [[REVERSE22]], <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>
; AVX512-NEXT: [[TMP27:%.*]] = getelementptr double, ptr [[OUT]], i64 [[TMP0]]
; AVX512-NEXT: [[TMP28:%.*]] = getelementptr double, ptr [[TMP27]], i32 0
; AVX512-NEXT: [[TMP29:%.*]] = getelementptr double, ptr [[TMP28]], i32 -7
; AVX512-NEXT: [[TMP30:%.*]] = getelementptr double, ptr [[TMP27]], i32 -8
; AVX512-NEXT: [[TMP31:%.*]] = getelementptr double, ptr [[TMP30]], i32 -7
; AVX512-NEXT: [[TMP32:%.*]] = getelementptr double, ptr [[TMP27]], i32 -16
; AVX512-NEXT: [[TMP33:%.*]] = getelementptr double, ptr [[TMP32]], i32 -7
; AVX512-NEXT: [[TMP34:%.*]] = getelementptr double, ptr [[TMP27]], i32 -24
; AVX512-NEXT: [[TMP35:%.*]] = getelementptr double, ptr [[TMP34]], i32 -7
; AVX512-NEXT: [[REVERSE24:%.*]] = shufflevector <8 x double> [[TMP23]], <8 x double> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> [[REVERSE24]], ptr [[TMP29]], i32 8, <8 x i1> [[REVERSE12]]), !alias.scope [[META36:![0-9]+]], !noalias [[META38:![0-9]+]]
; AVX512-NEXT: [[REVERSE26:%.*]] = shufflevector <8 x double> [[TMP24]], <8 x double> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> [[REVERSE26]], ptr [[TMP31]], i32 8, <8 x i1> [[REVERSE14]]), !alias.scope [[META36]], !noalias [[META38]]
; AVX512-NEXT: [[REVERSE28:%.*]] = shufflevector <8 x double> [[TMP25]], <8 x double> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> [[REVERSE28]], ptr [[TMP33]], i32 8, <8 x i1> [[REVERSE17]]), !alias.scope [[META36]], !noalias [[META38]]
; AVX512-NEXT: [[REVERSE30:%.*]] = shufflevector <8 x double> [[TMP26]], <8 x double> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> [[REVERSE30]], ptr [[TMP35]], i32 8, <8 x i1> [[REVERSE20]]), !alias.scope [[META36]], !noalias [[META38]]
; AVX512-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
; AVX512-NEXT: [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
; AVX512-NEXT: br i1 [[TMP36]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP39:![0-9]+]]
; AVX512: middle.block:
; AVX512-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
; AVX512: scalar.ph:
; AVX512-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ -1, [[MIDDLE_BLOCK]] ], [ 4095, [[ENTRY:%.*]] ], [ 4095, [[VECTOR_MEMCHECK]] ]
; AVX512-NEXT: br label [[FOR_BODY:%.*]]
; AVX512: for.body:
; AVX512-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
; AVX512-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[INDVARS_IV]]
; AVX512-NEXT: [[TMP37:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
; AVX512-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP37]], 0
; AVX512-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
; AVX512: if.then:
; AVX512-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, ptr [[IN]], i64 [[INDVARS_IV]]
; AVX512-NEXT: [[TMP38:%.*]] = load double, ptr [[ARRAYIDX3]], align 8
; AVX512-NEXT: [[ADD:%.*]] = fadd double [[TMP38]], 5.000000e-01
; AVX512-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds double, ptr [[OUT]], i64 [[INDVARS_IV]]
; AVX512-NEXT: store double [[ADD]], ptr [[ARRAYIDX5]], align 8
; AVX512-NEXT: br label [[FOR_INC]]
; AVX512: for.inc:
; AVX512-NEXT: [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], -1
; AVX512-NEXT: [[CMP:%.*]] = icmp eq i64 [[INDVARS_IV]], 0
; AVX512-NEXT: br i1 [[CMP]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP40:![0-9]+]]
; AVX512: for.end:
; AVX512-NEXT: ret void
;
entry:
br label %for.body
for.body: ; preds = %for.inc, %entry
%indvars.iv = phi i64 [ 4095, %entry ], [ %indvars.iv.next, %for.inc ]
%arrayidx = getelementptr inbounds i32, ptr %trigger, i64 %indvars.iv
%0 = load i32, ptr %arrayidx, align 4
%cmp1 = icmp sgt i32 %0, 0
br i1 %cmp1, label %if.then, label %for.inc
if.then: ; preds = %for.body
%arrayidx3 = getelementptr inbounds double, ptr %in, i64 %indvars.iv
%1 = load double, ptr %arrayidx3, align 8
%add = fadd double %1, 5.000000e-01
%arrayidx5 = getelementptr inbounds double, ptr %out, i64 %indvars.iv
store double %add, ptr %arrayidx5, align 8
br label %for.inc
for.inc: ; preds = %for.body, %if.then
%indvars.iv.next = add nsw i64 %indvars.iv, -1
%cmp = icmp eq i64 %indvars.iv, 0
br i1 %cmp, label %for.end, label %for.body
for.end: ; preds = %for.inc
ret void
}
; void foo7 (double * __restrict__ out, double ** __restrict__ in,
;            bool * __restrict__ trigger, unsigned size) {
;
; for (unsigned i=0; i<size; i++)
; if (trigger[i] && (in[i] != 0))
; out[i] = (double) 0.5;
; }
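; In the vector body the short-circuiting '&&' becomes mask algebra: the load
; of in[i] runs under the trigger mask, and the store mask is
; select(m1, m2, false), i.e. a per-lane AND (the select instructions in the
; checks below). A scalar sketch of one lane (illustrative only):
;   _Bool m1 = trigger[i] & 1;     /* the bool load tests the low bit   */
;   _Bool m2 = m1 && (in[i] != 0); /* in[i] is only read when m1 is set */
;   if (m2)
;     out[i] = 0.5;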
define void @foo7(ptr noalias nocapture %out, ptr noalias nocapture readonly %in, ptr noalias nocapture readonly %trigger, i32 %size) local_unnamed_addr #0 {
; AVX1-LABEL: @foo7(
; AVX1-NEXT: entry:
; AVX1-NEXT: [[CMP5:%.*]] = icmp eq i32 [[SIZE:%.*]], 0
; AVX1-NEXT: br i1 [[CMP5]], label [[FOR_END:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
; AVX1: for.body.preheader:
; AVX1-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[SIZE]] to i64
; AVX1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 16
; AVX1-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; AVX1: vector.ph:
; AVX1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 16
; AVX1-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
; AVX1-NEXT: br label [[VECTOR_BODY:%.*]]
; AVX1: vector.body:
; AVX1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; AVX1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
; AVX1-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[TRIGGER:%.*]], i64 [[TMP0]]
; AVX1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
; AVX1-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 4
; AVX1-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 8
; AVX1-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 12
; AVX1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP2]], align 1
; AVX1-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i8>, ptr [[TMP3]], align 1
; AVX1-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP4]], align 1
; AVX1-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i8>, ptr [[TMP5]], align 1
; AVX1-NEXT: [[TMP6:%.*]] = and <4 x i8> [[WIDE_LOAD]], <i8 1, i8 1, i8 1, i8 1>
; AVX1-NEXT: [[TMP7:%.*]] = and <4 x i8> [[WIDE_LOAD1]], <i8 1, i8 1, i8 1, i8 1>
; AVX1-NEXT: [[TMP8:%.*]] = and <4 x i8> [[WIDE_LOAD2]], <i8 1, i8 1, i8 1, i8 1>
; AVX1-NEXT: [[TMP9:%.*]] = and <4 x i8> [[WIDE_LOAD3]], <i8 1, i8 1, i8 1, i8 1>
; AVX1-NEXT: [[TMP10:%.*]] = icmp eq <4 x i8> [[TMP6]], zeroinitializer
; AVX1-NEXT: [[TMP11:%.*]] = icmp eq <4 x i8> [[TMP7]], zeroinitializer
; AVX1-NEXT: [[TMP12:%.*]] = icmp eq <4 x i8> [[TMP8]], zeroinitializer
; AVX1-NEXT: [[TMP13:%.*]] = icmp eq <4 x i8> [[TMP9]], zeroinitializer
; AVX1-NEXT: [[TMP14:%.*]] = xor <4 x i1> [[TMP10]], <i1 true, i1 true, i1 true, i1 true>
; AVX1-NEXT: [[TMP15:%.*]] = xor <4 x i1> [[TMP11]], <i1 true, i1 true, i1 true, i1 true>
; AVX1-NEXT: [[TMP16:%.*]] = xor <4 x i1> [[TMP12]], <i1 true, i1 true, i1 true, i1 true>
; AVX1-NEXT: [[TMP17:%.*]] = xor <4 x i1> [[TMP13]], <i1 true, i1 true, i1 true, i1 true>
; AVX1-NEXT: [[TMP18:%.*]] = getelementptr ptr, ptr [[IN:%.*]], i64 [[TMP0]]
; AVX1-NEXT: [[TMP19:%.*]] = getelementptr ptr, ptr [[TMP18]], i32 0
; AVX1-NEXT: [[TMP20:%.*]] = getelementptr ptr, ptr [[TMP18]], i32 4
; AVX1-NEXT: [[TMP21:%.*]] = getelementptr ptr, ptr [[TMP18]], i32 8
; AVX1-NEXT: [[TMP22:%.*]] = getelementptr ptr, ptr [[TMP18]], i32 12
; AVX1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP19]], i32 8, <4 x i1> [[TMP14]], <4 x ptr> poison)
; AVX1-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP20]], i32 8, <4 x i1> [[TMP15]], <4 x ptr> poison)
; AVX1-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP21]], i32 8, <4 x i1> [[TMP16]], <4 x ptr> poison)
; AVX1-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP22]], i32 8, <4 x i1> [[TMP17]], <4 x ptr> poison)
; AVX1-NEXT: [[TMP23:%.*]] = icmp eq <4 x ptr> [[WIDE_MASKED_LOAD]], zeroinitializer
; AVX1-NEXT: [[TMP24:%.*]] = icmp eq <4 x ptr> [[WIDE_MASKED_LOAD4]], zeroinitializer
; AVX1-NEXT: [[TMP25:%.*]] = icmp eq <4 x ptr> [[WIDE_MASKED_LOAD5]], zeroinitializer
; AVX1-NEXT: [[TMP26:%.*]] = icmp eq <4 x ptr> [[WIDE_MASKED_LOAD6]], zeroinitializer
; AVX1-NEXT: [[TMP27:%.*]] = xor <4 x i1> [[TMP23]], <i1 true, i1 true, i1 true, i1 true>
; AVX1-NEXT: [[TMP28:%.*]] = xor <4 x i1> [[TMP24]], <i1 true, i1 true, i1 true, i1 true>
; AVX1-NEXT: [[TMP29:%.*]] = xor <4 x i1> [[TMP25]], <i1 true, i1 true, i1 true, i1 true>
; AVX1-NEXT: [[TMP30:%.*]] = xor <4 x i1> [[TMP26]], <i1 true, i1 true, i1 true, i1 true>
; AVX1-NEXT: [[TMP31:%.*]] = select <4 x i1> [[TMP14]], <4 x i1> [[TMP27]], <4 x i1> zeroinitializer
; AVX1-NEXT: [[TMP32:%.*]] = select <4 x i1> [[TMP15]], <4 x i1> [[TMP28]], <4 x i1> zeroinitializer
; AVX1-NEXT: [[TMP33:%.*]] = select <4 x i1> [[TMP16]], <4 x i1> [[TMP29]], <4 x i1> zeroinitializer
; AVX1-NEXT: [[TMP34:%.*]] = select <4 x i1> [[TMP17]], <4 x i1> [[TMP30]], <4 x i1> zeroinitializer
; AVX1-NEXT: [[TMP35:%.*]] = getelementptr double, ptr [[OUT:%.*]], i64 [[TMP0]]
; AVX1-NEXT: [[TMP36:%.*]] = getelementptr double, ptr [[TMP35]], i32 0
; AVX1-NEXT: [[TMP37:%.*]] = getelementptr double, ptr [[TMP35]], i32 4
; AVX1-NEXT: [[TMP38:%.*]] = getelementptr double, ptr [[TMP35]], i32 8
; AVX1-NEXT: [[TMP39:%.*]] = getelementptr double, ptr [[TMP35]], i32 12
; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, ptr [[TMP36]], i32 8, <4 x i1> [[TMP31]])
; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, ptr [[TMP37]], i32 8, <4 x i1> [[TMP32]])
; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, ptr [[TMP38]], i32 8, <4 x i1> [[TMP33]])
; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, ptr [[TMP39]], i32 8, <4 x i1> [[TMP34]])
; AVX1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; AVX1-NEXT: [[TMP40:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; AVX1-NEXT: br i1 [[TMP40]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
; AVX1: middle.block:
; AVX1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
; AVX1-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
; AVX1: scalar.ph:
; AVX1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
; AVX1-NEXT: br label [[FOR_BODY:%.*]]
; AVX1: for.body:
; AVX1-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
; AVX1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[TRIGGER]], i64 [[INDVARS_IV]]
; AVX1-NEXT: [[TMP41:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
; AVX1-NEXT: [[TMP42:%.*]] = and i8 [[TMP41]], 1
; AVX1-NEXT: [[TOBOOL:%.*]] = icmp eq i8 [[TMP42]], 0
; AVX1-NEXT: br i1 [[TOBOOL]], label [[FOR_INC]], label [[LAND_LHS_TRUE:%.*]]
; AVX1: land.lhs.true:
; AVX1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds ptr, ptr [[IN]], i64 [[INDVARS_IV]]
; AVX1-NEXT: [[TMP43:%.*]] = load ptr, ptr [[ARRAYIDX2]], align 8
; AVX1-NEXT: [[CMP3:%.*]] = icmp eq ptr [[TMP43]], null
; AVX1-NEXT: br i1 [[CMP3]], label [[FOR_INC]], label [[IF_THEN:%.*]]
; AVX1: if.then:
; AVX1-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds double, ptr [[OUT]], i64 [[INDVARS_IV]]
; AVX1-NEXT: store double 5.000000e-01, ptr [[ARRAYIDX5]], align 8
; AVX1-NEXT: br label [[FOR_INC]]
; AVX1: for.inc:
; AVX1-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; AVX1-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
; AVX1-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
; AVX1: for.end.loopexit:
; AVX1-NEXT: br label [[FOR_END]]
; AVX1: for.end:
; AVX1-NEXT: ret void
;
; AVX2-LABEL: @foo7(
; AVX2-NEXT: entry:
; AVX2-NEXT: [[CMP5:%.*]] = icmp eq i32 [[SIZE:%.*]], 0
; AVX2-NEXT: br i1 [[CMP5]], label [[FOR_END:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
; AVX2: for.body.preheader:
; AVX2-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[SIZE]] to i64
; AVX2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 16
; AVX2-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; AVX2: vector.ph:
; AVX2-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 16
; AVX2-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
; AVX2-NEXT: br label [[VECTOR_BODY:%.*]]
; AVX2: vector.body:
; AVX2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; AVX2-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
; AVX2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[TRIGGER:%.*]], i64 [[TMP0]]
; AVX2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
; AVX2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 4
; AVX2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 8
; AVX2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 12
; AVX2-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP2]], align 1
; AVX2-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i8>, ptr [[TMP3]], align 1
; AVX2-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP4]], align 1
; AVX2-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i8>, ptr [[TMP5]], align 1
; AVX2-NEXT: [[TMP6:%.*]] = and <4 x i8> [[WIDE_LOAD]], <i8 1, i8 1, i8 1, i8 1>
; AVX2-NEXT: [[TMP7:%.*]] = and <4 x i8> [[WIDE_LOAD1]], <i8 1, i8 1, i8 1, i8 1>
; AVX2-NEXT: [[TMP8:%.*]] = and <4 x i8> [[WIDE_LOAD2]], <i8 1, i8 1, i8 1, i8 1>
; AVX2-NEXT: [[TMP9:%.*]] = and <4 x i8> [[WIDE_LOAD3]], <i8 1, i8 1, i8 1, i8 1>
; AVX2-NEXT: [[TMP10:%.*]] = icmp eq <4 x i8> [[TMP6]], zeroinitializer
; AVX2-NEXT: [[TMP11:%.*]] = icmp eq <4 x i8> [[TMP7]], zeroinitializer
; AVX2-NEXT: [[TMP12:%.*]] = icmp eq <4 x i8> [[TMP8]], zeroinitializer
; AVX2-NEXT: [[TMP13:%.*]] = icmp eq <4 x i8> [[TMP9]], zeroinitializer
; AVX2-NEXT: [[TMP14:%.*]] = xor <4 x i1> [[TMP10]], <i1 true, i1 true, i1 true, i1 true>
; AVX2-NEXT: [[TMP15:%.*]] = xor <4 x i1> [[TMP11]], <i1 true, i1 true, i1 true, i1 true>
; AVX2-NEXT: [[TMP16:%.*]] = xor <4 x i1> [[TMP12]], <i1 true, i1 true, i1 true, i1 true>
; AVX2-NEXT: [[TMP17:%.*]] = xor <4 x i1> [[TMP13]], <i1 true, i1 true, i1 true, i1 true>
; AVX2-NEXT: [[TMP18:%.*]] = getelementptr ptr, ptr [[IN:%.*]], i64 [[TMP0]]
; AVX2-NEXT: [[TMP19:%.*]] = getelementptr ptr, ptr [[TMP18]], i32 0
; AVX2-NEXT: [[TMP20:%.*]] = getelementptr ptr, ptr [[TMP18]], i32 4
; AVX2-NEXT: [[TMP21:%.*]] = getelementptr ptr, ptr [[TMP18]], i32 8
; AVX2-NEXT: [[TMP22:%.*]] = getelementptr ptr, ptr [[TMP18]], i32 12
; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP19]], i32 8, <4 x i1> [[TMP14]], <4 x ptr> poison)
; AVX2-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP20]], i32 8, <4 x i1> [[TMP15]], <4 x ptr> poison)
; AVX2-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP21]], i32 8, <4 x i1> [[TMP16]], <4 x ptr> poison)
; AVX2-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP22]], i32 8, <4 x i1> [[TMP17]], <4 x ptr> poison)
; AVX2-NEXT: [[TMP23:%.*]] = icmp eq <4 x ptr> [[WIDE_MASKED_LOAD]], zeroinitializer
; AVX2-NEXT: [[TMP24:%.*]] = icmp eq <4 x ptr> [[WIDE_MASKED_LOAD4]], zeroinitializer
; AVX2-NEXT: [[TMP25:%.*]] = icmp eq <4 x ptr> [[WIDE_MASKED_LOAD5]], zeroinitializer
; AVX2-NEXT: [[TMP26:%.*]] = icmp eq <4 x ptr> [[WIDE_MASKED_LOAD6]], zeroinitializer
; AVX2-NEXT: [[TMP27:%.*]] = xor <4 x i1> [[TMP23]], <i1 true, i1 true, i1 true, i1 true>
; AVX2-NEXT: [[TMP28:%.*]] = xor <4 x i1> [[TMP24]], <i1 true, i1 true, i1 true, i1 true>
; AVX2-NEXT: [[TMP29:%.*]] = xor <4 x i1> [[TMP25]], <i1 true, i1 true, i1 true, i1 true>
; AVX2-NEXT: [[TMP30:%.*]] = xor <4 x i1> [[TMP26]], <i1 true, i1 true, i1 true, i1 true>
; AVX2-NEXT: [[TMP31:%.*]] = select <4 x i1> [[TMP14]], <4 x i1> [[TMP27]], <4 x i1> zeroinitializer
; AVX2-NEXT: [[TMP32:%.*]] = select <4 x i1> [[TMP15]], <4 x i1> [[TMP28]], <4 x i1> zeroinitializer
; AVX2-NEXT: [[TMP33:%.*]] = select <4 x i1> [[TMP16]], <4 x i1> [[TMP29]], <4 x i1> zeroinitializer
; AVX2-NEXT: [[TMP34:%.*]] = select <4 x i1> [[TMP17]], <4 x i1> [[TMP30]], <4 x i1> zeroinitializer
; AVX2-NEXT: [[TMP35:%.*]] = getelementptr double, ptr [[OUT:%.*]], i64 [[TMP0]]
; AVX2-NEXT: [[TMP36:%.*]] = getelementptr double, ptr [[TMP35]], i32 0
; AVX2-NEXT: [[TMP37:%.*]] = getelementptr double, ptr [[TMP35]], i32 4
; AVX2-NEXT: [[TMP38:%.*]] = getelementptr double, ptr [[TMP35]], i32 8
; AVX2-NEXT: [[TMP39:%.*]] = getelementptr double, ptr [[TMP35]], i32 12
; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, ptr [[TMP36]], i32 8, <4 x i1> [[TMP31]])
; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, ptr [[TMP37]], i32 8, <4 x i1> [[TMP32]])
; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, ptr [[TMP38]], i32 8, <4 x i1> [[TMP33]])
; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, ptr [[TMP39]], i32 8, <4 x i1> [[TMP34]])
; AVX2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; AVX2-NEXT: [[TMP40:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; AVX2-NEXT: br i1 [[TMP40]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]]
; AVX2: middle.block:
; AVX2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
; AVX2-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
; AVX2: scalar.ph:
; AVX2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
; AVX2-NEXT: br label [[FOR_BODY:%.*]]
; AVX2: for.body:
; AVX2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
; AVX2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[TRIGGER]], i64 [[INDVARS_IV]]
; AVX2-NEXT: [[TMP41:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
; AVX2-NEXT: [[TMP42:%.*]] = and i8 [[TMP41]], 1
; AVX2-NEXT: [[TOBOOL:%.*]] = icmp eq i8 [[TMP42]], 0
; AVX2-NEXT: br i1 [[TOBOOL]], label [[FOR_INC]], label [[LAND_LHS_TRUE:%.*]]
; AVX2: land.lhs.true:
; AVX2-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds ptr, ptr [[IN]], i64 [[INDVARS_IV]]
; AVX2-NEXT: [[TMP43:%.*]] = load ptr, ptr [[ARRAYIDX2]], align 8
; AVX2-NEXT: [[CMP3:%.*]] = icmp eq ptr [[TMP43]], null
; AVX2-NEXT: br i1 [[CMP3]], label [[FOR_INC]], label [[IF_THEN:%.*]]
; AVX2: if.then:
; AVX2-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds double, ptr [[OUT]], i64 [[INDVARS_IV]]
; AVX2-NEXT: store double 5.000000e-01, ptr [[ARRAYIDX5]], align 8
; AVX2-NEXT: br label [[FOR_INC]]
; AVX2: for.inc:
; AVX2-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; AVX2-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
; AVX2-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP29:![0-9]+]]
; AVX2: for.end.loopexit:
; AVX2-NEXT: br label [[FOR_END]]
; AVX2: for.end:
; AVX2-NEXT: ret void
;
; AVX512-LABEL: @foo7(
; AVX512-NEXT: entry:
; AVX512-NEXT: [[CMP5:%.*]] = icmp eq i32 [[SIZE:%.*]], 0
; AVX512-NEXT: br i1 [[CMP5]], label [[FOR_END:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
; AVX512: for.body.preheader:
; AVX512-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[SIZE]] to i64
; AVX512-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 32
; AVX512-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; AVX512: vector.ph:
; AVX512-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 32
; AVX512-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
; AVX512-NEXT: br label [[VECTOR_BODY:%.*]]
; AVX512: vector.body:
; AVX512-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; AVX512-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
; AVX512-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[TRIGGER:%.*]], i64 [[TMP0]]
; AVX512-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
; AVX512-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 8
; AVX512-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 16
; AVX512-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 24
; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP2]], align 1
; AVX512-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x i8>, ptr [[TMP3]], align 1
; AVX512-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x i8>, ptr [[TMP4]], align 1
; AVX512-NEXT: [[WIDE_LOAD3:%.*]] = load <8 x i8>, ptr [[TMP5]], align 1
; AVX512-NEXT: [[TMP6:%.*]] = and <8 x i8> [[WIDE_LOAD]], <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
; AVX512-NEXT: [[TMP7:%.*]] = and <8 x i8> [[WIDE_LOAD1]], <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
; AVX512-NEXT: [[TMP8:%.*]] = and <8 x i8> [[WIDE_LOAD2]], <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
; AVX512-NEXT: [[TMP9:%.*]] = and <8 x i8> [[WIDE_LOAD3]], <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
; AVX512-NEXT: [[TMP10:%.*]] = icmp eq <8 x i8> [[TMP6]], zeroinitializer
; AVX512-NEXT: [[TMP11:%.*]] = icmp eq <8 x i8> [[TMP7]], zeroinitializer
; AVX512-NEXT: [[TMP12:%.*]] = icmp eq <8 x i8> [[TMP8]], zeroinitializer
; AVX512-NEXT: [[TMP13:%.*]] = icmp eq <8 x i8> [[TMP9]], zeroinitializer
; AVX512-NEXT: [[TMP14:%.*]] = xor <8 x i1> [[TMP10]], <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>
; AVX512-NEXT: [[TMP15:%.*]] = xor <8 x i1> [[TMP11]], <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>
; AVX512-NEXT: [[TMP16:%.*]] = xor <8 x i1> [[TMP12]], <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>
; AVX512-NEXT: [[TMP17:%.*]] = xor <8 x i1> [[TMP13]], <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>
; AVX512-NEXT: [[TMP18:%.*]] = getelementptr ptr, ptr [[IN:%.*]], i64 [[TMP0]]
; AVX512-NEXT: [[TMP19:%.*]] = getelementptr ptr, ptr [[TMP18]], i32 0
; AVX512-NEXT: [[TMP20:%.*]] = getelementptr ptr, ptr [[TMP18]], i32 8
; AVX512-NEXT: [[TMP21:%.*]] = getelementptr ptr, ptr [[TMP18]], i32 16
; AVX512-NEXT: [[TMP22:%.*]] = getelementptr ptr, ptr [[TMP18]], i32 24
; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x ptr> @llvm.masked.load.v8p0.p0(ptr [[TMP19]], i32 8, <8 x i1> [[TMP14]], <8 x ptr> poison)
; AVX512-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <8 x ptr> @llvm.masked.load.v8p0.p0(ptr [[TMP20]], i32 8, <8 x i1> [[TMP15]], <8 x ptr> poison)
; AVX512-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <8 x ptr> @llvm.masked.load.v8p0.p0(ptr [[TMP21]], i32 8, <8 x i1> [[TMP16]], <8 x ptr> poison)
; AVX512-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <8 x ptr> @llvm.masked.load.v8p0.p0(ptr [[TMP22]], i32 8, <8 x i1> [[TMP17]], <8 x ptr> poison)
; AVX512-NEXT: [[TMP23:%.*]] = icmp eq <8 x ptr> [[WIDE_MASKED_LOAD]], zeroinitializer
; AVX512-NEXT: [[TMP24:%.*]] = icmp eq <8 x ptr> [[WIDE_MASKED_LOAD4]], zeroinitializer
; AVX512-NEXT: [[TMP25:%.*]] = icmp eq <8 x ptr> [[WIDE_MASKED_LOAD5]], zeroinitializer
; AVX512-NEXT: [[TMP26:%.*]] = icmp eq <8 x ptr> [[WIDE_MASKED_LOAD6]], zeroinitializer
; AVX512-NEXT: [[TMP27:%.*]] = xor <8 x i1> [[TMP23]], <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>
; AVX512-NEXT: [[TMP28:%.*]] = xor <8 x i1> [[TMP24]], <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>
; AVX512-NEXT: [[TMP29:%.*]] = xor <8 x i1> [[TMP25]], <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>
; AVX512-NEXT: [[TMP30:%.*]] = xor <8 x i1> [[TMP26]], <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>
; AVX512-NEXT: [[TMP31:%.*]] = select <8 x i1> [[TMP14]], <8 x i1> [[TMP27]], <8 x i1> zeroinitializer
; AVX512-NEXT: [[TMP32:%.*]] = select <8 x i1> [[TMP15]], <8 x i1> [[TMP28]], <8 x i1> zeroinitializer
; AVX512-NEXT: [[TMP33:%.*]] = select <8 x i1> [[TMP16]], <8 x i1> [[TMP29]], <8 x i1> zeroinitializer
; AVX512-NEXT: [[TMP34:%.*]] = select <8 x i1> [[TMP17]], <8 x i1> [[TMP30]], <8 x i1> zeroinitializer
; AVX512-NEXT: [[TMP35:%.*]] = getelementptr double, ptr [[OUT:%.*]], i64 [[TMP0]]
; AVX512-NEXT: [[TMP36:%.*]] = getelementptr double, ptr [[TMP35]], i32 0
; AVX512-NEXT: [[TMP37:%.*]] = getelementptr double, ptr [[TMP35]], i32 8
; AVX512-NEXT: [[TMP38:%.*]] = getelementptr double, ptr [[TMP35]], i32 16
; AVX512-NEXT: [[TMP39:%.*]] = getelementptr double, ptr [[TMP35]], i32 24
; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, ptr [[TMP36]], i32 8, <8 x i1> [[TMP31]])
; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, ptr [[TMP37]], i32 8, <8 x i1> [[TMP32]])
; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, ptr [[TMP38]], i32 8, <8 x i1> [[TMP33]])
; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, ptr [[TMP39]], i32 8, <8 x i1> [[TMP34]])
; AVX512-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
; AVX512-NEXT: [[TMP40:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; AVX512-NEXT: br i1 [[TMP40]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP41:![0-9]+]]
; AVX512: middle.block:
; AVX512-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
; AVX512-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
; AVX512: scalar.ph:
; AVX512-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
; AVX512-NEXT: br label [[FOR_BODY:%.*]]
; AVX512: for.body:
; AVX512-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
; AVX512-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[TRIGGER]], i64 [[INDVARS_IV]]
; AVX512-NEXT: [[TMP41:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
; AVX512-NEXT: [[TMP42:%.*]] = and i8 [[TMP41]], 1
; AVX512-NEXT: [[TOBOOL:%.*]] = icmp eq i8 [[TMP42]], 0
; AVX512-NEXT: br i1 [[TOBOOL]], label [[FOR_INC]], label [[LAND_LHS_TRUE:%.*]]
; AVX512: land.lhs.true:
; AVX512-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds ptr, ptr [[IN]], i64 [[INDVARS_IV]]
; AVX512-NEXT: [[TMP43:%.*]] = load ptr, ptr [[ARRAYIDX2]], align 8
; AVX512-NEXT: [[CMP3:%.*]] = icmp eq ptr [[TMP43]], null
; AVX512-NEXT: br i1 [[CMP3]], label [[FOR_INC]], label [[IF_THEN:%.*]]
; AVX512: if.then:
; AVX512-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds double, ptr [[OUT]], i64 [[INDVARS_IV]]
; AVX512-NEXT: store double 5.000000e-01, ptr [[ARRAYIDX5]], align 8
; AVX512-NEXT: br label [[FOR_INC]]
; AVX512: for.inc:
; AVX512-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; AVX512-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
; AVX512-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP42:![0-9]+]]
; AVX512: for.end.loopexit:
; AVX512-NEXT: br label [[FOR_END]]
; AVX512: for.end:
; AVX512-NEXT: ret void
;
entry:
  %cmp5 = icmp eq i32 %size, 0
  br i1 %cmp5, label %for.end, label %for.body.preheader

for.body.preheader:                               ; preds = %entry
  %wide.trip.count = zext i32 %size to i64
  br label %for.body

for.body:                                         ; preds = %for.inc, %for.body.preheader
  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ]
  %arrayidx = getelementptr inbounds i8, ptr %trigger, i64 %indvars.iv
  %0 = load i8, ptr %arrayidx, align 1
  %1 = and i8 %0, 1
  %tobool = icmp eq i8 %1, 0
  br i1 %tobool, label %for.inc, label %land.lhs.true

land.lhs.true:                                    ; preds = %for.body
  %arrayidx2 = getelementptr inbounds ptr, ptr %in, i64 %indvars.iv
  %2 = load ptr, ptr %arrayidx2, align 8
  %cmp3 = icmp eq ptr %2, null
  br i1 %cmp3, label %for.inc, label %if.then

if.then:                                          ; preds = %land.lhs.true
  %arrayidx5 = getelementptr inbounds double, ptr %out, i64 %indvars.iv
  store double 5.000000e-01, ptr %arrayidx5, align 8
  br label %for.inc

for.inc:                                          ; preds = %land.lhs.true, %for.body, %if.then
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
  br i1 %exitcond, label %for.end, label %for.body

for.end:                                          ; preds = %for.inc, %entry
  ret void
}
;typedef int (*fp)();
;void foo8 (double* __restrict__ out, fp* __restrict__ in, bool * __restrict__ trigger, unsigned size) {
;
;  for (unsigned i=0; i<size; i++)
;    if (trigger[i] && (in[i] != 0))
;      out[i] = (double) 0.5;
;}
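;
; The short-circuit && above is vectorized with nested masking, which the CHECK
; lines below verify: the first condition (trigger[i] & 1) supplies the mask for
; a masked load of in[], and the store mask is the conjunction of both
; conditions. A sketch of the pattern with illustrative value names, not values
; from this test (VF 4 as in the AVX1/AVX2 checks; AVX512 uses 8-wide masks):
;
;   %trigger.zero = icmp eq <4 x i8> %trigger.bits, zeroinitializer
;   %m1 = xor <4 x i1> %trigger.zero, <i1 true, i1 true, i1 true, i1 true>
;   %ld = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr %p, i32 8, <4 x i1> %m1, <4 x ptr> poison)
;   %ld.null = icmp eq <4 x ptr> %ld, zeroinitializer
;   %m2 = xor <4 x i1> %ld.null, <i1 true, i1 true, i1 true, i1 true>
;   %mask = select <4 x i1> %m1, <4 x i1> %m2, <4 x i1> zeroinitializer
;
; The conjunction is a select rather than an and: lanes disabled in %m1 are
; poison in the masked-load result, and the select keeps them out of the store
; mask regardless of that poison.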
define void @foo8(ptr noalias nocapture %out, ptr noalias nocapture readonly %in, ptr noalias nocapture readonly %trigger, i32 %size) local_unnamed_addr #0 {
; AVX1-LABEL: @foo8(
; AVX1-NEXT: entry:
; AVX1-NEXT: [[CMP5:%.*]] = icmp eq i32 [[SIZE:%.*]], 0
; AVX1-NEXT: br i1 [[CMP5]], label [[FOR_END:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
; AVX1: for.body.preheader:
; AVX1-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[SIZE]] to i64
; AVX1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 16
; AVX1-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; AVX1: vector.ph:
; AVX1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 16
; AVX1-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
; AVX1-NEXT: br label [[VECTOR_BODY:%.*]]
; AVX1: vector.body:
; AVX1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; AVX1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
; AVX1-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[TRIGGER:%.*]], i64 [[TMP0]]
; AVX1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
; AVX1-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 4
; AVX1-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 8
; AVX1-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 12
; AVX1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP2]], align 1
; AVX1-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i8>, ptr [[TMP3]], align 1
; AVX1-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP4]], align 1
; AVX1-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i8>, ptr [[TMP5]], align 1
; AVX1-NEXT: [[TMP6:%.*]] = and <4 x i8> [[WIDE_LOAD]], <i8 1, i8 1, i8 1, i8 1>
; AVX1-NEXT: [[TMP7:%.*]] = and <4 x i8> [[WIDE_LOAD1]], <i8 1, i8 1, i8 1, i8 1>
; AVX1-NEXT: [[TMP8:%.*]] = and <4 x i8> [[WIDE_LOAD2]], <i8 1, i8 1, i8 1, i8 1>
; AVX1-NEXT: [[TMP9:%.*]] = and <4 x i8> [[WIDE_LOAD3]], <i8 1, i8 1, i8 1, i8 1>
; AVX1-NEXT: [[TMP10:%.*]] = icmp eq <4 x i8> [[TMP6]], zeroinitializer
; AVX1-NEXT: [[TMP11:%.*]] = icmp eq <4 x i8> [[TMP7]], zeroinitializer
; AVX1-NEXT: [[TMP12:%.*]] = icmp eq <4 x i8> [[TMP8]], zeroinitializer
; AVX1-NEXT: [[TMP13:%.*]] = icmp eq <4 x i8> [[TMP9]], zeroinitializer
; AVX1-NEXT: [[TMP14:%.*]] = xor <4 x i1> [[TMP10]], <i1 true, i1 true, i1 true, i1 true>
; AVX1-NEXT: [[TMP15:%.*]] = xor <4 x i1> [[TMP11]], <i1 true, i1 true, i1 true, i1 true>
; AVX1-NEXT: [[TMP16:%.*]] = xor <4 x i1> [[TMP12]], <i1 true, i1 true, i1 true, i1 true>
; AVX1-NEXT: [[TMP17:%.*]] = xor <4 x i1> [[TMP13]], <i1 true, i1 true, i1 true, i1 true>
; AVX1-NEXT: [[TMP18:%.*]] = getelementptr ptr, ptr [[IN:%.*]], i64 [[TMP0]]
; AVX1-NEXT: [[TMP19:%.*]] = getelementptr ptr, ptr [[TMP18]], i32 0
; AVX1-NEXT: [[TMP20:%.*]] = getelementptr ptr, ptr [[TMP18]], i32 4
; AVX1-NEXT: [[TMP21:%.*]] = getelementptr ptr, ptr [[TMP18]], i32 8
; AVX1-NEXT: [[TMP22:%.*]] = getelementptr ptr, ptr [[TMP18]], i32 12
; AVX1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP19]], i32 8, <4 x i1> [[TMP14]], <4 x ptr> poison)
; AVX1-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP20]], i32 8, <4 x i1> [[TMP15]], <4 x ptr> poison)
; AVX1-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP21]], i32 8, <4 x i1> [[TMP16]], <4 x ptr> poison)
; AVX1-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP22]], i32 8, <4 x i1> [[TMP17]], <4 x ptr> poison)
; AVX1-NEXT: [[TMP23:%.*]] = icmp eq <4 x ptr> [[WIDE_MASKED_LOAD]], zeroinitializer
; AVX1-NEXT: [[TMP24:%.*]] = icmp eq <4 x ptr> [[WIDE_MASKED_LOAD4]], zeroinitializer
; AVX1-NEXT: [[TMP25:%.*]] = icmp eq <4 x ptr> [[WIDE_MASKED_LOAD5]], zeroinitializer
; AVX1-NEXT: [[TMP26:%.*]] = icmp eq <4 x ptr> [[WIDE_MASKED_LOAD6]], zeroinitializer
; AVX1-NEXT: [[TMP27:%.*]] = xor <4 x i1> [[TMP23]], <i1 true, i1 true, i1 true, i1 true>
; AVX1-NEXT: [[TMP28:%.*]] = xor <4 x i1> [[TMP24]], <i1 true, i1 true, i1 true, i1 true>
; AVX1-NEXT: [[TMP29:%.*]] = xor <4 x i1> [[TMP25]], <i1 true, i1 true, i1 true, i1 true>
; AVX1-NEXT: [[TMP30:%.*]] = xor <4 x i1> [[TMP26]], <i1 true, i1 true, i1 true, i1 true>
; AVX1-NEXT: [[TMP31:%.*]] = select <4 x i1> [[TMP14]], <4 x i1> [[TMP27]], <4 x i1> zeroinitializer
; AVX1-NEXT: [[TMP32:%.*]] = select <4 x i1> [[TMP15]], <4 x i1> [[TMP28]], <4 x i1> zeroinitializer
; AVX1-NEXT: [[TMP33:%.*]] = select <4 x i1> [[TMP16]], <4 x i1> [[TMP29]], <4 x i1> zeroinitializer
; AVX1-NEXT: [[TMP34:%.*]] = select <4 x i1> [[TMP17]], <4 x i1> [[TMP30]], <4 x i1> zeroinitializer
; AVX1-NEXT: [[TMP35:%.*]] = getelementptr double, ptr [[OUT:%.*]], i64 [[TMP0]]
; AVX1-NEXT: [[TMP36:%.*]] = getelementptr double, ptr [[TMP35]], i32 0
; AVX1-NEXT: [[TMP37:%.*]] = getelementptr double, ptr [[TMP35]], i32 4
; AVX1-NEXT: [[TMP38:%.*]] = getelementptr double, ptr [[TMP35]], i32 8
; AVX1-NEXT: [[TMP39:%.*]] = getelementptr double, ptr [[TMP35]], i32 12
; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, ptr [[TMP36]], i32 8, <4 x i1> [[TMP31]])
; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, ptr [[TMP37]], i32 8, <4 x i1> [[TMP32]])
; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, ptr [[TMP38]], i32 8, <4 x i1> [[TMP33]])
; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, ptr [[TMP39]], i32 8, <4 x i1> [[TMP34]])
; AVX1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; AVX1-NEXT: [[TMP40:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; AVX1-NEXT: br i1 [[TMP40]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
; AVX1: middle.block:
; AVX1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
; AVX1-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
; AVX1: scalar.ph:
; AVX1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
; AVX1-NEXT: br label [[FOR_BODY:%.*]]
; AVX1: for.body:
; AVX1-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
; AVX1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[TRIGGER]], i64 [[INDVARS_IV]]
; AVX1-NEXT: [[TMP41:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
; AVX1-NEXT: [[TMP42:%.*]] = and i8 [[TMP41]], 1
; AVX1-NEXT: [[TOBOOL:%.*]] = icmp eq i8 [[TMP42]], 0
; AVX1-NEXT: br i1 [[TOBOOL]], label [[FOR_INC]], label [[LAND_LHS_TRUE:%.*]]
; AVX1: land.lhs.true:
; AVX1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds ptr, ptr [[IN]], i64 [[INDVARS_IV]]
; AVX1-NEXT: [[TMP43:%.*]] = load ptr, ptr [[ARRAYIDX2]], align 8
; AVX1-NEXT: [[CMP3:%.*]] = icmp eq ptr [[TMP43]], null
; AVX1-NEXT: br i1 [[CMP3]], label [[FOR_INC]], label [[IF_THEN:%.*]]
; AVX1: if.then:
; AVX1-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds double, ptr [[OUT]], i64 [[INDVARS_IV]]
; AVX1-NEXT: store double 5.000000e-01, ptr [[ARRAYIDX5]], align 8
; AVX1-NEXT: br label [[FOR_INC]]
; AVX1: for.inc:
; AVX1-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; AVX1-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
; AVX1-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
; AVX1: for.end.loopexit:
; AVX1-NEXT: br label [[FOR_END]]
; AVX1: for.end:
; AVX1-NEXT: ret void
;
; AVX2-LABEL: @foo8(
; AVX2-NEXT: entry:
; AVX2-NEXT: [[CMP5:%.*]] = icmp eq i32 [[SIZE:%.*]], 0
; AVX2-NEXT: br i1 [[CMP5]], label [[FOR_END:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
; AVX2: for.body.preheader:
; AVX2-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[SIZE]] to i64
; AVX2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 16
; AVX2-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; AVX2: vector.ph:
; AVX2-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 16
; AVX2-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
; AVX2-NEXT: br label [[VECTOR_BODY:%.*]]
; AVX2: vector.body:
; AVX2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; AVX2-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
; AVX2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[TRIGGER:%.*]], i64 [[TMP0]]
; AVX2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
; AVX2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 4
; AVX2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 8
; AVX2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 12
; AVX2-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP2]], align 1
; AVX2-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i8>, ptr [[TMP3]], align 1
; AVX2-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP4]], align 1
; AVX2-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i8>, ptr [[TMP5]], align 1
; AVX2-NEXT: [[TMP6:%.*]] = and <4 x i8> [[WIDE_LOAD]], <i8 1, i8 1, i8 1, i8 1>
; AVX2-NEXT: [[TMP7:%.*]] = and <4 x i8> [[WIDE_LOAD1]], <i8 1, i8 1, i8 1, i8 1>
; AVX2-NEXT: [[TMP8:%.*]] = and <4 x i8> [[WIDE_LOAD2]], <i8 1, i8 1, i8 1, i8 1>
; AVX2-NEXT: [[TMP9:%.*]] = and <4 x i8> [[WIDE_LOAD3]], <i8 1, i8 1, i8 1, i8 1>
; AVX2-NEXT: [[TMP10:%.*]] = icmp eq <4 x i8> [[TMP6]], zeroinitializer
; AVX2-NEXT: [[TMP11:%.*]] = icmp eq <4 x i8> [[TMP7]], zeroinitializer
; AVX2-NEXT: [[TMP12:%.*]] = icmp eq <4 x i8> [[TMP8]], zeroinitializer
; AVX2-NEXT: [[TMP13:%.*]] = icmp eq <4 x i8> [[TMP9]], zeroinitializer
; AVX2-NEXT: [[TMP14:%.*]] = xor <4 x i1> [[TMP10]], <i1 true, i1 true, i1 true, i1 true>
; AVX2-NEXT: [[TMP15:%.*]] = xor <4 x i1> [[TMP11]], <i1 true, i1 true, i1 true, i1 true>
; AVX2-NEXT: [[TMP16:%.*]] = xor <4 x i1> [[TMP12]], <i1 true, i1 true, i1 true, i1 true>
; AVX2-NEXT: [[TMP17:%.*]] = xor <4 x i1> [[TMP13]], <i1 true, i1 true, i1 true, i1 true>
; AVX2-NEXT: [[TMP18:%.*]] = getelementptr ptr, ptr [[IN:%.*]], i64 [[TMP0]]
; AVX2-NEXT: [[TMP19:%.*]] = getelementptr ptr, ptr [[TMP18]], i32 0
; AVX2-NEXT: [[TMP20:%.*]] = getelementptr ptr, ptr [[TMP18]], i32 4
; AVX2-NEXT: [[TMP21:%.*]] = getelementptr ptr, ptr [[TMP18]], i32 8
; AVX2-NEXT: [[TMP22:%.*]] = getelementptr ptr, ptr [[TMP18]], i32 12
; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP19]], i32 8, <4 x i1> [[TMP14]], <4 x ptr> poison)
; AVX2-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP20]], i32 8, <4 x i1> [[TMP15]], <4 x ptr> poison)
; AVX2-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP21]], i32 8, <4 x i1> [[TMP16]], <4 x ptr> poison)
; AVX2-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP22]], i32 8, <4 x i1> [[TMP17]], <4 x ptr> poison)
; AVX2-NEXT: [[TMP23:%.*]] = icmp eq <4 x ptr> [[WIDE_MASKED_LOAD]], zeroinitializer
; AVX2-NEXT: [[TMP24:%.*]] = icmp eq <4 x ptr> [[WIDE_MASKED_LOAD4]], zeroinitializer
; AVX2-NEXT: [[TMP25:%.*]] = icmp eq <4 x ptr> [[WIDE_MASKED_LOAD5]], zeroinitializer
; AVX2-NEXT: [[TMP26:%.*]] = icmp eq <4 x ptr> [[WIDE_MASKED_LOAD6]], zeroinitializer
; AVX2-NEXT: [[TMP27:%.*]] = xor <4 x i1> [[TMP23]], <i1 true, i1 true, i1 true, i1 true>
; AVX2-NEXT: [[TMP28:%.*]] = xor <4 x i1> [[TMP24]], <i1 true, i1 true, i1 true, i1 true>
; AVX2-NEXT: [[TMP29:%.*]] = xor <4 x i1> [[TMP25]], <i1 true, i1 true, i1 true, i1 true>
; AVX2-NEXT: [[TMP30:%.*]] = xor <4 x i1> [[TMP26]], <i1 true, i1 true, i1 true, i1 true>
; AVX2-NEXT: [[TMP31:%.*]] = select <4 x i1> [[TMP14]], <4 x i1> [[TMP27]], <4 x i1> zeroinitializer
; AVX2-NEXT: [[TMP32:%.*]] = select <4 x i1> [[TMP15]], <4 x i1> [[TMP28]], <4 x i1> zeroinitializer
; AVX2-NEXT: [[TMP33:%.*]] = select <4 x i1> [[TMP16]], <4 x i1> [[TMP29]], <4 x i1> zeroinitializer
; AVX2-NEXT: [[TMP34:%.*]] = select <4 x i1> [[TMP17]], <4 x i1> [[TMP30]], <4 x i1> zeroinitializer
; AVX2-NEXT: [[TMP35:%.*]] = getelementptr double, ptr [[OUT:%.*]], i64 [[TMP0]]
; AVX2-NEXT: [[TMP36:%.*]] = getelementptr double, ptr [[TMP35]], i32 0
; AVX2-NEXT: [[TMP37:%.*]] = getelementptr double, ptr [[TMP35]], i32 4
; AVX2-NEXT: [[TMP38:%.*]] = getelementptr double, ptr [[TMP35]], i32 8
; AVX2-NEXT: [[TMP39:%.*]] = getelementptr double, ptr [[TMP35]], i32 12
; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, ptr [[TMP36]], i32 8, <4 x i1> [[TMP31]])
; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, ptr [[TMP37]], i32 8, <4 x i1> [[TMP32]])
; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, ptr [[TMP38]], i32 8, <4 x i1> [[TMP33]])
; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, ptr [[TMP39]], i32 8, <4 x i1> [[TMP34]])
; AVX2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; AVX2-NEXT: [[TMP40:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; AVX2-NEXT: br i1 [[TMP40]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]]
; AVX2: middle.block:
; AVX2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
; AVX2-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
; AVX2: scalar.ph:
; AVX2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
; AVX2-NEXT: br label [[FOR_BODY:%.*]]
; AVX2: for.body:
; AVX2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
; AVX2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[TRIGGER]], i64 [[INDVARS_IV]]
; AVX2-NEXT: [[TMP41:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
; AVX2-NEXT: [[TMP42:%.*]] = and i8 [[TMP41]], 1
; AVX2-NEXT: [[TOBOOL:%.*]] = icmp eq i8 [[TMP42]], 0
; AVX2-NEXT: br i1 [[TOBOOL]], label [[FOR_INC]], label [[LAND_LHS_TRUE:%.*]]
; AVX2: land.lhs.true:
; AVX2-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds ptr, ptr [[IN]], i64 [[INDVARS_IV]]
; AVX2-NEXT: [[TMP43:%.*]] = load ptr, ptr [[ARRAYIDX2]], align 8
; AVX2-NEXT: [[CMP3:%.*]] = icmp eq ptr [[TMP43]], null
; AVX2-NEXT: br i1 [[CMP3]], label [[FOR_INC]], label [[IF_THEN:%.*]]
; AVX2: if.then:
; AVX2-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds double, ptr [[OUT]], i64 [[INDVARS_IV]]
; AVX2-NEXT: store double 5.000000e-01, ptr [[ARRAYIDX5]], align 8
; AVX2-NEXT: br label [[FOR_INC]]
; AVX2: for.inc:
; AVX2-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; AVX2-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
; AVX2-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP31:![0-9]+]]
; AVX2: for.end.loopexit:
; AVX2-NEXT: br label [[FOR_END]]
; AVX2: for.end:
; AVX2-NEXT: ret void
;
; AVX512-LABEL: @foo8(
; AVX512-NEXT: entry:
; AVX512-NEXT: [[CMP5:%.*]] = icmp eq i32 [[SIZE:%.*]], 0
; AVX512-NEXT: br i1 [[CMP5]], label [[FOR_END:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
; AVX512: for.body.preheader:
; AVX512-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[SIZE]] to i64
; AVX512-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 32
; AVX512-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; AVX512: vector.ph:
; AVX512-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 32
; AVX512-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
; AVX512-NEXT: br label [[VECTOR_BODY:%.*]]
; AVX512: vector.body:
; AVX512-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; AVX512-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
; AVX512-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[TRIGGER:%.*]], i64 [[TMP0]]
; AVX512-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
; AVX512-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 8
; AVX512-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 16
; AVX512-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 24
; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP2]], align 1
; AVX512-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x i8>, ptr [[TMP3]], align 1
; AVX512-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x i8>, ptr [[TMP4]], align 1
; AVX512-NEXT: [[WIDE_LOAD3:%.*]] = load <8 x i8>, ptr [[TMP5]], align 1
; AVX512-NEXT: [[TMP6:%.*]] = and <8 x i8> [[WIDE_LOAD]], <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
; AVX512-NEXT: [[TMP7:%.*]] = and <8 x i8> [[WIDE_LOAD1]], <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
; AVX512-NEXT: [[TMP8:%.*]] = and <8 x i8> [[WIDE_LOAD2]], <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
; AVX512-NEXT: [[TMP9:%.*]] = and <8 x i8> [[WIDE_LOAD3]], <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
; AVX512-NEXT: [[TMP10:%.*]] = icmp eq <8 x i8> [[TMP6]], zeroinitializer
; AVX512-NEXT: [[TMP11:%.*]] = icmp eq <8 x i8> [[TMP7]], zeroinitializer
; AVX512-NEXT: [[TMP12:%.*]] = icmp eq <8 x i8> [[TMP8]], zeroinitializer
; AVX512-NEXT: [[TMP13:%.*]] = icmp eq <8 x i8> [[TMP9]], zeroinitializer
; AVX512-NEXT: [[TMP14:%.*]] = xor <8 x i1> [[TMP10]], <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>
; AVX512-NEXT: [[TMP15:%.*]] = xor <8 x i1> [[TMP11]], <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>
; AVX512-NEXT: [[TMP16:%.*]] = xor <8 x i1> [[TMP12]], <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>
; AVX512-NEXT: [[TMP17:%.*]] = xor <8 x i1> [[TMP13]], <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>
; AVX512-NEXT: [[TMP18:%.*]] = getelementptr ptr, ptr [[IN:%.*]], i64 [[TMP0]]
; AVX512-NEXT: [[TMP19:%.*]] = getelementptr ptr, ptr [[TMP18]], i32 0
; AVX512-NEXT: [[TMP20:%.*]] = getelementptr ptr, ptr [[TMP18]], i32 8
; AVX512-NEXT: [[TMP21:%.*]] = getelementptr ptr, ptr [[TMP18]], i32 16
; AVX512-NEXT: [[TMP22:%.*]] = getelementptr ptr, ptr [[TMP18]], i32 24
; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x ptr> @llvm.masked.load.v8p0.p0(ptr [[TMP19]], i32 8, <8 x i1> [[TMP14]], <8 x ptr> poison)
; AVX512-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <8 x ptr> @llvm.masked.load.v8p0.p0(ptr [[TMP20]], i32 8, <8 x i1> [[TMP15]], <8 x ptr> poison)
; AVX512-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <8 x ptr> @llvm.masked.load.v8p0.p0(ptr [[TMP21]], i32 8, <8 x i1> [[TMP16]], <8 x ptr> poison)
; AVX512-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <8 x ptr> @llvm.masked.load.v8p0.p0(ptr [[TMP22]], i32 8, <8 x i1> [[TMP17]], <8 x ptr> poison)
; AVX512-NEXT: [[TMP23:%.*]] = icmp eq <8 x ptr> [[WIDE_MASKED_LOAD]], zeroinitializer
; AVX512-NEXT: [[TMP24:%.*]] = icmp eq <8 x ptr> [[WIDE_MASKED_LOAD4]], zeroinitializer
; AVX512-NEXT: [[TMP25:%.*]] = icmp eq <8 x ptr> [[WIDE_MASKED_LOAD5]], zeroinitializer
; AVX512-NEXT: [[TMP26:%.*]] = icmp eq <8 x ptr> [[WIDE_MASKED_LOAD6]], zeroinitializer
; AVX512-NEXT: [[TMP27:%.*]] = xor <8 x i1> [[TMP23]], <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>
; AVX512-NEXT: [[TMP28:%.*]] = xor <8 x i1> [[TMP24]], <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>
; AVX512-NEXT: [[TMP29:%.*]] = xor <8 x i1> [[TMP25]], <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>
; AVX512-NEXT: [[TMP30:%.*]] = xor <8 x i1> [[TMP26]], <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>
; AVX512-NEXT: [[TMP31:%.*]] = select <8 x i1> [[TMP14]], <8 x i1> [[TMP27]], <8 x i1> zeroinitializer
; AVX512-NEXT: [[TMP32:%.*]] = select <8 x i1> [[TMP15]], <8 x i1> [[TMP28]], <8 x i1> zeroinitializer
; AVX512-NEXT: [[TMP33:%.*]] = select <8 x i1> [[TMP16]], <8 x i1> [[TMP29]], <8 x i1> zeroinitializer
; AVX512-NEXT: [[TMP34:%.*]] = select <8 x i1> [[TMP17]], <8 x i1> [[TMP30]], <8 x i1> zeroinitializer
; AVX512-NEXT: [[TMP35:%.*]] = getelementptr double, ptr [[OUT:%.*]], i64 [[TMP0]]
; AVX512-NEXT: [[TMP36:%.*]] = getelementptr double, ptr [[TMP35]], i32 0
; AVX512-NEXT: [[TMP37:%.*]] = getelementptr double, ptr [[TMP35]], i32 8
; AVX512-NEXT: [[TMP38:%.*]] = getelementptr double, ptr [[TMP35]], i32 16
; AVX512-NEXT: [[TMP39:%.*]] = getelementptr double, ptr [[TMP35]], i32 24
; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, ptr [[TMP36]], i32 8, <8 x i1> [[TMP31]])
; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, ptr [[TMP37]], i32 8, <8 x i1> [[TMP32]])
; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, ptr [[TMP38]], i32 8, <8 x i1> [[TMP33]])
; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, ptr [[TMP39]], i32 8, <8 x i1> [[TMP34]])
; AVX512-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
; AVX512-NEXT: [[TMP40:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; AVX512-NEXT: br i1 [[TMP40]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP43:![0-9]+]]
; AVX512: middle.block:
; AVX512-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
; AVX512-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
; AVX512: scalar.ph:
; AVX512-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
; AVX512-NEXT: br label [[FOR_BODY:%.*]]
; AVX512: for.body:
; AVX512-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
; AVX512-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[TRIGGER]], i64 [[INDVARS_IV]]
; AVX512-NEXT: [[TMP41:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
; AVX512-NEXT: [[TMP42:%.*]] = and i8 [[TMP41]], 1
; AVX512-NEXT: [[TOBOOL:%.*]] = icmp eq i8 [[TMP42]], 0
; AVX512-NEXT: br i1 [[TOBOOL]], label [[FOR_INC]], label [[LAND_LHS_TRUE:%.*]]
; AVX512: land.lhs.true:
; AVX512-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds ptr, ptr [[IN]], i64 [[INDVARS_IV]]
; AVX512-NEXT: [[TMP43:%.*]] = load ptr, ptr [[ARRAYIDX2]], align 8
; AVX512-NEXT: [[CMP3:%.*]] = icmp eq ptr [[TMP43]], null
; AVX512-NEXT: br i1 [[CMP3]], label [[FOR_INC]], label [[IF_THEN:%.*]]
; AVX512: if.then:
; AVX512-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds double, ptr [[OUT]], i64 [[INDVARS_IV]]
; AVX512-NEXT: store double 5.000000e-01, ptr [[ARRAYIDX5]], align 8
; AVX512-NEXT: br label [[FOR_INC]]
; AVX512: for.inc:
; AVX512-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; AVX512-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
; AVX512-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP44:![0-9]+]]
; AVX512: for.end.loopexit:
; AVX512-NEXT: br label [[FOR_END]]
; AVX512: for.end:
; AVX512-NEXT: ret void
;
entry:
  %cmp5 = icmp eq i32 %size, 0
  br i1 %cmp5, label %for.end, label %for.body.preheader

for.body.preheader:                               ; preds = %entry
  %wide.trip.count = zext i32 %size to i64
  br label %for.body

for.body:                                         ; preds = %for.inc, %for.body.preheader
  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ]
  %arrayidx = getelementptr inbounds i8, ptr %trigger, i64 %indvars.iv
  %0 = load i8, ptr %arrayidx, align 1
  %1 = and i8 %0, 1
  %tobool = icmp eq i8 %1, 0
  br i1 %tobool, label %for.inc, label %land.lhs.true

land.lhs.true:                                    ; preds = %for.body
  %arrayidx2 = getelementptr inbounds ptr, ptr %in, i64 %indvars.iv
  %2 = load ptr, ptr %arrayidx2, align 8
  %cmp3 = icmp eq ptr %2, null
  br i1 %cmp3, label %for.inc, label %if.then

if.then:                                          ; preds = %land.lhs.true
  %arrayidx5 = getelementptr inbounds double, ptr %out, i64 %indvars.iv
  store double 5.000000e-01, ptr %arrayidx5, align 8
  br label %for.inc

for.inc:                                          ; preds = %land.lhs.true, %for.body, %if.then
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
  br i1 %exitcond, label %for.end, label %for.body

for.end:                                          ; preds = %for.inc, %entry
  ret void
}
attributes #0 = { norecurse nounwind }