llvm/llvm/test/Transforms/LoopIdiom/RISCV/byte-compare-index.ll

; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
; RUN: opt -passes=loop-idiom-vectorize -mtriple=riscv64-unknown-linux-gnu -loop-idiom-vectorize-style=predicated -mattr=+v -S < %s | FileCheck %s
; RUN: opt -passes=loop-idiom-vectorize -mtriple=riscv64-unknown-linux-gnu -loop-idiom-vectorize-style=predicated -loop-idiom-vectorize-bytecmp-vf=64 -mattr=+v -S < %s | FileCheck %s --check-prefix=LMUL8
; RUN: opt -passes='loop(loop-idiom-vectorize),simplifycfg' -mtriple=riscv64-unknown-linux-gnu -loop-idiom-vectorize-style=predicated -mattr=+v -S < %s | FileCheck %s --check-prefix=LOOP-DEL
; RUN: opt -passes=loop-idiom-vectorize -mtriple=riscv64-unknown-linux-gnu -loop-idiom-vectorize-style=masked -mattr=+v -S < %s | FileCheck %s --check-prefix=MASKED

define i32 @compare_bytes_simple(ptr %a, ptr %b, i32 %len, i32 %n) {
; CHECK-LABEL: define i32 @compare_bytes_simple(
; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[LEN]], 1
; CHECK-NEXT:    br label [[MISMATCH_MIN_IT_CHECK:%.*]]
; CHECK:       mismatch_min_it_check:
; CHECK-NEXT:    [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
; CHECK-NEXT:    [[TMP2:%.*]] = zext i32 [[N]] to i64
; CHECK-NEXT:    [[TMP3:%.*]] = icmp ule i32 [[TMP0]], [[N]]
; CHECK-NEXT:    br i1 [[TMP3]], label [[MISMATCH_MEM_CHECK:%.*]], label [[MISMATCH_LOOP_PRE:%.*]], !prof [[PROF0:![0-9]+]]
; CHECK:       mismatch_mem_check:
; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP1]]
; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP1]]
; CHECK-NEXT:    [[TMP8:%.*]] = ptrtoint ptr [[TMP5]] to i64
; CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[TMP4]] to i64
; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]]
; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]]
; CHECK-NEXT:    [[TMP11:%.*]] = ptrtoint ptr [[TMP10]] to i64
; CHECK-NEXT:    [[TMP14:%.*]] = ptrtoint ptr [[TMP13]] to i64
; CHECK-NEXT:    [[TMP6:%.*]] = lshr i64 [[TMP7]], 12
; CHECK-NEXT:    [[TMP9:%.*]] = lshr i64 [[TMP11]], 12
; CHECK-NEXT:    [[TMP12:%.*]] = lshr i64 [[TMP8]], 12
; CHECK-NEXT:    [[TMP15:%.*]] = lshr i64 [[TMP14]], 12
; CHECK-NEXT:    [[TMP16:%.*]] = icmp ne i64 [[TMP6]], [[TMP9]]
; CHECK-NEXT:    [[TMP17:%.*]] = icmp ne i64 [[TMP12]], [[TMP15]]
; CHECK-NEXT:    [[TMP18:%.*]] = or i1 [[TMP16]], [[TMP17]]
; CHECK-NEXT:    br i1 [[TMP18]], label [[MISMATCH_LOOP_PRE]], label [[MISMATCH_VECTOR_LOOP_PREHEADER:%.*]], !prof [[PROF1:![0-9]+]]
; CHECK:       mismatch_vec_loop_preheader:
; CHECK-NEXT:    br label [[MISMATCH_VECTOR_LOOP:%.*]]
; CHECK:       mismatch_vec_loop:
; CHECK-NEXT:    [[MISMATCH_VECTOR_INDEX:%.*]] = phi i64 [ [[TMP1]], [[MISMATCH_VECTOR_LOOP_PREHEADER]] ], [ [[TMP24:%.*]], [[MISMATCH_VECTOR_LOOP_INC:%.*]] ]
; CHECK-NEXT:    [[AVL:%.*]] = sub nuw nsw i64 [[TMP2]], [[MISMATCH_VECTOR_INDEX]]
; CHECK-NEXT:    [[TMP19:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 16, i1 true)
; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[MISMATCH_VECTOR_INDEX]]
; CHECK-NEXT:    [[LHS_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr [[TMP20]], <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[MISMATCH_VECTOR_INDEX]]
; CHECK-NEXT:    [[RHS_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr [[TMP21]], <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
; CHECK-NEXT:    [[MISMATCH_CMP:%.*]] = call <vscale x 16 x i1> @llvm.vp.icmp.nxv16i8(<vscale x 16 x i8> [[LHS_LOAD]], <vscale x 16 x i8> [[RHS_LOAD]], metadata !"ne", <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
; CHECK-NEXT:    [[FIRST:%.*]] = call i32 @llvm.vp.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> [[MISMATCH_CMP]], i1 false, <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
; CHECK-NEXT:    [[TMP22:%.*]] = icmp ne i32 [[FIRST]], [[TMP19]]
; CHECK-NEXT:    br i1 [[TMP22]], label [[MISMATCH_VECTOR_LOOP_FOUND:%.*]], label [[MISMATCH_VECTOR_LOOP_INC]]
; CHECK:       mismatch_vec_loop_inc:
; CHECK-NEXT:    [[TMP23:%.*]] = zext i32 [[TMP19]] to i64
; CHECK-NEXT:    [[TMP24]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX]], [[TMP23]]
; CHECK-NEXT:    [[TMP25:%.*]] = icmp ne i64 [[TMP24]], [[TMP2]]
; CHECK-NEXT:    br i1 [[TMP25]], label [[MISMATCH_VECTOR_LOOP]], label [[MISMATCH_END:%.*]]
; CHECK:       mismatch_vec_loop_found:
; CHECK-NEXT:    [[FIRST1:%.*]] = phi i32 [ [[FIRST]], [[MISMATCH_VECTOR_LOOP]] ]
; CHECK-NEXT:    [[MISMATCH_VECTOR_INDEX2:%.*]] = phi i64 [ [[MISMATCH_VECTOR_INDEX]], [[MISMATCH_VECTOR_LOOP]] ]
; CHECK-NEXT:    [[TMP26:%.*]] = zext i32 [[FIRST1]] to i64
; CHECK-NEXT:    [[TMP27:%.*]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX2]], [[TMP26]]
; CHECK-NEXT:    [[TMP28:%.*]] = trunc i64 [[TMP27]] to i32
; CHECK-NEXT:    br label [[MISMATCH_END]]
; CHECK:       mismatch_loop_pre:
; CHECK-NEXT:    br label [[MISMATCH_LOOP:%.*]]
; CHECK:       mismatch_loop:
; CHECK-NEXT:    [[MISMATCH_INDEX:%.*]] = phi i32 [ [[TMP0]], [[MISMATCH_LOOP_PRE]] ], [ [[TMP35:%.*]], [[MISMATCH_LOOP_INC:%.*]] ]
; CHECK-NEXT:    [[TMP29:%.*]] = zext i32 [[MISMATCH_INDEX]] to i64
; CHECK-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP29]]
; CHECK-NEXT:    [[TMP31:%.*]] = load i8, ptr [[TMP30]], align 1
; CHECK-NEXT:    [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP29]]
; CHECK-NEXT:    [[TMP33:%.*]] = load i8, ptr [[TMP32]], align 1
; CHECK-NEXT:    [[TMP34:%.*]] = icmp eq i8 [[TMP31]], [[TMP33]]
; CHECK-NEXT:    br i1 [[TMP34]], label [[MISMATCH_LOOP_INC]], label [[MISMATCH_END]]
; CHECK:       mismatch_loop_inc:
; CHECK-NEXT:    [[TMP35]] = add i32 [[MISMATCH_INDEX]], 1
; CHECK-NEXT:    [[TMP36:%.*]] = icmp eq i32 [[TMP35]], [[N]]
; CHECK-NEXT:    br i1 [[TMP36]], label [[MISMATCH_END]], label [[MISMATCH_LOOP]]
; CHECK:       mismatch_end:
; CHECK-NEXT:    [[MISMATCH_RESULT:%.*]] = phi i32 [ [[N]], [[MISMATCH_LOOP_INC]] ], [ [[MISMATCH_INDEX]], [[MISMATCH_LOOP]] ], [ [[N]], [[MISMATCH_VECTOR_LOOP_INC]] ], [ [[TMP28]], [[MISMATCH_VECTOR_LOOP_FOUND]] ]
; CHECK-NEXT:    br i1 true, label [[BYTE_COMPARE:%.*]], label [[WHILE_COND:%.*]]
; CHECK:       while.cond:
; CHECK-NEXT:    [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[MISMATCH_END]] ], [ [[MISMATCH_RESULT]], [[WHILE_BODY:%.*]] ]
; CHECK-NEXT:    [[INC:%.*]] = add i32 [[LEN_ADDR]], 1
; CHECK-NEXT:    [[CMP_NOT:%.*]] = icmp eq i32 [[MISMATCH_RESULT]], [[N]]
; CHECK-NEXT:    br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]]
; CHECK:       while.body:
; CHECK-NEXT:    [[IDXPROM:%.*]] = zext i32 [[MISMATCH_RESULT]] to i64
; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]]
; CHECK-NEXT:    [[TMP37:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]]
; CHECK-NEXT:    [[TMP38:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
; CHECK-NEXT:    [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP37]], [[TMP38]]
; CHECK-NEXT:    br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]]
; CHECK:       byte.compare:
; CHECK-NEXT:    br label [[WHILE_END]]
; CHECK:       while.end:
; CHECK-NEXT:    [[INC_LCSSA:%.*]] = phi i32 [ [[MISMATCH_RESULT]], [[WHILE_BODY]] ], [ [[MISMATCH_RESULT]], [[WHILE_COND]] ], [ [[MISMATCH_RESULT]], [[BYTE_COMPARE]] ]
; CHECK-NEXT:    ret i32 [[INC_LCSSA]]
;
; LMUL8-LABEL: define i32 @compare_bytes_simple(
; LMUL8-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
; LMUL8-NEXT:  entry:
; LMUL8-NEXT:    [[TMP0:%.*]] = add i32 [[LEN]], 1
; LMUL8-NEXT:    br label [[MISMATCH_MIN_IT_CHECK:%.*]]
; LMUL8:       mismatch_min_it_check:
; LMUL8-NEXT:    [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
; LMUL8-NEXT:    [[TMP2:%.*]] = zext i32 [[N]] to i64
; LMUL8-NEXT:    [[TMP3:%.*]] = icmp ule i32 [[TMP0]], [[N]]
; LMUL8-NEXT:    br i1 [[TMP3]], label [[MISMATCH_MEM_CHECK:%.*]], label [[MISMATCH_LOOP_PRE:%.*]], !prof [[PROF0:![0-9]+]]
; LMUL8:       mismatch_mem_check:
; LMUL8-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP1]]
; LMUL8-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP1]]
; LMUL8-NEXT:    [[TMP8:%.*]] = ptrtoint ptr [[TMP5]] to i64
; LMUL8-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[TMP4]] to i64
; LMUL8-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]]
; LMUL8-NEXT:    [[TMP13:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]]
; LMUL8-NEXT:    [[TMP11:%.*]] = ptrtoint ptr [[TMP10]] to i64
; LMUL8-NEXT:    [[TMP14:%.*]] = ptrtoint ptr [[TMP13]] to i64
; LMUL8-NEXT:    [[TMP6:%.*]] = lshr i64 [[TMP7]], 12
; LMUL8-NEXT:    [[TMP9:%.*]] = lshr i64 [[TMP11]], 12
; LMUL8-NEXT:    [[TMP12:%.*]] = lshr i64 [[TMP8]], 12
; LMUL8-NEXT:    [[TMP15:%.*]] = lshr i64 [[TMP14]], 12
; LMUL8-NEXT:    [[TMP16:%.*]] = icmp ne i64 [[TMP6]], [[TMP9]]
; LMUL8-NEXT:    [[TMP17:%.*]] = icmp ne i64 [[TMP12]], [[TMP15]]
; LMUL8-NEXT:    [[TMP18:%.*]] = or i1 [[TMP16]], [[TMP17]]
; LMUL8-NEXT:    br i1 [[TMP18]], label [[MISMATCH_LOOP_PRE]], label [[MISMATCH_VECTOR_LOOP_PREHEADER:%.*]], !prof [[PROF1:![0-9]+]]
; LMUL8:       mismatch_vec_loop_preheader:
; LMUL8-NEXT:    br label [[MISMATCH_VECTOR_LOOP:%.*]]
; LMUL8:       mismatch_vec_loop:
; LMUL8-NEXT:    [[MISMATCH_VECTOR_INDEX:%.*]] = phi i64 [ [[TMP1]], [[MISMATCH_VECTOR_LOOP_PREHEADER]] ], [ [[TMP24:%.*]], [[MISMATCH_VECTOR_LOOP_INC:%.*]] ]
; LMUL8-NEXT:    [[AVL:%.*]] = sub nuw nsw i64 [[TMP2]], [[MISMATCH_VECTOR_INDEX]]
; LMUL8-NEXT:    [[TMP19:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 64, i1 true)
; LMUL8-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[MISMATCH_VECTOR_INDEX]]
; LMUL8-NEXT:    [[LHS_LOAD:%.*]] = call <vscale x 64 x i8> @llvm.vp.load.nxv64i8.p0(ptr [[TMP20]], <vscale x 64 x i1> shufflevector (<vscale x 64 x i1> insertelement (<vscale x 64 x i1> poison, i1 true, i64 0), <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer), i32 [[TMP19]])
; LMUL8-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[MISMATCH_VECTOR_INDEX]]
; LMUL8-NEXT:    [[RHS_LOAD:%.*]] = call <vscale x 64 x i8> @llvm.vp.load.nxv64i8.p0(ptr [[TMP21]], <vscale x 64 x i1> shufflevector (<vscale x 64 x i1> insertelement (<vscale x 64 x i1> poison, i1 true, i64 0), <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer), i32 [[TMP19]])
; LMUL8-NEXT:    [[MISMATCH_CMP:%.*]] = call <vscale x 64 x i1> @llvm.vp.icmp.nxv64i8(<vscale x 64 x i8> [[LHS_LOAD]], <vscale x 64 x i8> [[RHS_LOAD]], metadata !"ne", <vscale x 64 x i1> shufflevector (<vscale x 64 x i1> insertelement (<vscale x 64 x i1> poison, i1 true, i64 0), <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer), i32 [[TMP19]])
; LMUL8-NEXT:    [[FIRST:%.*]] = call i32 @llvm.vp.cttz.elts.i32.nxv64i1(<vscale x 64 x i1> [[MISMATCH_CMP]], i1 false, <vscale x 64 x i1> shufflevector (<vscale x 64 x i1> insertelement (<vscale x 64 x i1> poison, i1 true, i64 0), <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer), i32 [[TMP19]])
; LMUL8-NEXT:    [[TMP22:%.*]] = icmp ne i32 [[FIRST]], [[TMP19]]
; LMUL8-NEXT:    br i1 [[TMP22]], label [[MISMATCH_VECTOR_LOOP_FOUND:%.*]], label [[MISMATCH_VECTOR_LOOP_INC]]
; LMUL8:       mismatch_vec_loop_inc:
; LMUL8-NEXT:    [[TMP23:%.*]] = zext i32 [[TMP19]] to i64
; LMUL8-NEXT:    [[TMP24]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX]], [[TMP23]]
; LMUL8-NEXT:    [[TMP25:%.*]] = icmp ne i64 [[TMP24]], [[TMP2]]
; LMUL8-NEXT:    br i1 [[TMP25]], label [[MISMATCH_VECTOR_LOOP]], label [[MISMATCH_END:%.*]]
; LMUL8:       mismatch_vec_loop_found:
; LMUL8-NEXT:    [[FIRST1:%.*]] = phi i32 [ [[FIRST]], [[MISMATCH_VECTOR_LOOP]] ]
; LMUL8-NEXT:    [[MISMATCH_VECTOR_INDEX2:%.*]] = phi i64 [ [[MISMATCH_VECTOR_INDEX]], [[MISMATCH_VECTOR_LOOP]] ]
; LMUL8-NEXT:    [[TMP26:%.*]] = zext i32 [[FIRST1]] to i64
; LMUL8-NEXT:    [[TMP27:%.*]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX2]], [[TMP26]]
; LMUL8-NEXT:    [[TMP28:%.*]] = trunc i64 [[TMP27]] to i32
; LMUL8-NEXT:    br label [[MISMATCH_END]]
; LMUL8:       mismatch_loop_pre:
; LMUL8-NEXT:    br label [[MISMATCH_LOOP:%.*]]
; LMUL8:       mismatch_loop:
; LMUL8-NEXT:    [[MISMATCH_INDEX:%.*]] = phi i32 [ [[TMP0]], [[MISMATCH_LOOP_PRE]] ], [ [[TMP35:%.*]], [[MISMATCH_LOOP_INC:%.*]] ]
; LMUL8-NEXT:    [[TMP29:%.*]] = zext i32 [[MISMATCH_INDEX]] to i64
; LMUL8-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP29]]
; LMUL8-NEXT:    [[TMP31:%.*]] = load i8, ptr [[TMP30]], align 1
; LMUL8-NEXT:    [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP29]]
; LMUL8-NEXT:    [[TMP33:%.*]] = load i8, ptr [[TMP32]], align 1
; LMUL8-NEXT:    [[TMP34:%.*]] = icmp eq i8 [[TMP31]], [[TMP33]]
; LMUL8-NEXT:    br i1 [[TMP34]], label [[MISMATCH_LOOP_INC]], label [[MISMATCH_END]]
; LMUL8:       mismatch_loop_inc:
; LMUL8-NEXT:    [[TMP35]] = add i32 [[MISMATCH_INDEX]], 1
; LMUL8-NEXT:    [[TMP36:%.*]] = icmp eq i32 [[TMP35]], [[N]]
; LMUL8-NEXT:    br i1 [[TMP36]], label [[MISMATCH_END]], label [[MISMATCH_LOOP]]
; LMUL8:       mismatch_end:
; LMUL8-NEXT:    [[MISMATCH_RESULT:%.*]] = phi i32 [ [[N]], [[MISMATCH_LOOP_INC]] ], [ [[MISMATCH_INDEX]], [[MISMATCH_LOOP]] ], [ [[N]], [[MISMATCH_VECTOR_LOOP_INC]] ], [ [[TMP28]], [[MISMATCH_VECTOR_LOOP_FOUND]] ]
; LMUL8-NEXT:    br i1 true, label [[BYTE_COMPARE:%.*]], label [[WHILE_COND:%.*]]
; LMUL8:       while.cond:
; LMUL8-NEXT:    [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[MISMATCH_END]] ], [ [[MISMATCH_RESULT]], [[WHILE_BODY:%.*]] ]
; LMUL8-NEXT:    [[INC:%.*]] = add i32 [[LEN_ADDR]], 1
; LMUL8-NEXT:    [[CMP_NOT:%.*]] = icmp eq i32 [[MISMATCH_RESULT]], [[N]]
; LMUL8-NEXT:    br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]]
; LMUL8:       while.body:
; LMUL8-NEXT:    [[IDXPROM:%.*]] = zext i32 [[MISMATCH_RESULT]] to i64
; LMUL8-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]]
; LMUL8-NEXT:    [[TMP37:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
; LMUL8-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]]
; LMUL8-NEXT:    [[TMP38:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
; LMUL8-NEXT:    [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP37]], [[TMP38]]
; LMUL8-NEXT:    br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]]
; LMUL8:       byte.compare:
; LMUL8-NEXT:    br label [[WHILE_END]]
; LMUL8:       while.end:
; LMUL8-NEXT:    [[INC_LCSSA:%.*]] = phi i32 [ [[MISMATCH_RESULT]], [[WHILE_BODY]] ], [ [[MISMATCH_RESULT]], [[WHILE_COND]] ], [ [[MISMATCH_RESULT]], [[BYTE_COMPARE]] ]
; LMUL8-NEXT:    ret i32 [[INC_LCSSA]]
;
; LOOP-DEL-LABEL: define i32 @compare_bytes_simple(
; LOOP-DEL-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
; LOOP-DEL-NEXT:  entry:
; LOOP-DEL-NEXT:    [[TMP0:%.*]] = add i32 [[LEN]], 1
; LOOP-DEL-NEXT:    [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
; LOOP-DEL-NEXT:    [[TMP2:%.*]] = zext i32 [[N]] to i64
; LOOP-DEL-NEXT:    [[TMP3:%.*]] = icmp ule i32 [[TMP0]], [[N]]
; LOOP-DEL-NEXT:    br i1 [[TMP3]], label [[MISMATCH_MEM_CHECK:%.*]], label [[MISMATCH_LOOP_PRE:%.*]], !prof [[PROF0:![0-9]+]]
; LOOP-DEL:       mismatch_mem_check:
; LOOP-DEL-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP1]]
; LOOP-DEL-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP1]]
; LOOP-DEL-NEXT:    [[TMP8:%.*]] = ptrtoint ptr [[TMP5]] to i64
; LOOP-DEL-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[TMP4]] to i64
; LOOP-DEL-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]]
; LOOP-DEL-NEXT:    [[TMP13:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]]
; LOOP-DEL-NEXT:    [[TMP11:%.*]] = ptrtoint ptr [[TMP10]] to i64
; LOOP-DEL-NEXT:    [[TMP14:%.*]] = ptrtoint ptr [[TMP13]] to i64
; LOOP-DEL-NEXT:    [[TMP6:%.*]] = lshr i64 [[TMP7]], 12
; LOOP-DEL-NEXT:    [[TMP9:%.*]] = lshr i64 [[TMP11]], 12
; LOOP-DEL-NEXT:    [[TMP12:%.*]] = lshr i64 [[TMP8]], 12
; LOOP-DEL-NEXT:    [[TMP15:%.*]] = lshr i64 [[TMP14]], 12
; LOOP-DEL-NEXT:    [[TMP16:%.*]] = icmp ne i64 [[TMP6]], [[TMP9]]
; LOOP-DEL-NEXT:    [[TMP17:%.*]] = icmp ne i64 [[TMP12]], [[TMP15]]
; LOOP-DEL-NEXT:    [[TMP18:%.*]] = or i1 [[TMP16]], [[TMP17]]
; LOOP-DEL-NEXT:    br i1 [[TMP18]], label [[MISMATCH_LOOP_PRE]], label [[MISMATCH_VECTOR_LOOP:%.*]], !prof [[PROF1:![0-9]+]]
; LOOP-DEL:       mismatch_vec_loop:
; LOOP-DEL-NEXT:    [[MISMATCH_VECTOR_INDEX:%.*]] = phi i64 [ [[TMP24:%.*]], [[MISMATCH_VECTOR_LOOP_INC:%.*]] ], [ [[TMP1]], [[MISMATCH_MEM_CHECK]] ]
; LOOP-DEL-NEXT:    [[AVL:%.*]] = sub nuw nsw i64 [[TMP2]], [[MISMATCH_VECTOR_INDEX]]
; LOOP-DEL-NEXT:    [[TMP19:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 16, i1 true)
; LOOP-DEL-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[MISMATCH_VECTOR_INDEX]]
; LOOP-DEL-NEXT:    [[LHS_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr [[TMP20]], <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
; LOOP-DEL-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[MISMATCH_VECTOR_INDEX]]
; LOOP-DEL-NEXT:    [[RHS_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr [[TMP21]], <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
; LOOP-DEL-NEXT:    [[MISMATCH_CMP:%.*]] = call <vscale x 16 x i1> @llvm.vp.icmp.nxv16i8(<vscale x 16 x i8> [[LHS_LOAD]], <vscale x 16 x i8> [[RHS_LOAD]], metadata !"ne", <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
; LOOP-DEL-NEXT:    [[FIRST:%.*]] = call i32 @llvm.vp.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> [[MISMATCH_CMP]], i1 false, <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
; LOOP-DEL-NEXT:    [[TMP22:%.*]] = icmp ne i32 [[FIRST]], [[TMP19]]
; LOOP-DEL-NEXT:    br i1 [[TMP22]], label [[MISMATCH_VECTOR_LOOP_FOUND:%.*]], label [[MISMATCH_VECTOR_LOOP_INC]]
; LOOP-DEL:       mismatch_vec_loop_inc:
; LOOP-DEL-NEXT:    [[TMP23:%.*]] = zext i32 [[TMP19]] to i64
; LOOP-DEL-NEXT:    [[TMP24]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX]], [[TMP23]]
; LOOP-DEL-NEXT:    [[TMP25:%.*]] = icmp ne i64 [[TMP24]], [[TMP2]]
; LOOP-DEL-NEXT:    br i1 [[TMP25]], label [[MISMATCH_VECTOR_LOOP]], label [[WHILE_END:%.*]]
; LOOP-DEL:       mismatch_vec_loop_found:
; LOOP-DEL-NEXT:    [[FIRST1:%.*]] = phi i32 [ [[FIRST]], [[MISMATCH_VECTOR_LOOP]] ]
; LOOP-DEL-NEXT:    [[MISMATCH_VECTOR_INDEX2:%.*]] = phi i64 [ [[MISMATCH_VECTOR_INDEX]], [[MISMATCH_VECTOR_LOOP]] ]
; LOOP-DEL-NEXT:    [[TMP26:%.*]] = zext i32 [[FIRST1]] to i64
; LOOP-DEL-NEXT:    [[TMP27:%.*]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX2]], [[TMP26]]
; LOOP-DEL-NEXT:    [[TMP28:%.*]] = trunc i64 [[TMP27]] to i32
; LOOP-DEL-NEXT:    br label [[WHILE_END]]
; LOOP-DEL:       mismatch_loop_pre:
; LOOP-DEL-NEXT:    br label [[MISMATCH_LOOP:%.*]]
; LOOP-DEL:       mismatch_loop:
; LOOP-DEL-NEXT:    [[MISMATCH_INDEX:%.*]] = phi i32 [ [[TMP0]], [[MISMATCH_LOOP_PRE]] ], [ [[TMP35:%.*]], [[MISMATCH_LOOP_INC:%.*]] ]
; LOOP-DEL-NEXT:    [[TMP29:%.*]] = zext i32 [[MISMATCH_INDEX]] to i64
; LOOP-DEL-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP29]]
; LOOP-DEL-NEXT:    [[TMP31:%.*]] = load i8, ptr [[TMP30]], align 1
; LOOP-DEL-NEXT:    [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP29]]
; LOOP-DEL-NEXT:    [[TMP33:%.*]] = load i8, ptr [[TMP32]], align 1
; LOOP-DEL-NEXT:    [[TMP34:%.*]] = icmp eq i8 [[TMP31]], [[TMP33]]
; LOOP-DEL-NEXT:    br i1 [[TMP34]], label [[MISMATCH_LOOP_INC]], label [[WHILE_END]]
; LOOP-DEL:       mismatch_loop_inc:
; LOOP-DEL-NEXT:    [[TMP35]] = add i32 [[MISMATCH_INDEX]], 1
; LOOP-DEL-NEXT:    [[TMP36:%.*]] = icmp eq i32 [[TMP35]], [[N]]
; LOOP-DEL-NEXT:    br i1 [[TMP36]], label [[WHILE_END]], label [[MISMATCH_LOOP]]
; LOOP-DEL:       while.end:
; LOOP-DEL-NEXT:    [[MISMATCH_RESULT:%.*]] = phi i32 [ [[N]], [[MISMATCH_LOOP_INC]] ], [ [[MISMATCH_INDEX]], [[MISMATCH_LOOP]] ], [ [[N]], [[MISMATCH_VECTOR_LOOP_INC]] ], [ [[TMP28]], [[MISMATCH_VECTOR_LOOP_FOUND]] ]
; LOOP-DEL-NEXT:    ret i32 [[MISMATCH_RESULT]]
;
; MASKED-LABEL: define i32 @compare_bytes_simple(
; MASKED-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
; MASKED-NEXT:  entry:
; MASKED-NEXT:    [[TMP0:%.*]] = add i32 [[LEN]], 1
; MASKED-NEXT:    br label [[MISMATCH_MIN_IT_CHECK:%.*]]
; MASKED:       mismatch_min_it_check:
; MASKED-NEXT:    [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
; MASKED-NEXT:    [[TMP2:%.*]] = zext i32 [[N]] to i64
; MASKED-NEXT:    [[TMP3:%.*]] = icmp ule i32 [[TMP0]], [[N]]
; MASKED-NEXT:    br i1 [[TMP3]], label [[MISMATCH_MEM_CHECK:%.*]], label [[MISMATCH_LOOP_PRE:%.*]], !prof [[PROF0:![0-9]+]]
; MASKED:       mismatch_mem_check:
; MASKED-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP1]]
; MASKED-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP1]]
; MASKED-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[TMP5]] to i64
; MASKED-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[TMP4]] to i64
; MASKED-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]]
; MASKED-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]]
; MASKED-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[TMP8]] to i64
; MASKED-NEXT:    [[TMP11:%.*]] = ptrtoint ptr [[TMP9]] to i64
; MASKED-NEXT:    [[TMP12:%.*]] = lshr i64 [[TMP7]], 12
; MASKED-NEXT:    [[TMP13:%.*]] = lshr i64 [[TMP10]], 12
; MASKED-NEXT:    [[TMP14:%.*]] = lshr i64 [[TMP6]], 12
; MASKED-NEXT:    [[TMP15:%.*]] = lshr i64 [[TMP11]], 12
; MASKED-NEXT:    [[TMP16:%.*]] = icmp ne i64 [[TMP12]], [[TMP13]]
; MASKED-NEXT:    [[TMP17:%.*]] = icmp ne i64 [[TMP14]], [[TMP15]]
; MASKED-NEXT:    [[TMP18:%.*]] = or i1 [[TMP16]], [[TMP17]]
; MASKED-NEXT:    br i1 [[TMP18]], label [[MISMATCH_LOOP_PRE]], label [[MISMATCH_VEC_LOOP_PREHEADER:%.*]], !prof [[PROF1:![0-9]+]]
; MASKED:       mismatch_vec_loop_preheader:
; MASKED-NEXT:    [[TMP19:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP1]], i64 [[TMP2]])
; MASKED-NEXT:    [[TMP20:%.*]] = call i64 @llvm.vscale.i64()
; MASKED-NEXT:    [[TMP21:%.*]] = mul nuw nsw i64 [[TMP20]], 16
; MASKED-NEXT:    br label [[MISMATCH_VEC_LOOP:%.*]]
; MASKED:       mismatch_vec_loop:
; MASKED-NEXT:    [[MISMATCH_VEC_LOOP_PRED:%.*]] = phi <vscale x 16 x i1> [ [[TMP19]], [[MISMATCH_VEC_LOOP_PREHEADER]] ], [ [[TMP30:%.*]], [[MISMATCH_VEC_LOOP_INC:%.*]] ]
; MASKED-NEXT:    [[MISMATCH_VEC_INDEX:%.*]] = phi i64 [ [[TMP1]], [[MISMATCH_VEC_LOOP_PREHEADER]] ], [ [[TMP29:%.*]], [[MISMATCH_VEC_LOOP_INC]] ]
; MASKED-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[MISMATCH_VEC_INDEX]]
; MASKED-NEXT:    [[TMP23:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP22]], i32 1, <vscale x 16 x i1> [[MISMATCH_VEC_LOOP_PRED]], <vscale x 16 x i8> zeroinitializer)
; MASKED-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[MISMATCH_VEC_INDEX]]
; MASKED-NEXT:    [[TMP25:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP24]], i32 1, <vscale x 16 x i1> [[MISMATCH_VEC_LOOP_PRED]], <vscale x 16 x i8> zeroinitializer)
; MASKED-NEXT:    [[TMP26:%.*]] = icmp ne <vscale x 16 x i8> [[TMP23]], [[TMP25]]
; MASKED-NEXT:    [[TMP27:%.*]] = select <vscale x 16 x i1> [[MISMATCH_VEC_LOOP_PRED]], <vscale x 16 x i1> [[TMP26]], <vscale x 16 x i1> zeroinitializer
; MASKED-NEXT:    [[TMP28:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1(<vscale x 16 x i1> [[TMP27]])
; MASKED-NEXT:    br i1 [[TMP28]], label [[MISMATCH_VEC_LOOP_FOUND:%.*]], label [[MISMATCH_VEC_LOOP_INC]]
; MASKED:       mismatch_vec_loop_inc:
; MASKED-NEXT:    [[TMP29]] = add nuw nsw i64 [[MISMATCH_VEC_INDEX]], [[TMP21]]
; MASKED-NEXT:    [[TMP30]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP29]], i64 [[TMP2]])
; MASKED-NEXT:    [[TMP31:%.*]] = extractelement <vscale x 16 x i1> [[TMP30]], i64 0
; MASKED-NEXT:    br i1 [[TMP31]], label [[MISMATCH_VEC_LOOP]], label [[MISMATCH_END:%.*]]
; MASKED:       mismatch_vec_loop_found:
; MASKED-NEXT:    [[MISMATCH_VEC_FOUND_PRED:%.*]] = phi <vscale x 16 x i1> [ [[TMP27]], [[MISMATCH_VEC_LOOP]] ]
; MASKED-NEXT:    [[MISMATCH_VEC_LAST_LOOP_PRED:%.*]] = phi <vscale x 16 x i1> [ [[MISMATCH_VEC_LOOP_PRED]], [[MISMATCH_VEC_LOOP]] ]
; MASKED-NEXT:    [[MISMATCH_VEC_FOUND_INDEX:%.*]] = phi i64 [ [[MISMATCH_VEC_INDEX]], [[MISMATCH_VEC_LOOP]] ]
; MASKED-NEXT:    [[TMP32:%.*]] = and <vscale x 16 x i1> [[MISMATCH_VEC_LAST_LOOP_PRED]], [[MISMATCH_VEC_FOUND_PRED]]
; MASKED-NEXT:    [[TMP33:%.*]] = call i32 @llvm.experimental.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> [[TMP32]], i1 true)
; MASKED-NEXT:    [[TMP34:%.*]] = zext i32 [[TMP33]] to i64
; MASKED-NEXT:    [[TMP35:%.*]] = add nuw nsw i64 [[MISMATCH_VEC_FOUND_INDEX]], [[TMP34]]
; MASKED-NEXT:    [[TMP36:%.*]] = trunc i64 [[TMP35]] to i32
; MASKED-NEXT:    br label [[MISMATCH_END]]
; MASKED:       mismatch_loop_pre:
; MASKED-NEXT:    br label [[MISMATCH_LOOP:%.*]]
; MASKED:       mismatch_loop:
; MASKED-NEXT:    [[MISMATCH_INDEX:%.*]] = phi i32 [ [[TMP0]], [[MISMATCH_LOOP_PRE]] ], [ [[TMP43:%.*]], [[MISMATCH_LOOP_INC:%.*]] ]
; MASKED-NEXT:    [[TMP37:%.*]] = zext i32 [[MISMATCH_INDEX]] to i64
; MASKED-NEXT:    [[TMP38:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP37]]
; MASKED-NEXT:    [[TMP39:%.*]] = load i8, ptr [[TMP38]], align 1
; MASKED-NEXT:    [[TMP40:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP37]]
; MASKED-NEXT:    [[TMP41:%.*]] = load i8, ptr [[TMP40]], align 1
; MASKED-NEXT:    [[TMP42:%.*]] = icmp eq i8 [[TMP39]], [[TMP41]]
; MASKED-NEXT:    br i1 [[TMP42]], label [[MISMATCH_LOOP_INC]], label [[MISMATCH_END]]
; MASKED:       mismatch_loop_inc:
; MASKED-NEXT:    [[TMP43]] = add i32 [[MISMATCH_INDEX]], 1
; MASKED-NEXT:    [[TMP44:%.*]] = icmp eq i32 [[TMP43]], [[N]]
; MASKED-NEXT:    br i1 [[TMP44]], label [[MISMATCH_END]], label [[MISMATCH_LOOP]]
; MASKED:       mismatch_end:
; MASKED-NEXT:    [[MISMATCH_RESULT:%.*]] = phi i32 [ [[N]], [[MISMATCH_LOOP_INC]] ], [ [[MISMATCH_INDEX]], [[MISMATCH_LOOP]] ], [ [[N]], [[MISMATCH_VEC_LOOP_INC]] ], [ [[TMP36]], [[MISMATCH_VEC_LOOP_FOUND]] ]
; MASKED-NEXT:    br i1 true, label [[BYTE_COMPARE:%.*]], label [[WHILE_COND:%.*]]
; MASKED:       while.cond:
; MASKED-NEXT:    [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[MISMATCH_END]] ], [ [[MISMATCH_RESULT]], [[WHILE_BODY:%.*]] ]
; MASKED-NEXT:    [[INC:%.*]] = add i32 [[LEN_ADDR]], 1
; MASKED-NEXT:    [[CMP_NOT:%.*]] = icmp eq i32 [[MISMATCH_RESULT]], [[N]]
; MASKED-NEXT:    br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]]
; MASKED:       while.body:
; MASKED-NEXT:    [[IDXPROM:%.*]] = zext i32 [[MISMATCH_RESULT]] to i64
; MASKED-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]]
; MASKED-NEXT:    [[TMP45:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
; MASKED-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]]
; MASKED-NEXT:    [[TMP46:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
; MASKED-NEXT:    [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP45]], [[TMP46]]
; MASKED-NEXT:    br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]]
; MASKED:       byte.compare:
; MASKED-NEXT:    br label [[WHILE_END]]
; MASKED:       while.end:
; MASKED-NEXT:    [[INC_LCSSA:%.*]] = phi i32 [ [[MISMATCH_RESULT]], [[WHILE_BODY]] ], [ [[MISMATCH_RESULT]], [[WHILE_COND]] ], [ [[MISMATCH_RESULT]], [[BYTE_COMPARE]] ]
; MASKED-NEXT:    ret i32 [[INC_LCSSA]]
;
entry:
  br label %while.cond

while.cond:
  %len.addr = phi i32 [ %len, %entry ], [ %inc, %while.body ]
  %inc = add i32 %len.addr, 1
  %cmp.not = icmp eq i32 %inc, %n
  br i1 %cmp.not, label %while.end, label %while.body

while.body:
  %idxprom = zext i32 %inc to i64
  %arrayidx = getelementptr inbounds i8, ptr %a, i64 %idxprom
  %0 = load i8, ptr %arrayidx
  %arrayidx2 = getelementptr inbounds i8, ptr %b, i64 %idxprom
  %1 = load i8, ptr %arrayidx2
  %cmp.not2 = icmp eq i8 %0, %1
  br i1 %cmp.not2, label %while.cond, label %while.end

while.end:
  %inc.lcssa = phi i32 [ %inc, %while.body ], [ %inc, %while.cond ]
  ret i32 %inc.lcssa
}

define i32 @compare_bytes_signed_wrap(ptr %a, ptr %b, i32 %len, i32 %n) {
; CHECK-LABEL: define i32 @compare_bytes_signed_wrap(
; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[LEN]], 1
; CHECK-NEXT:    br label [[MISMATCH_MIN_IT_CHECK:%.*]]
; CHECK:       mismatch_min_it_check:
; CHECK-NEXT:    [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
; CHECK-NEXT:    [[TMP2:%.*]] = zext i32 [[N]] to i64
; CHECK-NEXT:    [[TMP3:%.*]] = icmp ule i32 [[TMP0]], [[N]]
; CHECK-NEXT:    br i1 [[TMP3]], label [[MISMATCH_MEM_CHECK:%.*]], label [[MISMATCH_LOOP_PRE:%.*]], !prof [[PROF0]]
; CHECK:       mismatch_mem_check:
; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP1]]
; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP1]]
; CHECK-NEXT:    [[TMP8:%.*]] = ptrtoint ptr [[TMP5]] to i64
; CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[TMP4]] to i64
; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]]
; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]]
; CHECK-NEXT:    [[TMP11:%.*]] = ptrtoint ptr [[TMP10]] to i64
; CHECK-NEXT:    [[TMP14:%.*]] = ptrtoint ptr [[TMP13]] to i64
; CHECK-NEXT:    [[TMP6:%.*]] = lshr i64 [[TMP7]], 12
; CHECK-NEXT:    [[TMP9:%.*]] = lshr i64 [[TMP11]], 12
; CHECK-NEXT:    [[TMP12:%.*]] = lshr i64 [[TMP8]], 12
; CHECK-NEXT:    [[TMP15:%.*]] = lshr i64 [[TMP14]], 12
; CHECK-NEXT:    [[TMP16:%.*]] = icmp ne i64 [[TMP6]], [[TMP9]]
; CHECK-NEXT:    [[TMP17:%.*]] = icmp ne i64 [[TMP12]], [[TMP15]]
; CHECK-NEXT:    [[TMP18:%.*]] = or i1 [[TMP16]], [[TMP17]]
; CHECK-NEXT:    br i1 [[TMP18]], label [[MISMATCH_LOOP_PRE]], label [[MISMATCH_VECTOR_LOOP_PREHEADER:%.*]], !prof [[PROF1]]
; CHECK:       mismatch_vec_loop_preheader:
; CHECK-NEXT:    br label [[MISMATCH_VECTOR_LOOP:%.*]]
; CHECK:       mismatch_vec_loop:
; CHECK-NEXT:    [[MISMATCH_VECTOR_INDEX:%.*]] = phi i64 [ [[TMP1]], [[MISMATCH_VECTOR_LOOP_PREHEADER]] ], [ [[TMP24:%.*]], [[MISMATCH_VECTOR_LOOP_INC:%.*]] ]
; CHECK-NEXT:    [[AVL:%.*]] = sub nuw nsw i64 [[TMP2]], [[MISMATCH_VECTOR_INDEX]]
; CHECK-NEXT:    [[TMP19:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 16, i1 true)
; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[MISMATCH_VECTOR_INDEX]]
; CHECK-NEXT:    [[LHS_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr [[TMP20]], <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[MISMATCH_VECTOR_INDEX]]
; CHECK-NEXT:    [[RHS_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr [[TMP21]], <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
; CHECK-NEXT:    [[MISMATCH_CMP:%.*]] = call <vscale x 16 x i1> @llvm.vp.icmp.nxv16i8(<vscale x 16 x i8> [[LHS_LOAD]], <vscale x 16 x i8> [[RHS_LOAD]], metadata !"ne", <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
; CHECK-NEXT:    [[FIRST:%.*]] = call i32 @llvm.vp.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> [[MISMATCH_CMP]], i1 false, <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
; CHECK-NEXT:    [[TMP22:%.*]] = icmp ne i32 [[FIRST]], [[TMP19]]
; CHECK-NEXT:    br i1 [[TMP22]], label [[MISMATCH_VECTOR_LOOP_FOUND:%.*]], label [[MISMATCH_VECTOR_LOOP_INC]]
; CHECK:       mismatch_vec_loop_inc:
; CHECK-NEXT:    [[TMP23:%.*]] = zext i32 [[TMP19]] to i64
; CHECK-NEXT:    [[TMP24]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX]], [[TMP23]]
; CHECK-NEXT:    [[TMP25:%.*]] = icmp ne i64 [[TMP24]], [[TMP2]]
; CHECK-NEXT:    br i1 [[TMP25]], label [[MISMATCH_VECTOR_LOOP]], label [[MISMATCH_END:%.*]]
; CHECK:       mismatch_vec_loop_found:
; CHECK-NEXT:    [[FIRST1:%.*]] = phi i32 [ [[FIRST]], [[MISMATCH_VECTOR_LOOP]] ]
; CHECK-NEXT:    [[MISMATCH_VECTOR_INDEX2:%.*]] = phi i64 [ [[MISMATCH_VECTOR_INDEX]], [[MISMATCH_VECTOR_LOOP]] ]
; CHECK-NEXT:    [[TMP26:%.*]] = zext i32 [[FIRST1]] to i64
; CHECK-NEXT:    [[TMP27:%.*]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX2]], [[TMP26]]
; CHECK-NEXT:    [[TMP28:%.*]] = trunc i64 [[TMP27]] to i32
; CHECK-NEXT:    br label [[MISMATCH_END]]
; CHECK:       mismatch_loop_pre:
; CHECK-NEXT:    br label [[MISMATCH_LOOP:%.*]]
; CHECK:       mismatch_loop:
; CHECK-NEXT:    [[MISMATCH_INDEX:%.*]] = phi i32 [ [[TMP0]], [[MISMATCH_LOOP_PRE]] ], [ [[TMP35:%.*]], [[MISMATCH_LOOP_INC:%.*]] ]
; CHECK-NEXT:    [[TMP29:%.*]] = zext i32 [[MISMATCH_INDEX]] to i64
; CHECK-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP29]]
; CHECK-NEXT:    [[TMP31:%.*]] = load i8, ptr [[TMP30]], align 1
; CHECK-NEXT:    [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP29]]
; CHECK-NEXT:    [[TMP33:%.*]] = load i8, ptr [[TMP32]], align 1
; CHECK-NEXT:    [[TMP34:%.*]] = icmp eq i8 [[TMP31]], [[TMP33]]
; CHECK-NEXT:    br i1 [[TMP34]], label [[MISMATCH_LOOP_INC]], label [[MISMATCH_END]]
; CHECK:       mismatch_loop_inc:
; CHECK-NEXT:    [[TMP35]] = add nsw i32 [[MISMATCH_INDEX]], 1
; CHECK-NEXT:    [[TMP36:%.*]] = icmp eq i32 [[TMP35]], [[N]]
; CHECK-NEXT:    br i1 [[TMP36]], label [[MISMATCH_END]], label [[MISMATCH_LOOP]]
; CHECK:       mismatch_end:
; CHECK-NEXT:    [[MISMATCH_RESULT:%.*]] = phi i32 [ [[N]], [[MISMATCH_LOOP_INC]] ], [ [[MISMATCH_INDEX]], [[MISMATCH_LOOP]] ], [ [[N]], [[MISMATCH_VECTOR_LOOP_INC]] ], [ [[TMP28]], [[MISMATCH_VECTOR_LOOP_FOUND]] ]
; CHECK-NEXT:    br i1 true, label [[BYTE_COMPARE:%.*]], label [[WHILE_COND:%.*]]
; CHECK:       while.cond:
; CHECK-NEXT:    [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[MISMATCH_END]] ], [ [[MISMATCH_RESULT]], [[WHILE_BODY:%.*]] ]
; CHECK-NEXT:    [[INC:%.*]] = add nsw i32 [[LEN_ADDR]], 1
; CHECK-NEXT:    [[CMP_NOT:%.*]] = icmp eq i32 [[MISMATCH_RESULT]], [[N]]
; CHECK-NEXT:    br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]]
; CHECK:       while.body:
; CHECK-NEXT:    [[IDXPROM:%.*]] = zext i32 [[MISMATCH_RESULT]] to i64
; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]]
; CHECK-NEXT:    [[TMP37:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]]
; CHECK-NEXT:    [[TMP38:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
; CHECK-NEXT:    [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP37]], [[TMP38]]
; CHECK-NEXT:    br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]]
; CHECK:       byte.compare:
; CHECK-NEXT:    br label [[WHILE_END]]
; CHECK:       while.end:
; CHECK-NEXT:    [[INC_LCSSA:%.*]] = phi i32 [ [[MISMATCH_RESULT]], [[WHILE_BODY]] ], [ [[MISMATCH_RESULT]], [[WHILE_COND]] ], [ [[MISMATCH_RESULT]], [[BYTE_COMPARE]] ]
; CHECK-NEXT:    ret i32 [[INC_LCSSA]]
;
; LMUL8-LABEL: define i32 @compare_bytes_signed_wrap(
; LMUL8-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
; LMUL8-NEXT:  entry:
; LMUL8-NEXT:    [[TMP0:%.*]] = add i32 [[LEN]], 1
; LMUL8-NEXT:    br label [[MISMATCH_MIN_IT_CHECK:%.*]]
; LMUL8:       mismatch_min_it_check:
; LMUL8-NEXT:    [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
; LMUL8-NEXT:    [[TMP2:%.*]] = zext i32 [[N]] to i64
; LMUL8-NEXT:    [[TMP3:%.*]] = icmp ule i32 [[TMP0]], [[N]]
; LMUL8-NEXT:    br i1 [[TMP3]], label [[MISMATCH_MEM_CHECK:%.*]], label [[MISMATCH_LOOP_PRE:%.*]], !prof [[PROF0]]
; LMUL8:       mismatch_mem_check:
; LMUL8-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP1]]
; LMUL8-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP1]]
; LMUL8-NEXT:    [[TMP8:%.*]] = ptrtoint ptr [[TMP5]] to i64
; LMUL8-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[TMP4]] to i64
; LMUL8-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]]
; LMUL8-NEXT:    [[TMP13:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]]
; LMUL8-NEXT:    [[TMP11:%.*]] = ptrtoint ptr [[TMP10]] to i64
; LMUL8-NEXT:    [[TMP14:%.*]] = ptrtoint ptr [[TMP13]] to i64
; LMUL8-NEXT:    [[TMP6:%.*]] = lshr i64 [[TMP7]], 12
; LMUL8-NEXT:    [[TMP9:%.*]] = lshr i64 [[TMP11]], 12
; LMUL8-NEXT:    [[TMP12:%.*]] = lshr i64 [[TMP8]], 12
; LMUL8-NEXT:    [[TMP15:%.*]] = lshr i64 [[TMP14]], 12
; LMUL8-NEXT:    [[TMP16:%.*]] = icmp ne i64 [[TMP6]], [[TMP9]]
; LMUL8-NEXT:    [[TMP17:%.*]] = icmp ne i64 [[TMP12]], [[TMP15]]
; LMUL8-NEXT:    [[TMP18:%.*]] = or i1 [[TMP16]], [[TMP17]]
; LMUL8-NEXT:    br i1 [[TMP18]], label [[MISMATCH_LOOP_PRE]], label [[MISMATCH_VECTOR_LOOP_PREHEADER:%.*]], !prof [[PROF1]]
; LMUL8:       mismatch_vec_loop_preheader:
; LMUL8-NEXT:    br label [[MISMATCH_VECTOR_LOOP:%.*]]
; LMUL8:       mismatch_vec_loop:
; LMUL8-NEXT:    [[MISMATCH_VECTOR_INDEX:%.*]] = phi i64 [ [[TMP1]], [[MISMATCH_VECTOR_LOOP_PREHEADER]] ], [ [[TMP24:%.*]], [[MISMATCH_VECTOR_LOOP_INC:%.*]] ]
; LMUL8-NEXT:    [[AVL:%.*]] = sub nuw nsw i64 [[TMP2]], [[MISMATCH_VECTOR_INDEX]]
; LMUL8-NEXT:    [[TMP19:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 64, i1 true)
; LMUL8-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[MISMATCH_VECTOR_INDEX]]
; LMUL8-NEXT:    [[LHS_LOAD:%.*]] = call <vscale x 64 x i8> @llvm.vp.load.nxv64i8.p0(ptr [[TMP20]], <vscale x 64 x i1> shufflevector (<vscale x 64 x i1> insertelement (<vscale x 64 x i1> poison, i1 true, i64 0), <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer), i32 [[TMP19]])
; LMUL8-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[MISMATCH_VECTOR_INDEX]]
; LMUL8-NEXT:    [[RHS_LOAD:%.*]] = call <vscale x 64 x i8> @llvm.vp.load.nxv64i8.p0(ptr [[TMP21]], <vscale x 64 x i1> shufflevector (<vscale x 64 x i1> insertelement (<vscale x 64 x i1> poison, i1 true, i64 0), <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer), i32 [[TMP19]])
; LMUL8-NEXT:    [[MISMATCH_CMP:%.*]] = call <vscale x 64 x i1> @llvm.vp.icmp.nxv64i8(<vscale x 64 x i8> [[LHS_LOAD]], <vscale x 64 x i8> [[RHS_LOAD]], metadata !"ne", <vscale x 64 x i1> shufflevector (<vscale x 64 x i1> insertelement (<vscale x 64 x i1> poison, i1 true, i64 0), <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer), i32 [[TMP19]])
; LMUL8-NEXT:    [[FIRST:%.*]] = call i32 @llvm.vp.cttz.elts.i32.nxv64i1(<vscale x 64 x i1> [[MISMATCH_CMP]], i1 false, <vscale x 64 x i1> shufflevector (<vscale x 64 x i1> insertelement (<vscale x 64 x i1> poison, i1 true, i64 0), <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer), i32 [[TMP19]])
; LMUL8-NEXT:    [[TMP22:%.*]] = icmp ne i32 [[FIRST]], [[TMP19]]
; LMUL8-NEXT:    br i1 [[TMP22]], label [[MISMATCH_VECTOR_LOOP_FOUND:%.*]], label [[MISMATCH_VECTOR_LOOP_INC]]
; LMUL8:       mismatch_vec_loop_inc:
; LMUL8-NEXT:    [[TMP23:%.*]] = zext i32 [[TMP19]] to i64
; LMUL8-NEXT:    [[TMP24]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX]], [[TMP23]]
; LMUL8-NEXT:    [[TMP25:%.*]] = icmp ne i64 [[TMP24]], [[TMP2]]
; LMUL8-NEXT:    br i1 [[TMP25]], label [[MISMATCH_VECTOR_LOOP]], label [[MISMATCH_END:%.*]]
; LMUL8:       mismatch_vec_loop_found:
; LMUL8-NEXT:    [[FIRST1:%.*]] = phi i32 [ [[FIRST]], [[MISMATCH_VECTOR_LOOP]] ]
; LMUL8-NEXT:    [[MISMATCH_VECTOR_INDEX2:%.*]] = phi i64 [ [[MISMATCH_VECTOR_INDEX]], [[MISMATCH_VECTOR_LOOP]] ]
; LMUL8-NEXT:    [[TMP26:%.*]] = zext i32 [[FIRST1]] to i64
; LMUL8-NEXT:    [[TMP27:%.*]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX2]], [[TMP26]]
; LMUL8-NEXT:    [[TMP28:%.*]] = trunc i64 [[TMP27]] to i32
; LMUL8-NEXT:    br label [[MISMATCH_END]]
; LMUL8:       mismatch_loop_pre:
; LMUL8-NEXT:    br label [[MISMATCH_LOOP:%.*]]
; LMUL8:       mismatch_loop:
; LMUL8-NEXT:    [[MISMATCH_INDEX:%.*]] = phi i32 [ [[TMP0]], [[MISMATCH_LOOP_PRE]] ], [ [[TMP35:%.*]], [[MISMATCH_LOOP_INC:%.*]] ]
; LMUL8-NEXT:    [[TMP29:%.*]] = zext i32 [[MISMATCH_INDEX]] to i64
; LMUL8-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP29]]
; LMUL8-NEXT:    [[TMP31:%.*]] = load i8, ptr [[TMP30]], align 1
; LMUL8-NEXT:    [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP29]]
; LMUL8-NEXT:    [[TMP33:%.*]] = load i8, ptr [[TMP32]], align 1
; LMUL8-NEXT:    [[TMP34:%.*]] = icmp eq i8 [[TMP31]], [[TMP33]]
; LMUL8-NEXT:    br i1 [[TMP34]], label [[MISMATCH_LOOP_INC]], label [[MISMATCH_END]]
; LMUL8:       mismatch_loop_inc:
; LMUL8-NEXT:    [[TMP35]] = add nsw i32 [[MISMATCH_INDEX]], 1
; LMUL8-NEXT:    [[TMP36:%.*]] = icmp eq i32 [[TMP35]], [[N]]
; LMUL8-NEXT:    br i1 [[TMP36]], label [[MISMATCH_END]], label [[MISMATCH_LOOP]]
; LMUL8:       mismatch_end:
; LMUL8-NEXT:    [[MISMATCH_RESULT:%.*]] = phi i32 [ [[N]], [[MISMATCH_LOOP_INC]] ], [ [[MISMATCH_INDEX]], [[MISMATCH_LOOP]] ], [ [[N]], [[MISMATCH_VECTOR_LOOP_INC]] ], [ [[TMP28]], [[MISMATCH_VECTOR_LOOP_FOUND]] ]
; LMUL8-NEXT:    br i1 true, label [[BYTE_COMPARE:%.*]], label [[WHILE_COND:%.*]]
; LMUL8:       while.cond:
; LMUL8-NEXT:    [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[MISMATCH_END]] ], [ [[MISMATCH_RESULT]], [[WHILE_BODY:%.*]] ]
; LMUL8-NEXT:    [[INC:%.*]] = add nsw i32 [[LEN_ADDR]], 1
; LMUL8-NEXT:    [[CMP_NOT:%.*]] = icmp eq i32 [[MISMATCH_RESULT]], [[N]]
; LMUL8-NEXT:    br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]]
; LMUL8:       while.body:
; LMUL8-NEXT:    [[IDXPROM:%.*]] = zext i32 [[MISMATCH_RESULT]] to i64
; LMUL8-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]]
; LMUL8-NEXT:    [[TMP37:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
; LMUL8-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]]
; LMUL8-NEXT:    [[TMP38:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
; LMUL8-NEXT:    [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP37]], [[TMP38]]
; LMUL8-NEXT:    br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]]
; LMUL8:       byte.compare:
; LMUL8-NEXT:    br label [[WHILE_END]]
; LMUL8:       while.end:
; LMUL8-NEXT:    [[INC_LCSSA:%.*]] = phi i32 [ [[MISMATCH_RESULT]], [[WHILE_BODY]] ], [ [[MISMATCH_RESULT]], [[WHILE_COND]] ], [ [[MISMATCH_RESULT]], [[BYTE_COMPARE]] ]
; LMUL8-NEXT:    ret i32 [[INC_LCSSA]]
;
; LOOP-DEL-LABEL: define i32 @compare_bytes_signed_wrap(
; LOOP-DEL-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
; LOOP-DEL-NEXT:  entry:
; LOOP-DEL-NEXT:    [[TMP0:%.*]] = add i32 [[LEN]], 1
; LOOP-DEL-NEXT:    [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
; LOOP-DEL-NEXT:    [[TMP2:%.*]] = zext i32 [[N]] to i64
; LOOP-DEL-NEXT:    [[TMP3:%.*]] = icmp ule i32 [[TMP0]], [[N]]
; LOOP-DEL-NEXT:    br i1 [[TMP3]], label [[MISMATCH_MEM_CHECK:%.*]], label [[MISMATCH_LOOP_PRE:%.*]], !prof [[PROF0]]
; LOOP-DEL:       mismatch_mem_check:
; LOOP-DEL-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP1]]
; LOOP-DEL-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP1]]
; LOOP-DEL-NEXT:    [[TMP8:%.*]] = ptrtoint ptr [[TMP5]] to i64
; LOOP-DEL-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[TMP4]] to i64
; LOOP-DEL-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]]
; LOOP-DEL-NEXT:    [[TMP13:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]]
; LOOP-DEL-NEXT:    [[TMP11:%.*]] = ptrtoint ptr [[TMP10]] to i64
; LOOP-DEL-NEXT:    [[TMP14:%.*]] = ptrtoint ptr [[TMP13]] to i64
; LOOP-DEL-NEXT:    [[TMP6:%.*]] = lshr i64 [[TMP7]], 12
; LOOP-DEL-NEXT:    [[TMP9:%.*]] = lshr i64 [[TMP11]], 12
; LOOP-DEL-NEXT:    [[TMP12:%.*]] = lshr i64 [[TMP8]], 12
; LOOP-DEL-NEXT:    [[TMP15:%.*]] = lshr i64 [[TMP14]], 12
; LOOP-DEL-NEXT:    [[TMP16:%.*]] = icmp ne i64 [[TMP6]], [[TMP9]]
; LOOP-DEL-NEXT:    [[TMP17:%.*]] = icmp ne i64 [[TMP12]], [[TMP15]]
; LOOP-DEL-NEXT:    [[TMP18:%.*]] = or i1 [[TMP16]], [[TMP17]]
; LOOP-DEL-NEXT:    br i1 [[TMP18]], label [[MISMATCH_LOOP_PRE]], label [[MISMATCH_VECTOR_LOOP:%.*]], !prof [[PROF1]]
; LOOP-DEL:       mismatch_vec_loop:
; LOOP-DEL-NEXT:    [[MISMATCH_VECTOR_INDEX:%.*]] = phi i64 [ [[TMP24:%.*]], [[MISMATCH_VECTOR_LOOP_INC:%.*]] ], [ [[TMP1]], [[MISMATCH_MEM_CHECK]] ]
; LOOP-DEL-NEXT:    [[AVL:%.*]] = sub nuw nsw i64 [[TMP2]], [[MISMATCH_VECTOR_INDEX]]
; LOOP-DEL-NEXT:    [[TMP19:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 16, i1 true)
; LOOP-DEL-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[MISMATCH_VECTOR_INDEX]]
; LOOP-DEL-NEXT:    [[LHS_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr [[TMP20]], <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
; LOOP-DEL-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[MISMATCH_VECTOR_INDEX]]
; LOOP-DEL-NEXT:    [[RHS_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr [[TMP21]], <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
; LOOP-DEL-NEXT:    [[MISMATCH_CMP:%.*]] = call <vscale x 16 x i1> @llvm.vp.icmp.nxv16i8(<vscale x 16 x i8> [[LHS_LOAD]], <vscale x 16 x i8> [[RHS_LOAD]], metadata !"ne", <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
; LOOP-DEL-NEXT:    [[FIRST:%.*]] = call i32 @llvm.vp.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> [[MISMATCH_CMP]], i1 false, <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
; LOOP-DEL-NEXT:    [[TMP22:%.*]] = icmp ne i32 [[FIRST]], [[TMP19]]
; LOOP-DEL-NEXT:    br i1 [[TMP22]], label [[MISMATCH_VECTOR_LOOP_FOUND:%.*]], label [[MISMATCH_VECTOR_LOOP_INC]]
; LOOP-DEL:       mismatch_vec_loop_inc:
; LOOP-DEL-NEXT:    [[TMP23:%.*]] = zext i32 [[TMP19]] to i64
; LOOP-DEL-NEXT:    [[TMP24]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX]], [[TMP23]]
; LOOP-DEL-NEXT:    [[TMP25:%.*]] = icmp ne i64 [[TMP24]], [[TMP2]]
; LOOP-DEL-NEXT:    br i1 [[TMP25]], label [[MISMATCH_VECTOR_LOOP]], label [[WHILE_END:%.*]]
; LOOP-DEL:       mismatch_vec_loop_found:
; LOOP-DEL-NEXT:    [[FIRST1:%.*]] = phi i32 [ [[FIRST]], [[MISMATCH_VECTOR_LOOP]] ]
; LOOP-DEL-NEXT:    [[MISMATCH_VECTOR_INDEX2:%.*]] = phi i64 [ [[MISMATCH_VECTOR_INDEX]], [[MISMATCH_VECTOR_LOOP]] ]
; LOOP-DEL-NEXT:    [[TMP26:%.*]] = zext i32 [[FIRST1]] to i64
; LOOP-DEL-NEXT:    [[TMP27:%.*]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX2]], [[TMP26]]
; LOOP-DEL-NEXT:    [[TMP28:%.*]] = trunc i64 [[TMP27]] to i32
; LOOP-DEL-NEXT:    br label [[WHILE_END]]
; LOOP-DEL:       mismatch_loop_pre:
; LOOP-DEL-NEXT:    br label [[MISMATCH_LOOP:%.*]]
; LOOP-DEL:       mismatch_loop:
; LOOP-DEL-NEXT:    [[MISMATCH_INDEX:%.*]] = phi i32 [ [[TMP0]], [[MISMATCH_LOOP_PRE]] ], [ [[TMP35:%.*]], [[MISMATCH_LOOP_INC:%.*]] ]
; LOOP-DEL-NEXT:    [[TMP29:%.*]] = zext i32 [[MISMATCH_INDEX]] to i64
; LOOP-DEL-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP29]]
; LOOP-DEL-NEXT:    [[TMP31:%.*]] = load i8, ptr [[TMP30]], align 1
; LOOP-DEL-NEXT:    [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP29]]
; LOOP-DEL-NEXT:    [[TMP33:%.*]] = load i8, ptr [[TMP32]], align 1
; LOOP-DEL-NEXT:    [[TMP34:%.*]] = icmp eq i8 [[TMP31]], [[TMP33]]
; LOOP-DEL-NEXT:    br i1 [[TMP34]], label [[MISMATCH_LOOP_INC]], label [[WHILE_END]]
; LOOP-DEL:       mismatch_loop_inc:
; LOOP-DEL-NEXT:    [[TMP35]] = add nsw i32 [[MISMATCH_INDEX]], 1
; LOOP-DEL-NEXT:    [[TMP36:%.*]] = icmp eq i32 [[TMP35]], [[N]]
; LOOP-DEL-NEXT:    br i1 [[TMP36]], label [[WHILE_END]], label [[MISMATCH_LOOP]]
; LOOP-DEL:       while.end:
; LOOP-DEL-NEXT:    [[MISMATCH_RESULT:%.*]] = phi i32 [ [[N]], [[MISMATCH_LOOP_INC]] ], [ [[MISMATCH_INDEX]], [[MISMATCH_LOOP]] ], [ [[N]], [[MISMATCH_VECTOR_LOOP_INC]] ], [ [[TMP28]], [[MISMATCH_VECTOR_LOOP_FOUND]] ]
; LOOP-DEL-NEXT:    ret i32 [[MISMATCH_RESULT]]
;
; MASKED-LABEL: define i32 @compare_bytes_signed_wrap(
; MASKED-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
; MASKED-NEXT:  entry:
; MASKED-NEXT:    [[TMP0:%.*]] = add i32 [[LEN]], 1
; MASKED-NEXT:    br label [[MISMATCH_MIN_IT_CHECK:%.*]]
; MASKED:       mismatch_min_it_check:
; MASKED-NEXT:    [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
; MASKED-NEXT:    [[TMP2:%.*]] = zext i32 [[N]] to i64
; MASKED-NEXT:    [[TMP3:%.*]] = icmp ule i32 [[TMP0]], [[N]]
; MASKED-NEXT:    br i1 [[TMP3]], label [[MISMATCH_MEM_CHECK:%.*]], label [[MISMATCH_LOOP_PRE:%.*]], !prof [[PROF0]]
; MASKED:       mismatch_mem_check:
; MASKED-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP1]]
; MASKED-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP1]]
; MASKED-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[TMP5]] to i64
; MASKED-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[TMP4]] to i64
; MASKED-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]]
; MASKED-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]]
; MASKED-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[TMP8]] to i64
; MASKED-NEXT:    [[TMP11:%.*]] = ptrtoint ptr [[TMP9]] to i64
; MASKED-NEXT:    [[TMP12:%.*]] = lshr i64 [[TMP7]], 12
; MASKED-NEXT:    [[TMP13:%.*]] = lshr i64 [[TMP10]], 12
; MASKED-NEXT:    [[TMP14:%.*]] = lshr i64 [[TMP6]], 12
; MASKED-NEXT:    [[TMP15:%.*]] = lshr i64 [[TMP11]], 12
; MASKED-NEXT:    [[TMP16:%.*]] = icmp ne i64 [[TMP12]], [[TMP13]]
; MASKED-NEXT:    [[TMP17:%.*]] = icmp ne i64 [[TMP14]], [[TMP15]]
; MASKED-NEXT:    [[TMP18:%.*]] = or i1 [[TMP16]], [[TMP17]]
; MASKED-NEXT:    br i1 [[TMP18]], label [[MISMATCH_LOOP_PRE]], label [[MISMATCH_VEC_LOOP_PREHEADER:%.*]], !prof [[PROF1]]
; MASKED:       mismatch_vec_loop_preheader:
; MASKED-NEXT:    [[TMP19:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP1]], i64 [[TMP2]])
; MASKED-NEXT:    [[TMP20:%.*]] = call i64 @llvm.vscale.i64()
; MASKED-NEXT:    [[TMP21:%.*]] = mul nuw nsw i64 [[TMP20]], 16
; MASKED-NEXT:    br label [[MISMATCH_VEC_LOOP:%.*]]
; MASKED:       mismatch_vec_loop:
; MASKED-NEXT:    [[MISMATCH_VEC_LOOP_PRED:%.*]] = phi <vscale x 16 x i1> [ [[TMP19]], [[MISMATCH_VEC_LOOP_PREHEADER]] ], [ [[TMP30:%.*]], [[MISMATCH_VEC_LOOP_INC:%.*]] ]
; MASKED-NEXT:    [[MISMATCH_VEC_INDEX:%.*]] = phi i64 [ [[TMP1]], [[MISMATCH_VEC_LOOP_PREHEADER]] ], [ [[TMP29:%.*]], [[MISMATCH_VEC_LOOP_INC]] ]
; MASKED-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[MISMATCH_VEC_INDEX]]
; MASKED-NEXT:    [[TMP23:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP22]], i32 1, <vscale x 16 x i1> [[MISMATCH_VEC_LOOP_PRED]], <vscale x 16 x i8> zeroinitializer)
; MASKED-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[MISMATCH_VEC_INDEX]]
; MASKED-NEXT:    [[TMP25:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP24]], i32 1, <vscale x 16 x i1> [[MISMATCH_VEC_LOOP_PRED]], <vscale x 16 x i8> zeroinitializer)
; MASKED-NEXT:    [[TMP26:%.*]] = icmp ne <vscale x 16 x i8> [[TMP23]], [[TMP25]]
; MASKED-NEXT:    [[TMP27:%.*]] = select <vscale x 16 x i1> [[MISMATCH_VEC_LOOP_PRED]], <vscale x 16 x i1> [[TMP26]], <vscale x 16 x i1> zeroinitializer
; MASKED-NEXT:    [[TMP28:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1(<vscale x 16 x i1> [[TMP27]])
; MASKED-NEXT:    br i1 [[TMP28]], label [[MISMATCH_VEC_LOOP_FOUND:%.*]], label [[MISMATCH_VEC_LOOP_INC]]
; MASKED:       mismatch_vec_loop_inc:
; MASKED-NEXT:    [[TMP29]] = add nuw nsw i64 [[MISMATCH_VEC_INDEX]], [[TMP21]]
; MASKED-NEXT:    [[TMP30]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP29]], i64 [[TMP2]])
; MASKED-NEXT:    [[TMP31:%.*]] = extractelement <vscale x 16 x i1> [[TMP30]], i64 0
; MASKED-NEXT:    br i1 [[TMP31]], label [[MISMATCH_VEC_LOOP]], label [[MISMATCH_END:%.*]]
; MASKED:       mismatch_vec_loop_found:
; MASKED-NEXT:    [[MISMATCH_VEC_FOUND_PRED:%.*]] = phi <vscale x 16 x i1> [ [[TMP27]], [[MISMATCH_VEC_LOOP]] ]
; MASKED-NEXT:    [[MISMATCH_VEC_LAST_LOOP_PRED:%.*]] = phi <vscale x 16 x i1> [ [[MISMATCH_VEC_LOOP_PRED]], [[MISMATCH_VEC_LOOP]] ]
; MASKED-NEXT:    [[MISMATCH_VEC_FOUND_INDEX:%.*]] = phi i64 [ [[MISMATCH_VEC_INDEX]], [[MISMATCH_VEC_LOOP]] ]
; MASKED-NEXT:    [[TMP32:%.*]] = and <vscale x 16 x i1> [[MISMATCH_VEC_LAST_LOOP_PRED]], [[MISMATCH_VEC_FOUND_PRED]]
; MASKED-NEXT:    [[TMP33:%.*]] = call i32 @llvm.experimental.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> [[TMP32]], i1 true)
; MASKED-NEXT:    [[TMP34:%.*]] = zext i32 [[TMP33]] to i64
; MASKED-NEXT:    [[TMP35:%.*]] = add nuw nsw i64 [[MISMATCH_VEC_FOUND_INDEX]], [[TMP34]]
; MASKED-NEXT:    [[TMP36:%.*]] = trunc i64 [[TMP35]] to i32
; MASKED-NEXT:    br label [[MISMATCH_END]]
; MASKED:       mismatch_loop_pre:
; MASKED-NEXT:    br label [[MISMATCH_LOOP:%.*]]
; MASKED:       mismatch_loop:
; MASKED-NEXT:    [[MISMATCH_INDEX:%.*]] = phi i32 [ [[TMP0]], [[MISMATCH_LOOP_PRE]] ], [ [[TMP43:%.*]], [[MISMATCH_LOOP_INC:%.*]] ]
; MASKED-NEXT:    [[TMP37:%.*]] = zext i32 [[MISMATCH_INDEX]] to i64
; MASKED-NEXT:    [[TMP38:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP37]]
; MASKED-NEXT:    [[TMP39:%.*]] = load i8, ptr [[TMP38]], align 1
; MASKED-NEXT:    [[TMP40:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP37]]
; MASKED-NEXT:    [[TMP41:%.*]] = load i8, ptr [[TMP40]], align 1
; MASKED-NEXT:    [[TMP42:%.*]] = icmp eq i8 [[TMP39]], [[TMP41]]
; MASKED-NEXT:    br i1 [[TMP42]], label [[MISMATCH_LOOP_INC]], label [[MISMATCH_END]]
; MASKED:       mismatch_loop_inc:
; MASKED-NEXT:    [[TMP43]] = add nsw i32 [[MISMATCH_INDEX]], 1
; MASKED-NEXT:    [[TMP44:%.*]] = icmp eq i32 [[TMP43]], [[N]]
; MASKED-NEXT:    br i1 [[TMP44]], label [[MISMATCH_END]], label [[MISMATCH_LOOP]]
; MASKED:       mismatch_end:
; MASKED-NEXT:    [[MISMATCH_RESULT:%.*]] = phi i32 [ [[N]], [[MISMATCH_LOOP_INC]] ], [ [[MISMATCH_INDEX]], [[MISMATCH_LOOP]] ], [ [[N]], [[MISMATCH_VEC_LOOP_INC]] ], [ [[TMP36]], [[MISMATCH_VEC_LOOP_FOUND]] ]
; MASKED-NEXT:    br i1 true, label [[BYTE_COMPARE:%.*]], label [[WHILE_COND:%.*]]
; MASKED:       while.cond:
; MASKED-NEXT:    [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[MISMATCH_END]] ], [ [[MISMATCH_RESULT]], [[WHILE_BODY:%.*]] ]
; MASKED-NEXT:    [[INC:%.*]] = add nsw i32 [[LEN_ADDR]], 1
; MASKED-NEXT:    [[CMP_NOT:%.*]] = icmp eq i32 [[MISMATCH_RESULT]], [[N]]
; MASKED-NEXT:    br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]]
; MASKED:       while.body:
; MASKED-NEXT:    [[IDXPROM:%.*]] = zext i32 [[MISMATCH_RESULT]] to i64
; MASKED-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]]
; MASKED-NEXT:    [[TMP45:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
; MASKED-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]]
; MASKED-NEXT:    [[TMP46:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
; MASKED-NEXT:    [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP45]], [[TMP46]]
; MASKED-NEXT:    br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]]
; MASKED:       byte.compare:
; MASKED-NEXT:    br label [[WHILE_END]]
; MASKED:       while.end:
; MASKED-NEXT:    [[INC_LCSSA:%.*]] = phi i32 [ [[MISMATCH_RESULT]], [[WHILE_BODY]] ], [ [[MISMATCH_RESULT]], [[WHILE_COND]] ], [ [[MISMATCH_RESULT]], [[BYTE_COMPARE]] ]
; MASKED-NEXT:    ret i32 [[INC_LCSSA]]
;
; NO-TRANSFORM-LABEL: define i32 @compare_bytes_signed_wrap(
; NO-TRANSFORM-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) {
; NO-TRANSFORM-NEXT:  entry:
; NO-TRANSFORM-NEXT:    br label [[WHILE_COND:%.*]]
; NO-TRANSFORM:       while.cond:
; NO-TRANSFORM-NEXT:    [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[ENTRY:%.*]] ], [ [[INC:%.*]], [[WHILE_BODY:%.*]] ]
; NO-TRANSFORM-NEXT:    [[INC]] = add nsw i32 [[LEN_ADDR]], 1
; NO-TRANSFORM-NEXT:    [[CMP_NOT:%.*]] = icmp eq i32 [[INC]], [[N]]
; NO-TRANSFORM-NEXT:    br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]]
; NO-TRANSFORM:       while.body:
; NO-TRANSFORM-NEXT:    [[IDXPROM:%.*]] = zext i32 [[INC]] to i64
; NO-TRANSFORM-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]]
; NO-TRANSFORM-NEXT:    [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
; NO-TRANSFORM-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]]
; NO-TRANSFORM-NEXT:    [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
; NO-TRANSFORM-NEXT:    [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP0]], [[TMP1]]
; NO-TRANSFORM-NEXT:    br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]]
; NO-TRANSFORM:       while.end:
; NO-TRANSFORM-NEXT:    [[INC_LCSSA:%.*]] = phi i32 [ [[INC]], [[WHILE_BODY]] ], [ [[INC]], [[WHILE_COND]] ]
; NO-TRANSFORM-NEXT:    ret i32 [[INC_LCSSA]]
entry:
  br label %while.cond

while.cond:
  %len.addr = phi i32 [ %len, %entry ], [ %inc, %while.body ]
  %inc = add nsw i32 %len.addr, 1
  %cmp.not = icmp eq i32 %inc, %n
  br i1 %cmp.not, label %while.end, label %while.body

while.body:
  %idxprom = zext i32 %inc to i64
  %arrayidx = getelementptr inbounds i8, ptr %a, i64 %idxprom
  %0 = load i8, ptr %arrayidx
  %arrayidx2 = getelementptr inbounds i8, ptr %b, i64 %idxprom
  %1 = load i8, ptr %arrayidx2
  %cmp.not2 = icmp eq i8 %0, %1
  br i1 %cmp.not2, label %while.cond, label %while.end

while.end:
  %inc.lcssa = phi i32 [ %inc, %while.body ], [ %inc, %while.cond ]
  ret i32 %inc.lcssa
}


define i32 @compare_bytes_simple_end_ne_found(ptr %a, ptr %b, ptr %c, ptr %d, i32 %len, i32 %n) {
; CHECK-LABEL: define i32 @compare_bytes_simple_end_ne_found(
; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], ptr [[D:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[LEN]], 1
; CHECK-NEXT:    br label [[MISMATCH_MIN_IT_CHECK:%.*]]
; CHECK:       mismatch_min_it_check:
; CHECK-NEXT:    [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
; CHECK-NEXT:    [[TMP2:%.*]] = zext i32 [[N]] to i64
; CHECK-NEXT:    [[TMP3:%.*]] = icmp ule i32 [[TMP0]], [[N]]
; CHECK-NEXT:    br i1 [[TMP3]], label [[MISMATCH_MEM_CHECK:%.*]], label [[MISMATCH_LOOP_PRE:%.*]], !prof [[PROF0]]
; CHECK:       mismatch_mem_check:
; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP1]]
; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP1]]
; CHECK-NEXT:    [[TMP8:%.*]] = ptrtoint ptr [[TMP5]] to i64
; CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[TMP4]] to i64
; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]]
; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]]
; CHECK-NEXT:    [[TMP11:%.*]] = ptrtoint ptr [[TMP10]] to i64
; CHECK-NEXT:    [[TMP14:%.*]] = ptrtoint ptr [[TMP13]] to i64
; CHECK-NEXT:    [[TMP6:%.*]] = lshr i64 [[TMP7]], 12
; CHECK-NEXT:    [[TMP9:%.*]] = lshr i64 [[TMP11]], 12
; CHECK-NEXT:    [[TMP12:%.*]] = lshr i64 [[TMP8]], 12
; CHECK-NEXT:    [[TMP15:%.*]] = lshr i64 [[TMP14]], 12
; CHECK-NEXT:    [[TMP16:%.*]] = icmp ne i64 [[TMP6]], [[TMP9]]
; CHECK-NEXT:    [[TMP17:%.*]] = icmp ne i64 [[TMP12]], [[TMP15]]
; CHECK-NEXT:    [[TMP18:%.*]] = or i1 [[TMP16]], [[TMP17]]
; CHECK-NEXT:    br i1 [[TMP18]], label [[MISMATCH_LOOP_PRE]], label [[MISMATCH_VECTOR_LOOP_PREHEADER:%.*]], !prof [[PROF1]]
; CHECK:       mismatch_vec_loop_preheader:
; CHECK-NEXT:    br label [[MISMATCH_VECTOR_LOOP:%.*]]
; CHECK:       mismatch_vec_loop:
; CHECK-NEXT:    [[MISMATCH_VECTOR_INDEX:%.*]] = phi i64 [ [[TMP1]], [[MISMATCH_VECTOR_LOOP_PREHEADER]] ], [ [[TMP24:%.*]], [[MISMATCH_VECTOR_LOOP_INC:%.*]] ]
; CHECK-NEXT:    [[AVL:%.*]] = sub nuw nsw i64 [[TMP2]], [[MISMATCH_VECTOR_INDEX]]
; CHECK-NEXT:    [[TMP19:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 16, i1 true)
; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[MISMATCH_VECTOR_INDEX]]
; CHECK-NEXT:    [[LHS_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr [[TMP20]], <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[MISMATCH_VECTOR_INDEX]]
; CHECK-NEXT:    [[RHS_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr [[TMP21]], <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
; CHECK-NEXT:    [[MISMATCH_CMP:%.*]] = call <vscale x 16 x i1> @llvm.vp.icmp.nxv16i8(<vscale x 16 x i8> [[LHS_LOAD]], <vscale x 16 x i8> [[RHS_LOAD]], metadata !"ne", <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
; CHECK-NEXT:    [[FIRST:%.*]] = call i32 @llvm.vp.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> [[MISMATCH_CMP]], i1 false, <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
; CHECK-NEXT:    [[TMP22:%.*]] = icmp ne i32 [[FIRST]], [[TMP19]]
; CHECK-NEXT:    br i1 [[TMP22]], label [[MISMATCH_VECTOR_LOOP_FOUND:%.*]], label [[MISMATCH_VECTOR_LOOP_INC]]
; CHECK:       mismatch_vec_loop_inc:
; CHECK-NEXT:    [[TMP23:%.*]] = zext i32 [[TMP19]] to i64
; CHECK-NEXT:    [[TMP24]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX]], [[TMP23]]
; CHECK-NEXT:    [[TMP25:%.*]] = icmp ne i64 [[TMP24]], [[TMP2]]
; CHECK-NEXT:    br i1 [[TMP25]], label [[MISMATCH_VECTOR_LOOP]], label [[MISMATCH_END:%.*]]
; CHECK:       mismatch_vec_loop_found:
; CHECK-NEXT:    [[FIRST1:%.*]] = phi i32 [ [[FIRST]], [[MISMATCH_VECTOR_LOOP]] ]
; CHECK-NEXT:    [[MISMATCH_VECTOR_INDEX2:%.*]] = phi i64 [ [[MISMATCH_VECTOR_INDEX]], [[MISMATCH_VECTOR_LOOP]] ]
; CHECK-NEXT:    [[TMP26:%.*]] = zext i32 [[FIRST1]] to i64
; CHECK-NEXT:    [[TMP27:%.*]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX2]], [[TMP26]]
; CHECK-NEXT:    [[TMP28:%.*]] = trunc i64 [[TMP27]] to i32
; CHECK-NEXT:    br label [[MISMATCH_END]]
; CHECK:       mismatch_loop_pre:
; CHECK-NEXT:    br label [[MISMATCH_LOOP:%.*]]
; CHECK:       mismatch_loop:
; CHECK-NEXT:    [[MISMATCH_INDEX3:%.*]] = phi i32 [ [[TMP0]], [[MISMATCH_LOOP_PRE]] ], [ [[TMP35:%.*]], [[MISMATCH_LOOP_INC:%.*]] ]
; CHECK-NEXT:    [[TMP29:%.*]] = zext i32 [[MISMATCH_INDEX3]] to i64
; CHECK-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP29]]
; CHECK-NEXT:    [[TMP31:%.*]] = load i8, ptr [[TMP30]], align 1
; CHECK-NEXT:    [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP29]]
; CHECK-NEXT:    [[TMP33:%.*]] = load i8, ptr [[TMP32]], align 1
; CHECK-NEXT:    [[TMP34:%.*]] = icmp eq i8 [[TMP31]], [[TMP33]]
; CHECK-NEXT:    br i1 [[TMP34]], label [[MISMATCH_LOOP_INC]], label [[MISMATCH_END]]
; CHECK:       mismatch_loop_inc:
; CHECK-NEXT:    [[TMP35]] = add i32 [[MISMATCH_INDEX3]], 1
; CHECK-NEXT:    [[TMP36:%.*]] = icmp eq i32 [[TMP35]], [[N]]
; CHECK-NEXT:    br i1 [[TMP36]], label [[MISMATCH_END]], label [[MISMATCH_LOOP]]
; CHECK:       mismatch_end:
; CHECK-NEXT:    [[MISMATCH_RESULT:%.*]] = phi i32 [ [[N]], [[MISMATCH_LOOP_INC]] ], [ [[MISMATCH_INDEX3]], [[MISMATCH_LOOP]] ], [ [[N]], [[MISMATCH_VECTOR_LOOP_INC]] ], [ [[TMP28]], [[MISMATCH_VECTOR_LOOP_FOUND]] ]
; CHECK-NEXT:    br i1 true, label [[BYTE_COMPARE:%.*]], label [[WHILE_COND:%.*]]
; CHECK:       while.cond:
; CHECK-NEXT:    [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[MISMATCH_END]] ], [ [[MISMATCH_RESULT]], [[WHILE_BODY:%.*]] ]
; CHECK-NEXT:    [[INC:%.*]] = add i32 [[LEN_ADDR]], 1
; CHECK-NEXT:    [[CMP_NOT:%.*]] = icmp eq i32 [[MISMATCH_RESULT]], [[N]]
; CHECK-NEXT:    br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]]
; CHECK:       while.body:
; CHECK-NEXT:    [[IDXPROM:%.*]] = zext i32 [[MISMATCH_RESULT]] to i64
; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]]
; CHECK-NEXT:    [[TMP37:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]]
; CHECK-NEXT:    [[TMP38:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
; CHECK-NEXT:    [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP37]], [[TMP38]]
; CHECK-NEXT:    br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_FOUND:%.*]]
; CHECK:       while.found:
; CHECK-NEXT:    [[MISMATCH_INDEX1:%.*]] = phi i32 [ [[MISMATCH_RESULT]], [[WHILE_BODY]] ], [ [[MISMATCH_RESULT]], [[BYTE_COMPARE]] ]
; CHECK-NEXT:    [[FOUND_PTR:%.*]] = phi ptr [ [[C]], [[WHILE_BODY]] ], [ [[C]], [[BYTE_COMPARE]] ]
; CHECK-NEXT:    br label [[END:%.*]]
; CHECK:       byte.compare:
; CHECK-NEXT:    [[TMP39:%.*]] = icmp eq i32 [[MISMATCH_RESULT]], [[N]]
; CHECK-NEXT:    br i1 [[TMP39]], label [[WHILE_END]], label [[WHILE_FOUND]]
; CHECK:       while.end:
; CHECK-NEXT:    [[MISMATCH_INDEX2:%.*]] = phi i32 [ [[N]], [[WHILE_COND]] ], [ [[N]], [[BYTE_COMPARE]] ]
; CHECK-NEXT:    [[END_PTR:%.*]] = phi ptr [ [[D]], [[WHILE_COND]] ], [ [[D]], [[BYTE_COMPARE]] ]
; CHECK-NEXT:    br label [[END]]
; CHECK:       end:
; CHECK-NEXT:    [[MISMATCH_INDEX:%.*]] = phi i32 [ [[MISMATCH_INDEX1]], [[WHILE_FOUND]] ], [ [[MISMATCH_INDEX2]], [[WHILE_END]] ]
; CHECK-NEXT:    [[STORE_PTR:%.*]] = phi ptr [ [[END_PTR]], [[WHILE_END]] ], [ [[FOUND_PTR]], [[WHILE_FOUND]] ]
; CHECK-NEXT:    store i32 [[MISMATCH_INDEX]], ptr [[STORE_PTR]], align 4
; CHECK-NEXT:    ret i32 [[MISMATCH_INDEX]]
;
; LMUL8-LABEL: define i32 @compare_bytes_simple_end_ne_found(
; LMUL8-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], ptr [[D:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
; LMUL8-NEXT:  entry:
; LMUL8-NEXT:    [[TMP0:%.*]] = add i32 [[LEN]], 1
; LMUL8-NEXT:    br label [[MISMATCH_MIN_IT_CHECK:%.*]]
; LMUL8:       mismatch_min_it_check:
; LMUL8-NEXT:    [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
; LMUL8-NEXT:    [[TMP2:%.*]] = zext i32 [[N]] to i64
; LMUL8-NEXT:    [[TMP3:%.*]] = icmp ule i32 [[TMP0]], [[N]]
; LMUL8-NEXT:    br i1 [[TMP3]], label [[MISMATCH_MEM_CHECK:%.*]], label [[MISMATCH_LOOP_PRE:%.*]], !prof [[PROF0]]
; LMUL8:       mismatch_mem_check:
; LMUL8-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP1]]
; LMUL8-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP1]]
; LMUL8-NEXT:    [[TMP8:%.*]] = ptrtoint ptr [[TMP5]] to i64
; LMUL8-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[TMP4]] to i64
; LMUL8-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]]
; LMUL8-NEXT:    [[TMP13:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]]
; LMUL8-NEXT:    [[TMP11:%.*]] = ptrtoint ptr [[TMP10]] to i64
; LMUL8-NEXT:    [[TMP14:%.*]] = ptrtoint ptr [[TMP13]] to i64
; LMUL8-NEXT:    [[TMP6:%.*]] = lshr i64 [[TMP7]], 12
; LMUL8-NEXT:    [[TMP9:%.*]] = lshr i64 [[TMP11]], 12
; LMUL8-NEXT:    [[TMP12:%.*]] = lshr i64 [[TMP8]], 12
; LMUL8-NEXT:    [[TMP15:%.*]] = lshr i64 [[TMP14]], 12
; LMUL8-NEXT:    [[TMP16:%.*]] = icmp ne i64 [[TMP6]], [[TMP9]]
; LMUL8-NEXT:    [[TMP17:%.*]] = icmp ne i64 [[TMP12]], [[TMP15]]
; LMUL8-NEXT:    [[TMP18:%.*]] = or i1 [[TMP16]], [[TMP17]]
; LMUL8-NEXT:    br i1 [[TMP18]], label [[MISMATCH_LOOP_PRE]], label [[MISMATCH_VECTOR_LOOP_PREHEADER:%.*]], !prof [[PROF1]]
; LMUL8:       mismatch_vec_loop_preheader:
; LMUL8-NEXT:    br label [[MISMATCH_VECTOR_LOOP:%.*]]
; LMUL8:       mismatch_vec_loop:
; LMUL8-NEXT:    [[MISMATCH_VECTOR_INDEX:%.*]] = phi i64 [ [[TMP1]], [[MISMATCH_VECTOR_LOOP_PREHEADER]] ], [ [[TMP24:%.*]], [[MISMATCH_VECTOR_LOOP_INC:%.*]] ]
; LMUL8-NEXT:    [[AVL:%.*]] = sub nuw nsw i64 [[TMP2]], [[MISMATCH_VECTOR_INDEX]]
; LMUL8-NEXT:    [[TMP19:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 64, i1 true)
; LMUL8-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[MISMATCH_VECTOR_INDEX]]
; LMUL8-NEXT:    [[LHS_LOAD:%.*]] = call <vscale x 64 x i8> @llvm.vp.load.nxv64i8.p0(ptr [[TMP20]], <vscale x 64 x i1> shufflevector (<vscale x 64 x i1> insertelement (<vscale x 64 x i1> poison, i1 true, i64 0), <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer), i32 [[TMP19]])
; LMUL8-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[MISMATCH_VECTOR_INDEX]]
; LMUL8-NEXT:    [[RHS_LOAD:%.*]] = call <vscale x 64 x i8> @llvm.vp.load.nxv64i8.p0(ptr [[TMP21]], <vscale x 64 x i1> shufflevector (<vscale x 64 x i1> insertelement (<vscale x 64 x i1> poison, i1 true, i64 0), <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer), i32 [[TMP19]])
; LMUL8-NEXT:    [[MISMATCH_CMP:%.*]] = call <vscale x 64 x i1> @llvm.vp.icmp.nxv64i8(<vscale x 64 x i8> [[LHS_LOAD]], <vscale x 64 x i8> [[RHS_LOAD]], metadata !"ne", <vscale x 64 x i1> shufflevector (<vscale x 64 x i1> insertelement (<vscale x 64 x i1> poison, i1 true, i64 0), <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer), i32 [[TMP19]])
; LMUL8-NEXT:    [[FIRST:%.*]] = call i32 @llvm.vp.cttz.elts.i32.nxv64i1(<vscale x 64 x i1> [[MISMATCH_CMP]], i1 false, <vscale x 64 x i1> shufflevector (<vscale x 64 x i1> insertelement (<vscale x 64 x i1> poison, i1 true, i64 0), <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer), i32 [[TMP19]])
; LMUL8-NEXT:    [[TMP22:%.*]] = icmp ne i32 [[FIRST]], [[TMP19]]
; LMUL8-NEXT:    br i1 [[TMP22]], label [[MISMATCH_VECTOR_LOOP_FOUND:%.*]], label [[MISMATCH_VECTOR_LOOP_INC]]
; LMUL8:       mismatch_vec_loop_inc:
; LMUL8-NEXT:    [[TMP23:%.*]] = zext i32 [[TMP19]] to i64
; LMUL8-NEXT:    [[TMP24]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX]], [[TMP23]]
; LMUL8-NEXT:    [[TMP25:%.*]] = icmp ne i64 [[TMP24]], [[TMP2]]
; LMUL8-NEXT:    br i1 [[TMP25]], label [[MISMATCH_VECTOR_LOOP]], label [[MISMATCH_END:%.*]]
; LMUL8:       mismatch_vec_loop_found:
; LMUL8-NEXT:    [[FIRST1:%.*]] = phi i32 [ [[FIRST]], [[MISMATCH_VECTOR_LOOP]] ]
; LMUL8-NEXT:    [[MISMATCH_VECTOR_INDEX2:%.*]] = phi i64 [ [[MISMATCH_VECTOR_INDEX]], [[MISMATCH_VECTOR_LOOP]] ]
; LMUL8-NEXT:    [[TMP26:%.*]] = zext i32 [[FIRST1]] to i64
; LMUL8-NEXT:    [[TMP27:%.*]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX2]], [[TMP26]]
; LMUL8-NEXT:    [[TMP28:%.*]] = trunc i64 [[TMP27]] to i32
; LMUL8-NEXT:    br label [[MISMATCH_END]]
; LMUL8:       mismatch_loop_pre:
; LMUL8-NEXT:    br label [[MISMATCH_LOOP:%.*]]
; LMUL8:       mismatch_loop:
; LMUL8-NEXT:    [[MISMATCH_INDEX3:%.*]] = phi i32 [ [[TMP0]], [[MISMATCH_LOOP_PRE]] ], [ [[TMP35:%.*]], [[MISMATCH_LOOP_INC:%.*]] ]
; LMUL8-NEXT:    [[TMP29:%.*]] = zext i32 [[MISMATCH_INDEX3]] to i64
; LMUL8-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP29]]
; LMUL8-NEXT:    [[TMP31:%.*]] = load i8, ptr [[TMP30]], align 1
; LMUL8-NEXT:    [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP29]]
; LMUL8-NEXT:    [[TMP33:%.*]] = load i8, ptr [[TMP32]], align 1
; LMUL8-NEXT:    [[TMP34:%.*]] = icmp eq i8 [[TMP31]], [[TMP33]]
; LMUL8-NEXT:    br i1 [[TMP34]], label [[MISMATCH_LOOP_INC]], label [[MISMATCH_END]]
; LMUL8:       mismatch_loop_inc:
; LMUL8-NEXT:    [[TMP35]] = add i32 [[MISMATCH_INDEX3]], 1
; LMUL8-NEXT:    [[TMP36:%.*]] = icmp eq i32 [[TMP35]], [[N]]
; LMUL8-NEXT:    br i1 [[TMP36]], label [[MISMATCH_END]], label [[MISMATCH_LOOP]]
; LMUL8:       mismatch_end:
; LMUL8-NEXT:    [[MISMATCH_RESULT:%.*]] = phi i32 [ [[N]], [[MISMATCH_LOOP_INC]] ], [ [[MISMATCH_INDEX3]], [[MISMATCH_LOOP]] ], [ [[N]], [[MISMATCH_VECTOR_LOOP_INC]] ], [ [[TMP28]], [[MISMATCH_VECTOR_LOOP_FOUND]] ]
; LMUL8-NEXT:    br i1 true, label [[BYTE_COMPARE:%.*]], label [[WHILE_COND:%.*]]
; LMUL8:       while.cond:
; LMUL8-NEXT:    [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[MISMATCH_END]] ], [ [[MISMATCH_RESULT]], [[WHILE_BODY:%.*]] ]
; LMUL8-NEXT:    [[INC:%.*]] = add i32 [[LEN_ADDR]], 1
; LMUL8-NEXT:    [[CMP_NOT:%.*]] = icmp eq i32 [[MISMATCH_RESULT]], [[N]]
; LMUL8-NEXT:    br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]]
; LMUL8:       while.body:
; LMUL8-NEXT:    [[IDXPROM:%.*]] = zext i32 [[MISMATCH_RESULT]] to i64
; LMUL8-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]]
; LMUL8-NEXT:    [[TMP37:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
; LMUL8-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]]
; LMUL8-NEXT:    [[TMP38:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
; LMUL8-NEXT:    [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP37]], [[TMP38]]
; LMUL8-NEXT:    br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_FOUND:%.*]]
; LMUL8:       while.found:
; LMUL8-NEXT:    [[MISMATCH_INDEX1:%.*]] = phi i32 [ [[MISMATCH_RESULT]], [[WHILE_BODY]] ], [ [[MISMATCH_RESULT]], [[BYTE_COMPARE]] ]
; LMUL8-NEXT:    [[FOUND_PTR:%.*]] = phi ptr [ [[C]], [[WHILE_BODY]] ], [ [[C]], [[BYTE_COMPARE]] ]
; LMUL8-NEXT:    br label [[END:%.*]]
; LMUL8:       byte.compare:
; LMUL8-NEXT:    [[TMP39:%.*]] = icmp eq i32 [[MISMATCH_RESULT]], [[N]]
; LMUL8-NEXT:    br i1 [[TMP39]], label [[WHILE_END]], label [[WHILE_FOUND]]
; LMUL8:       while.end:
; LMUL8-NEXT:    [[MISMATCH_INDEX2:%.*]] = phi i32 [ [[N]], [[WHILE_COND]] ], [ [[N]], [[BYTE_COMPARE]] ]
; LMUL8-NEXT:    [[END_PTR:%.*]] = phi ptr [ [[D]], [[WHILE_COND]] ], [ [[D]], [[BYTE_COMPARE]] ]
; LMUL8-NEXT:    br label [[END]]
; LMUL8:       end:
; LMUL8-NEXT:    [[MISMATCH_INDEX:%.*]] = phi i32 [ [[MISMATCH_INDEX1]], [[WHILE_FOUND]] ], [ [[MISMATCH_INDEX2]], [[WHILE_END]] ]
; LMUL8-NEXT:    [[STORE_PTR:%.*]] = phi ptr [ [[END_PTR]], [[WHILE_END]] ], [ [[FOUND_PTR]], [[WHILE_FOUND]] ]
; LMUL8-NEXT:    store i32 [[MISMATCH_INDEX]], ptr [[STORE_PTR]], align 4
; LMUL8-NEXT:    ret i32 [[MISMATCH_INDEX]]
;
; LOOP-DEL-LABEL: define i32 @compare_bytes_simple_end_ne_found(
; LOOP-DEL-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], ptr [[D:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
; LOOP-DEL-NEXT:  entry:
; LOOP-DEL-NEXT:    [[TMP0:%.*]] = add i32 [[LEN]], 1
; LOOP-DEL-NEXT:    [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
; LOOP-DEL-NEXT:    [[TMP2:%.*]] = zext i32 [[N]] to i64
; LOOP-DEL-NEXT:    [[TMP3:%.*]] = icmp ule i32 [[TMP0]], [[N]]
; LOOP-DEL-NEXT:    br i1 [[TMP3]], label [[MISMATCH_MEM_CHECK:%.*]], label [[MISMATCH_LOOP_PRE:%.*]], !prof [[PROF0]]
; LOOP-DEL:       mismatch_mem_check:
; LOOP-DEL-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP1]]
; LOOP-DEL-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP1]]
; LOOP-DEL-NEXT:    [[TMP8:%.*]] = ptrtoint ptr [[TMP5]] to i64
; LOOP-DEL-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[TMP4]] to i64
; LOOP-DEL-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]]
; LOOP-DEL-NEXT:    [[TMP13:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]]
; LOOP-DEL-NEXT:    [[TMP11:%.*]] = ptrtoint ptr [[TMP10]] to i64
; LOOP-DEL-NEXT:    [[TMP14:%.*]] = ptrtoint ptr [[TMP13]] to i64
; LOOP-DEL-NEXT:    [[TMP6:%.*]] = lshr i64 [[TMP7]], 12
; LOOP-DEL-NEXT:    [[TMP9:%.*]] = lshr i64 [[TMP11]], 12
; LOOP-DEL-NEXT:    [[TMP12:%.*]] = lshr i64 [[TMP8]], 12
; LOOP-DEL-NEXT:    [[TMP15:%.*]] = lshr i64 [[TMP14]], 12
; LOOP-DEL-NEXT:    [[TMP16:%.*]] = icmp ne i64 [[TMP6]], [[TMP9]]
; LOOP-DEL-NEXT:    [[TMP17:%.*]] = icmp ne i64 [[TMP12]], [[TMP15]]
; LOOP-DEL-NEXT:    [[TMP18:%.*]] = or i1 [[TMP16]], [[TMP17]]
; LOOP-DEL-NEXT:    br i1 [[TMP18]], label [[MISMATCH_LOOP_PRE]], label [[MISMATCH_VECTOR_LOOP:%.*]], !prof [[PROF1]]
; LOOP-DEL:       mismatch_vec_loop:
; LOOP-DEL-NEXT:    [[MISMATCH_VECTOR_INDEX:%.*]] = phi i64 [ [[TMP24:%.*]], [[MISMATCH_VECTOR_LOOP_INC:%.*]] ], [ [[TMP1]], [[MISMATCH_MEM_CHECK]] ]
; LOOP-DEL-NEXT:    [[AVL:%.*]] = sub nuw nsw i64 [[TMP2]], [[MISMATCH_VECTOR_INDEX]]
; LOOP-DEL-NEXT:    [[TMP19:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 16, i1 true)
; LOOP-DEL-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[MISMATCH_VECTOR_INDEX]]
; LOOP-DEL-NEXT:    [[LHS_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr [[TMP20]], <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
; LOOP-DEL-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[MISMATCH_VECTOR_INDEX]]
; LOOP-DEL-NEXT:    [[RHS_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr [[TMP21]], <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
; LOOP-DEL-NEXT:    [[MISMATCH_CMP:%.*]] = call <vscale x 16 x i1> @llvm.vp.icmp.nxv16i8(<vscale x 16 x i8> [[LHS_LOAD]], <vscale x 16 x i8> [[RHS_LOAD]], metadata !"ne", <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
; LOOP-DEL-NEXT:    [[FIRST:%.*]] = call i32 @llvm.vp.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> [[MISMATCH_CMP]], i1 false, <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
; LOOP-DEL-NEXT:    [[TMP22:%.*]] = icmp ne i32 [[FIRST]], [[TMP19]]
; LOOP-DEL-NEXT:    br i1 [[TMP22]], label [[MISMATCH_VECTOR_LOOP_FOUND:%.*]], label [[MISMATCH_VECTOR_LOOP_INC]]
; LOOP-DEL:       mismatch_vec_loop_inc:
; LOOP-DEL-NEXT:    [[TMP23:%.*]] = zext i32 [[TMP19]] to i64
; LOOP-DEL-NEXT:    [[TMP24]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX]], [[TMP23]]
; LOOP-DEL-NEXT:    [[TMP25:%.*]] = icmp ne i64 [[TMP24]], [[TMP2]]
; LOOP-DEL-NEXT:    br i1 [[TMP25]], label [[MISMATCH_VECTOR_LOOP]], label [[BYTE_COMPARE:%.*]]
; LOOP-DEL:       mismatch_vec_loop_found:
; LOOP-DEL-NEXT:    [[FIRST1:%.*]] = phi i32 [ [[FIRST]], [[MISMATCH_VECTOR_LOOP]] ]
; LOOP-DEL-NEXT:    [[MISMATCH_VECTOR_INDEX2:%.*]] = phi i64 [ [[MISMATCH_VECTOR_INDEX]], [[MISMATCH_VECTOR_LOOP]] ]
; LOOP-DEL-NEXT:    [[TMP26:%.*]] = zext i32 [[FIRST1]] to i64
; LOOP-DEL-NEXT:    [[TMP27:%.*]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX2]], [[TMP26]]
; LOOP-DEL-NEXT:    [[TMP28:%.*]] = trunc i64 [[TMP27]] to i32
; LOOP-DEL-NEXT:    br label [[BYTE_COMPARE]]
; LOOP-DEL:       mismatch_loop_pre:
; LOOP-DEL-NEXT:    br label [[MISMATCH_LOOP:%.*]]
; LOOP-DEL:       mismatch_loop:
; LOOP-DEL-NEXT:    [[MISMATCH_INDEX3:%.*]] = phi i32 [ [[TMP0]], [[MISMATCH_LOOP_PRE]] ], [ [[TMP35:%.*]], [[MISMATCH_LOOP_INC:%.*]] ]
; LOOP-DEL-NEXT:    [[TMP29:%.*]] = zext i32 [[MISMATCH_INDEX3]] to i64
; LOOP-DEL-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP29]]
; LOOP-DEL-NEXT:    [[TMP31:%.*]] = load i8, ptr [[TMP30]], align 1
; LOOP-DEL-NEXT:    [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP29]]
; LOOP-DEL-NEXT:    [[TMP33:%.*]] = load i8, ptr [[TMP32]], align 1
; LOOP-DEL-NEXT:    [[TMP34:%.*]] = icmp eq i8 [[TMP31]], [[TMP33]]
; LOOP-DEL-NEXT:    br i1 [[TMP34]], label [[MISMATCH_LOOP_INC]], label [[BYTE_COMPARE]]
; LOOP-DEL:       mismatch_loop_inc:
; LOOP-DEL-NEXT:    [[TMP35]] = add i32 [[MISMATCH_INDEX3]], 1
; LOOP-DEL-NEXT:    [[TMP36:%.*]] = icmp eq i32 [[TMP35]], [[N]]
; LOOP-DEL-NEXT:    br i1 [[TMP36]], label [[BYTE_COMPARE]], label [[MISMATCH_LOOP]]
; LOOP-DEL:       byte.compare:
; LOOP-DEL-NEXT:    [[MISMATCH_RESULT:%.*]] = phi i32 [ [[N]], [[MISMATCH_LOOP_INC]] ], [ [[MISMATCH_INDEX3]], [[MISMATCH_LOOP]] ], [ [[N]], [[MISMATCH_VECTOR_LOOP_INC]] ], [ [[TMP28]], [[MISMATCH_VECTOR_LOOP_FOUND]] ]
; LOOP-DEL-NEXT:    [[TMP37:%.*]] = icmp eq i32 [[MISMATCH_RESULT]], [[N]]
; LOOP-DEL-NEXT:    [[SPEC_SELECT:%.*]] = select i1 [[TMP37]], i32 [[N]], i32 [[MISMATCH_RESULT]]
; LOOP-DEL-NEXT:    [[SPEC_SELECT4:%.*]] = select i1 [[TMP37]], ptr [[D]], ptr [[C]]
; LOOP-DEL-NEXT:    store i32 [[SPEC_SELECT]], ptr [[SPEC_SELECT4]], align 4
; LOOP-DEL-NEXT:    ret i32 [[SPEC_SELECT]]
;
; MASKED-LABEL: define i32 @compare_bytes_simple_end_ne_found(
; MASKED-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], ptr [[D:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
; MASKED-NEXT:  entry:
; MASKED-NEXT:    [[TMP0:%.*]] = add i32 [[LEN]], 1
; MASKED-NEXT:    br label [[MISMATCH_MIN_IT_CHECK:%.*]]
; MASKED:       mismatch_min_it_check:
; MASKED-NEXT:    [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
; MASKED-NEXT:    [[TMP2:%.*]] = zext i32 [[N]] to i64
; MASKED-NEXT:    [[TMP3:%.*]] = icmp ule i32 [[TMP0]], [[N]]
; MASKED-NEXT:    br i1 [[TMP3]], label [[MISMATCH_MEM_CHECK:%.*]], label [[MISMATCH_LOOP_PRE:%.*]], !prof [[PROF0]]
; MASKED:       mismatch_mem_check:
; MASKED-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP1]]
; MASKED-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP1]]
; MASKED-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[TMP5]] to i64
; MASKED-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[TMP4]] to i64
; MASKED-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]]
; MASKED-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]]
; MASKED-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[TMP8]] to i64
; MASKED-NEXT:    [[TMP11:%.*]] = ptrtoint ptr [[TMP9]] to i64
; MASKED-NEXT:    [[TMP12:%.*]] = lshr i64 [[TMP7]], 12
; MASKED-NEXT:    [[TMP13:%.*]] = lshr i64 [[TMP10]], 12
; MASKED-NEXT:    [[TMP14:%.*]] = lshr i64 [[TMP6]], 12
; MASKED-NEXT:    [[TMP15:%.*]] = lshr i64 [[TMP11]], 12
; MASKED-NEXT:    [[TMP16:%.*]] = icmp ne i64 [[TMP12]], [[TMP13]]
; MASKED-NEXT:    [[TMP17:%.*]] = icmp ne i64 [[TMP14]], [[TMP15]]
; MASKED-NEXT:    [[TMP18:%.*]] = or i1 [[TMP16]], [[TMP17]]
; MASKED-NEXT:    br i1 [[TMP18]], label [[MISMATCH_LOOP_PRE]], label [[MISMATCH_VEC_LOOP_PREHEADER:%.*]], !prof [[PROF1]]
; MASKED:       mismatch_vec_loop_preheader:
; MASKED-NEXT:    [[TMP19:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP1]], i64 [[TMP2]])
; MASKED-NEXT:    [[TMP20:%.*]] = call i64 @llvm.vscale.i64()
; MASKED-NEXT:    [[TMP21:%.*]] = mul nuw nsw i64 [[TMP20]], 16
; MASKED-NEXT:    br label [[MISMATCH_VEC_LOOP:%.*]]
; MASKED:       mismatch_vec_loop:
; MASKED-NEXT:    [[MISMATCH_VEC_LOOP_PRED:%.*]] = phi <vscale x 16 x i1> [ [[TMP19]], [[MISMATCH_VEC_LOOP_PREHEADER]] ], [ [[TMP30:%.*]], [[MISMATCH_VEC_LOOP_INC:%.*]] ]
; MASKED-NEXT:    [[MISMATCH_VEC_INDEX:%.*]] = phi i64 [ [[TMP1]], [[MISMATCH_VEC_LOOP_PREHEADER]] ], [ [[TMP29:%.*]], [[MISMATCH_VEC_LOOP_INC]] ]
; MASKED-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[MISMATCH_VEC_INDEX]]
; MASKED-NEXT:    [[TMP23:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP22]], i32 1, <vscale x 16 x i1> [[MISMATCH_VEC_LOOP_PRED]], <vscale x 16 x i8> zeroinitializer)
; MASKED-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[MISMATCH_VEC_INDEX]]
; MASKED-NEXT:    [[TMP25:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP24]], i32 1, <vscale x 16 x i1> [[MISMATCH_VEC_LOOP_PRED]], <vscale x 16 x i8> zeroinitializer)
; MASKED-NEXT:    [[TMP26:%.*]] = icmp ne <vscale x 16 x i8> [[TMP23]], [[TMP25]]
; MASKED-NEXT:    [[TMP27:%.*]] = select <vscale x 16 x i1> [[MISMATCH_VEC_LOOP_PRED]], <vscale x 16 x i1> [[TMP26]], <vscale x 16 x i1> zeroinitializer
; MASKED-NEXT:    [[TMP28:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1(<vscale x 16 x i1> [[TMP27]])
; MASKED-NEXT:    br i1 [[TMP28]], label [[MISMATCH_VEC_LOOP_FOUND:%.*]], label [[MISMATCH_VEC_LOOP_INC]]
; MASKED:       mismatch_vec_loop_inc:
; MASKED-NEXT:    [[TMP29]] = add nuw nsw i64 [[MISMATCH_VEC_INDEX]], [[TMP21]]
; MASKED-NEXT:    [[TMP30]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP29]], i64 [[TMP2]])
; MASKED-NEXT:    [[TMP31:%.*]] = extractelement <vscale x 16 x i1> [[TMP30]], i64 0
; MASKED-NEXT:    br i1 [[TMP31]], label [[MISMATCH_VEC_LOOP]], label [[MISMATCH_END:%.*]]
; MASKED:       mismatch_vec_loop_found:
; MASKED-NEXT:    [[MISMATCH_VEC_FOUND_PRED:%.*]] = phi <vscale x 16 x i1> [ [[TMP27]], [[MISMATCH_VEC_LOOP]] ]
; MASKED-NEXT:    [[MISMATCH_VEC_LAST_LOOP_PRED:%.*]] = phi <vscale x 16 x i1> [ [[MISMATCH_VEC_LOOP_PRED]], [[MISMATCH_VEC_LOOP]] ]
; MASKED-NEXT:    [[MISMATCH_VEC_FOUND_INDEX:%.*]] = phi i64 [ [[MISMATCH_VEC_INDEX]], [[MISMATCH_VEC_LOOP]] ]
; MASKED-NEXT:    [[TMP32:%.*]] = and <vscale x 16 x i1> [[MISMATCH_VEC_LAST_LOOP_PRED]], [[MISMATCH_VEC_FOUND_PRED]]
; MASKED-NEXT:    [[TMP33:%.*]] = call i32 @llvm.experimental.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> [[TMP32]], i1 true)
; MASKED-NEXT:    [[TMP34:%.*]] = zext i32 [[TMP33]] to i64
; MASKED-NEXT:    [[TMP35:%.*]] = add nuw nsw i64 [[MISMATCH_VEC_FOUND_INDEX]], [[TMP34]]
; MASKED-NEXT:    [[TMP36:%.*]] = trunc i64 [[TMP35]] to i32
; MASKED-NEXT:    br label [[MISMATCH_END]]
; MASKED:       mismatch_loop_pre:
; MASKED-NEXT:    br label [[MISMATCH_LOOP:%.*]]
; MASKED:       mismatch_loop:
; MASKED-NEXT:    [[MISMATCH_INDEX3:%.*]] = phi i32 [ [[TMP0]], [[MISMATCH_LOOP_PRE]] ], [ [[TMP43:%.*]], [[MISMATCH_LOOP_INC:%.*]] ]
; MASKED-NEXT:    [[TMP37:%.*]] = zext i32 [[MISMATCH_INDEX3]] to i64
; MASKED-NEXT:    [[TMP38:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP37]]
; MASKED-NEXT:    [[TMP39:%.*]] = load i8, ptr [[TMP38]], align 1
; MASKED-NEXT:    [[TMP40:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP37]]
; MASKED-NEXT:    [[TMP41:%.*]] = load i8, ptr [[TMP40]], align 1
; MASKED-NEXT:    [[TMP42:%.*]] = icmp eq i8 [[TMP39]], [[TMP41]]
; MASKED-NEXT:    br i1 [[TMP42]], label [[MISMATCH_LOOP_INC]], label [[MISMATCH_END]]
; MASKED:       mismatch_loop_inc:
; MASKED-NEXT:    [[TMP43]] = add i32 [[MISMATCH_INDEX3]], 1
; MASKED-NEXT:    [[TMP44:%.*]] = icmp eq i32 [[TMP43]], [[N]]
; MASKED-NEXT:    br i1 [[TMP44]], label [[MISMATCH_END]], label [[MISMATCH_LOOP]]
; MASKED:       mismatch_end:
; MASKED-NEXT:    [[MISMATCH_RESULT:%.*]] = phi i32 [ [[N]], [[MISMATCH_LOOP_INC]] ], [ [[MISMATCH_INDEX3]], [[MISMATCH_LOOP]] ], [ [[N]], [[MISMATCH_VEC_LOOP_INC]] ], [ [[TMP36]], [[MISMATCH_VEC_LOOP_FOUND]] ]
; MASKED-NEXT:    br i1 true, label [[BYTE_COMPARE:%.*]], label [[WHILE_COND:%.*]]
; MASKED:       while.cond:
; MASKED-NEXT:    [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[MISMATCH_END]] ], [ [[MISMATCH_RESULT]], [[WHILE_BODY:%.*]] ]
; MASKED-NEXT:    [[INC:%.*]] = add i32 [[LEN_ADDR]], 1
; MASKED-NEXT:    [[CMP_NOT:%.*]] = icmp eq i32 [[MISMATCH_RESULT]], [[N]]
; MASKED-NEXT:    br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]]
; MASKED:       while.body:
; MASKED-NEXT:    [[IDXPROM:%.*]] = zext i32 [[MISMATCH_RESULT]] to i64
; MASKED-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]]
; MASKED-NEXT:    [[TMP45:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
; MASKED-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]]
; MASKED-NEXT:    [[TMP46:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
; MASKED-NEXT:    [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP45]], [[TMP46]]
; MASKED-NEXT:    br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_FOUND:%.*]]
; MASKED:       while.found:
; MASKED-NEXT:    [[MISMATCH_INDEX1:%.*]] = phi i32 [ [[MISMATCH_RESULT]], [[WHILE_BODY]] ], [ [[MISMATCH_RESULT]], [[BYTE_COMPARE]] ]
; MASKED-NEXT:    [[FOUND_PTR:%.*]] = phi ptr [ [[C]], [[WHILE_BODY]] ], [ [[C]], [[BYTE_COMPARE]] ]
; MASKED-NEXT:    br label [[END:%.*]]
; MASKED:       byte.compare:
; MASKED-NEXT:    [[TMP47:%.*]] = icmp eq i32 [[MISMATCH_RESULT]], [[N]]
; MASKED-NEXT:    br i1 [[TMP47]], label [[WHILE_END]], label [[WHILE_FOUND]]
; MASKED:       while.end:
; MASKED-NEXT:    [[MISMATCH_INDEX2:%.*]] = phi i32 [ [[N]], [[WHILE_COND]] ], [ [[N]], [[BYTE_COMPARE]] ]
; MASKED-NEXT:    [[END_PTR:%.*]] = phi ptr [ [[D]], [[WHILE_COND]] ], [ [[D]], [[BYTE_COMPARE]] ]
; MASKED-NEXT:    br label [[END]]
; MASKED:       end:
; MASKED-NEXT:    [[MISMATCH_INDEX:%.*]] = phi i32 [ [[MISMATCH_INDEX1]], [[WHILE_FOUND]] ], [ [[MISMATCH_INDEX2]], [[WHILE_END]] ]
; MASKED-NEXT:    [[STORE_PTR:%.*]] = phi ptr [ [[END_PTR]], [[WHILE_END]] ], [ [[FOUND_PTR]], [[WHILE_FOUND]] ]
; MASKED-NEXT:    store i32 [[MISMATCH_INDEX]], ptr [[STORE_PTR]], align 4
; MASKED-NEXT:    ret i32 [[MISMATCH_INDEX]]
;
; NO-TRANSFORM-LABEL: define i32 @compare_bytes_simple_end_ne_found(
; NO-TRANSFORM-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], ptr [[D:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) {
; NO-TRANSFORM-NEXT:  entry:
; NO-TRANSFORM-NEXT:    br label [[WHILE_COND:%.*]]
; NO-TRANSFORM:       while.cond:
; NO-TRANSFORM-NEXT:    [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[ENTRY:%.*]] ], [ [[INC:%.*]], [[WHILE_BODY:%.*]] ]
; NO-TRANSFORM-NEXT:    [[INC]] = add i32 [[LEN_ADDR]], 1
; NO-TRANSFORM-NEXT:    [[CMP_NOT:%.*]] = icmp eq i32 [[INC]], [[N]]
; NO-TRANSFORM-NEXT:    br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]]
; NO-TRANSFORM:       while.body:
; NO-TRANSFORM-NEXT:    [[IDXPROM:%.*]] = zext i32 [[INC]] to i64
; NO-TRANSFORM-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]]
; NO-TRANSFORM-NEXT:    [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
; NO-TRANSFORM-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]]
; NO-TRANSFORM-NEXT:    [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
; NO-TRANSFORM-NEXT:    [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP0]], [[TMP1]]
; NO-TRANSFORM-NEXT:    br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_FOUND:%.*]]
; NO-TRANSFORM:       while.found:
; NO-TRANSFORM-NEXT:    [[MISMATCH_INDEX1:%.*]] = phi i32 [ [[INC]], [[WHILE_BODY]] ]
; NO-TRANSFORM-NEXT:    [[FOUND_PTR:%.*]] = phi ptr [ [[C]], [[WHILE_BODY]] ]
; NO-TRANSFORM-NEXT:    br label [[END:%.*]]
; NO-TRANSFORM:       while.end:
; NO-TRANSFORM-NEXT:    [[MISMATCH_INDEX2:%.*]] = phi i32 [ [[N]], [[WHILE_COND]] ]
; NO-TRANSFORM-NEXT:    [[END_PTR:%.*]] = phi ptr [ [[D]], [[WHILE_COND]] ]
; NO-TRANSFORM-NEXT:    br label [[END]]
; NO-TRANSFORM:       end:
; NO-TRANSFORM-NEXT:    [[MISMATCH_INDEX:%.*]] = phi i32 [ [[MISMATCH_INDEX1]], [[WHILE_FOUND]] ], [ [[MISMATCH_INDEX2]], [[WHILE_END]] ]
; NO-TRANSFORM-NEXT:    [[STORE_PTR:%.*]] = phi ptr [ [[END_PTR]], [[WHILE_END]] ], [ [[FOUND_PTR]], [[WHILE_FOUND]] ]
; NO-TRANSFORM-NEXT:    store i32 [[MISMATCH_INDEX]], ptr [[STORE_PTR]], align 4
; NO-TRANSFORM-NEXT:    ret i32 [[MISMATCH_INDEX]]
entry:
  br label %while.cond

while.cond:
  %len.addr = phi i32 [ %len, %entry ], [ %inc, %while.body ]
  %inc = add i32 %len.addr, 1
  %cmp.not = icmp eq i32 %inc, %n
  br i1 %cmp.not, label %while.end, label %while.body

while.body:
  %idxprom = zext i32 %inc to i64
  %arrayidx = getelementptr inbounds i8, ptr %a, i64 %idxprom
  %0 = load i8, ptr %arrayidx
  %arrayidx2 = getelementptr inbounds i8, ptr %b, i64 %idxprom
  %1 = load i8, ptr %arrayidx2
  %cmp.not2 = icmp eq i8 %0, %1
  br i1 %cmp.not2, label %while.cond, label %while.found

while.found:
  %mismatch_index1 = phi i32 [ %inc, %while.body ]
  %found_ptr = phi ptr [ %c, %while.body ]
  br label %end

while.end:
  %mismatch_index2 = phi i32 [ %n, %while.cond ]
  %end_ptr = phi ptr [ %d, %while.cond ]
  br label %end

end:
  %mismatch_index = phi i32 [ %mismatch_index1, %while.found ], [ %mismatch_index2, %while.end ]
  %store_ptr = phi ptr [ %end_ptr, %while.end ], [ %found_ptr, %while.found ]
  store i32 %mismatch_index, ptr %store_ptr
  ret i32 %mismatch_index
}



define i32 @compare_bytes_extra_cmp(ptr %a, ptr %b, i32 %len, i32 %n, i32 %x) {
; CHECK-LABEL: define i32 @compare_bytes_extra_cmp(
; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]], i32 [[X:%.*]]) #[[ATTR0]] {
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[CMP_X:%.*]] = icmp ult i32 [[N]], [[X]]
; CHECK-NEXT:    br i1 [[CMP_X]], label [[PH:%.*]], label [[WHILE_END:%.*]]
; CHECK:       ph:
; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[LEN]], 1
; CHECK-NEXT:    br label [[MISMATCH_MIN_IT_CHECK:%.*]]
; CHECK:       mismatch_min_it_check:
; CHECK-NEXT:    [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
; CHECK-NEXT:    [[TMP2:%.*]] = zext i32 [[N]] to i64
; CHECK-NEXT:    [[TMP3:%.*]] = icmp ule i32 [[TMP0]], [[N]]
; CHECK-NEXT:    br i1 [[TMP3]], label [[MISMATCH_MEM_CHECK:%.*]], label [[MISMATCH_LOOP_PRE:%.*]], !prof [[PROF0]]
; CHECK:       mismatch_mem_check:
; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP1]]
; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP1]]
; CHECK-NEXT:    [[TMP8:%.*]] = ptrtoint ptr [[TMP5]] to i64
; CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[TMP4]] to i64
; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]]
; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]]
; CHECK-NEXT:    [[TMP11:%.*]] = ptrtoint ptr [[TMP10]] to i64
; CHECK-NEXT:    [[TMP14:%.*]] = ptrtoint ptr [[TMP13]] to i64
; CHECK-NEXT:    [[TMP6:%.*]] = lshr i64 [[TMP7]], 12
; CHECK-NEXT:    [[TMP9:%.*]] = lshr i64 [[TMP11]], 12
; CHECK-NEXT:    [[TMP12:%.*]] = lshr i64 [[TMP8]], 12
; CHECK-NEXT:    [[TMP15:%.*]] = lshr i64 [[TMP14]], 12
; CHECK-NEXT:    [[TMP16:%.*]] = icmp ne i64 [[TMP6]], [[TMP9]]
; CHECK-NEXT:    [[TMP17:%.*]] = icmp ne i64 [[TMP12]], [[TMP15]]
; CHECK-NEXT:    [[TMP18:%.*]] = or i1 [[TMP16]], [[TMP17]]
; CHECK-NEXT:    br i1 [[TMP18]], label [[MISMATCH_LOOP_PRE]], label [[MISMATCH_VECTOR_LOOP_PREHEADER:%.*]], !prof [[PROF1]]
; CHECK:       mismatch_vec_loop_preheader:
; CHECK-NEXT:    br label [[MISMATCH_VECTOR_LOOP:%.*]]
; CHECK:       mismatch_vec_loop:
; CHECK-NEXT:    [[MISMATCH_VECTOR_INDEX:%.*]] = phi i64 [ [[TMP1]], [[MISMATCH_VECTOR_LOOP_PREHEADER]] ], [ [[TMP24:%.*]], [[MISMATCH_VECTOR_LOOP_INC:%.*]] ]
; CHECK-NEXT:    [[AVL:%.*]] = sub nuw nsw i64 [[TMP2]], [[MISMATCH_VECTOR_INDEX]]
; CHECK-NEXT:    [[TMP19:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 16, i1 true)
; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[MISMATCH_VECTOR_INDEX]]
; CHECK-NEXT:    [[LHS_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr [[TMP20]], <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[MISMATCH_VECTOR_INDEX]]
; CHECK-NEXT:    [[RHS_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr [[TMP21]], <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
; CHECK-NEXT:    [[MISMATCH_CMP:%.*]] = call <vscale x 16 x i1> @llvm.vp.icmp.nxv16i8(<vscale x 16 x i8> [[LHS_LOAD]], <vscale x 16 x i8> [[RHS_LOAD]], metadata !"ne", <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
; CHECK-NEXT:    [[FIRST:%.*]] = call i32 @llvm.vp.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> [[MISMATCH_CMP]], i1 false, <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
; CHECK-NEXT:    [[TMP22:%.*]] = icmp ne i32 [[FIRST]], [[TMP19]]
; CHECK-NEXT:    br i1 [[TMP22]], label [[MISMATCH_VECTOR_LOOP_FOUND:%.*]], label [[MISMATCH_VECTOR_LOOP_INC]]
; CHECK:       mismatch_vec_loop_inc:
; CHECK-NEXT:    [[TMP23:%.*]] = zext i32 [[TMP19]] to i64
; CHECK-NEXT:    [[TMP24]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX]], [[TMP23]]
; CHECK-NEXT:    [[TMP25:%.*]] = icmp ne i64 [[TMP24]], [[TMP2]]
; CHECK-NEXT:    br i1 [[TMP25]], label [[MISMATCH_VECTOR_LOOP]], label [[MISMATCH_END:%.*]]
; CHECK:       mismatch_vec_loop_found:
; CHECK-NEXT:    [[FIRST2:%.*]] = phi i32 [ [[FIRST]], [[MISMATCH_VECTOR_LOOP]] ]
; CHECK-NEXT:    [[MISMATCH_VECTOR_INDEX3:%.*]] = phi i64 [ [[MISMATCH_VECTOR_INDEX]], [[MISMATCH_VECTOR_LOOP]] ]
; CHECK-NEXT:    [[TMP26:%.*]] = zext i32 [[FIRST2]] to i64
; CHECK-NEXT:    [[TMP27:%.*]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX3]], [[TMP26]]
; CHECK-NEXT:    [[TMP28:%.*]] = trunc i64 [[TMP27]] to i32
; CHECK-NEXT:    br label [[MISMATCH_END]]
; CHECK:       mismatch_loop_pre:
; CHECK-NEXT:    br label [[MISMATCH_LOOP:%.*]]
; CHECK:       mismatch_loop:
; CHECK-NEXT:    [[MISMATCH_INDEX:%.*]] = phi i32 [ [[TMP0]], [[MISMATCH_LOOP_PRE]] ], [ [[TMP35:%.*]], [[MISMATCH_LOOP_INC:%.*]] ]
; CHECK-NEXT:    [[TMP29:%.*]] = zext i32 [[MISMATCH_INDEX]] to i64
; CHECK-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP29]]
; CHECK-NEXT:    [[TMP31:%.*]] = load i8, ptr [[TMP30]], align 1
; CHECK-NEXT:    [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP29]]
; CHECK-NEXT:    [[TMP33:%.*]] = load i8, ptr [[TMP32]], align 1
; CHECK-NEXT:    [[TMP34:%.*]] = icmp eq i8 [[TMP31]], [[TMP33]]
; CHECK-NEXT:    br i1 [[TMP34]], label [[MISMATCH_LOOP_INC]], label [[MISMATCH_END]]
; CHECK:       mismatch_loop_inc:
; CHECK-NEXT:    [[TMP35]] = add i32 [[MISMATCH_INDEX]], 1
; CHECK-NEXT:    [[TMP36:%.*]] = icmp eq i32 [[TMP35]], [[N]]
; CHECK-NEXT:    br i1 [[TMP36]], label [[MISMATCH_END]], label [[MISMATCH_LOOP]]
; CHECK:       mismatch_end:
; CHECK-NEXT:    [[MISMATCH_RESULT:%.*]] = phi i32 [ [[N]], [[MISMATCH_LOOP_INC]] ], [ [[MISMATCH_INDEX]], [[MISMATCH_LOOP]] ], [ [[N]], [[MISMATCH_VECTOR_LOOP_INC]] ], [ [[TMP28]], [[MISMATCH_VECTOR_LOOP_FOUND]] ]
; CHECK-NEXT:    br i1 true, label [[BYTE_COMPARE:%.*]], label [[WHILE_COND:%.*]]
; CHECK:       while.cond:
; CHECK-NEXT:    [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[MISMATCH_END]] ], [ [[MISMATCH_RESULT]], [[WHILE_BODY:%.*]] ]
; CHECK-NEXT:    [[INC:%.*]] = add i32 [[LEN_ADDR]], 1
; CHECK-NEXT:    [[CMP_NOT:%.*]] = icmp eq i32 [[MISMATCH_RESULT]], [[N]]
; CHECK-NEXT:    br i1 [[CMP_NOT]], label [[WHILE_END_LOOPEXIT:%.*]], label [[WHILE_BODY]]
; CHECK:       while.body:
; CHECK-NEXT:    [[IDXPROM:%.*]] = zext i32 [[MISMATCH_RESULT]] to i64
; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]]
; CHECK-NEXT:    [[TMP37:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]]
; CHECK-NEXT:    [[TMP38:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
; CHECK-NEXT:    [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP37]], [[TMP38]]
; CHECK-NEXT:    br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END_LOOPEXIT]]
; CHECK:       byte.compare:
; CHECK-NEXT:    br label [[WHILE_END_LOOPEXIT]]
; CHECK:       while.end.loopexit:
; CHECK-NEXT:    [[INC_LCSSA1:%.*]] = phi i32 [ [[MISMATCH_RESULT]], [[WHILE_COND]] ], [ [[MISMATCH_RESULT]], [[WHILE_BODY]] ], [ [[MISMATCH_RESULT]], [[BYTE_COMPARE]] ]
; CHECK-NEXT:    br label [[WHILE_END]]
; CHECK:       while.end:
; CHECK-NEXT:    [[INC_LCSSA:%.*]] = phi i32 [ [[X]], [[ENTRY:%.*]] ], [ [[INC_LCSSA1]], [[WHILE_END_LOOPEXIT]] ]
; CHECK-NEXT:    ret i32 [[INC_LCSSA]]
;
; LMUL8-LABEL: define i32 @compare_bytes_extra_cmp(
; LMUL8-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]], i32 [[X:%.*]]) #[[ATTR0]] {
; LMUL8-NEXT:  entry:
; LMUL8-NEXT:    [[CMP_X:%.*]] = icmp ult i32 [[N]], [[X]]
; LMUL8-NEXT:    br i1 [[CMP_X]], label [[PH:%.*]], label [[WHILE_END:%.*]]
; LMUL8:       ph:
; LMUL8-NEXT:    [[TMP0:%.*]] = add i32 [[LEN]], 1
; LMUL8-NEXT:    br label [[MISMATCH_MIN_IT_CHECK:%.*]]
; LMUL8:       mismatch_min_it_check:
; LMUL8-NEXT:    [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
; LMUL8-NEXT:    [[TMP2:%.*]] = zext i32 [[N]] to i64
; LMUL8-NEXT:    [[TMP3:%.*]] = icmp ule i32 [[TMP0]], [[N]]
; LMUL8-NEXT:    br i1 [[TMP3]], label [[MISMATCH_MEM_CHECK:%.*]], label [[MISMATCH_LOOP_PRE:%.*]], !prof [[PROF0]]
; LMUL8:       mismatch_mem_check:
; LMUL8-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP1]]
; LMUL8-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP1]]
; LMUL8-NEXT:    [[TMP8:%.*]] = ptrtoint ptr [[TMP5]] to i64
; LMUL8-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[TMP4]] to i64
; LMUL8-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]]
; LMUL8-NEXT:    [[TMP13:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]]
; LMUL8-NEXT:    [[TMP11:%.*]] = ptrtoint ptr [[TMP10]] to i64
; LMUL8-NEXT:    [[TMP14:%.*]] = ptrtoint ptr [[TMP13]] to i64
; LMUL8-NEXT:    [[TMP6:%.*]] = lshr i64 [[TMP7]], 12
; LMUL8-NEXT:    [[TMP9:%.*]] = lshr i64 [[TMP11]], 12
; LMUL8-NEXT:    [[TMP12:%.*]] = lshr i64 [[TMP8]], 12
; LMUL8-NEXT:    [[TMP15:%.*]] = lshr i64 [[TMP14]], 12
; LMUL8-NEXT:    [[TMP16:%.*]] = icmp ne i64 [[TMP6]], [[TMP9]]
; LMUL8-NEXT:    [[TMP17:%.*]] = icmp ne i64 [[TMP12]], [[TMP15]]
; LMUL8-NEXT:    [[TMP18:%.*]] = or i1 [[TMP16]], [[TMP17]]
; LMUL8-NEXT:    br i1 [[TMP18]], label [[MISMATCH_LOOP_PRE]], label [[MISMATCH_VECTOR_LOOP_PREHEADER:%.*]], !prof [[PROF1]]
; LMUL8:       mismatch_vec_loop_preheader:
; LMUL8-NEXT:    br label [[MISMATCH_VECTOR_LOOP:%.*]]
; LMUL8:       mismatch_vec_loop:
; LMUL8-NEXT:    [[MISMATCH_VECTOR_INDEX:%.*]] = phi i64 [ [[TMP1]], [[MISMATCH_VECTOR_LOOP_PREHEADER]] ], [ [[TMP24:%.*]], [[MISMATCH_VECTOR_LOOP_INC:%.*]] ]
; LMUL8-NEXT:    [[AVL:%.*]] = sub nuw nsw i64 [[TMP2]], [[MISMATCH_VECTOR_INDEX]]
; LMUL8-NEXT:    [[TMP19:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 64, i1 true)
; LMUL8-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[MISMATCH_VECTOR_INDEX]]
; LMUL8-NEXT:    [[LHS_LOAD:%.*]] = call <vscale x 64 x i8> @llvm.vp.load.nxv64i8.p0(ptr [[TMP20]], <vscale x 64 x i1> shufflevector (<vscale x 64 x i1> insertelement (<vscale x 64 x i1> poison, i1 true, i64 0), <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer), i32 [[TMP19]])
; LMUL8-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[MISMATCH_VECTOR_INDEX]]
; LMUL8-NEXT:    [[RHS_LOAD:%.*]] = call <vscale x 64 x i8> @llvm.vp.load.nxv64i8.p0(ptr [[TMP21]], <vscale x 64 x i1> shufflevector (<vscale x 64 x i1> insertelement (<vscale x 64 x i1> poison, i1 true, i64 0), <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer), i32 [[TMP19]])
; LMUL8-NEXT:    [[MISMATCH_CMP:%.*]] = call <vscale x 64 x i1> @llvm.vp.icmp.nxv64i8(<vscale x 64 x i8> [[LHS_LOAD]], <vscale x 64 x i8> [[RHS_LOAD]], metadata !"ne", <vscale x 64 x i1> shufflevector (<vscale x 64 x i1> insertelement (<vscale x 64 x i1> poison, i1 true, i64 0), <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer), i32 [[TMP19]])
; LMUL8-NEXT:    [[FIRST:%.*]] = call i32 @llvm.vp.cttz.elts.i32.nxv64i1(<vscale x 64 x i1> [[MISMATCH_CMP]], i1 false, <vscale x 64 x i1> shufflevector (<vscale x 64 x i1> insertelement (<vscale x 64 x i1> poison, i1 true, i64 0), <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer), i32 [[TMP19]])
; LMUL8-NEXT:    [[TMP22:%.*]] = icmp ne i32 [[FIRST]], [[TMP19]]
; LMUL8-NEXT:    br i1 [[TMP22]], label [[MISMATCH_VECTOR_LOOP_FOUND:%.*]], label [[MISMATCH_VECTOR_LOOP_INC]]
; LMUL8:       mismatch_vec_loop_inc:
; LMUL8-NEXT:    [[TMP23:%.*]] = zext i32 [[TMP19]] to i64
; LMUL8-NEXT:    [[TMP24]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX]], [[TMP23]]
; LMUL8-NEXT:    [[TMP25:%.*]] = icmp ne i64 [[TMP24]], [[TMP2]]
; LMUL8-NEXT:    br i1 [[TMP25]], label [[MISMATCH_VECTOR_LOOP]], label [[MISMATCH_END:%.*]]
; LMUL8:       mismatch_vec_loop_found:
; LMUL8-NEXT:    [[FIRST2:%.*]] = phi i32 [ [[FIRST]], [[MISMATCH_VECTOR_LOOP]] ]
; LMUL8-NEXT:    [[MISMATCH_VECTOR_INDEX3:%.*]] = phi i64 [ [[MISMATCH_VECTOR_INDEX]], [[MISMATCH_VECTOR_LOOP]] ]
; LMUL8-NEXT:    [[TMP26:%.*]] = zext i32 [[FIRST2]] to i64
; LMUL8-NEXT:    [[TMP27:%.*]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX3]], [[TMP26]]
; LMUL8-NEXT:    [[TMP28:%.*]] = trunc i64 [[TMP27]] to i32
; LMUL8-NEXT:    br label [[MISMATCH_END]]
; LMUL8:       mismatch_loop_pre:
; LMUL8-NEXT:    br label [[MISMATCH_LOOP:%.*]]
; LMUL8:       mismatch_loop:
; LMUL8-NEXT:    [[MISMATCH_INDEX:%.*]] = phi i32 [ [[TMP0]], [[MISMATCH_LOOP_PRE]] ], [ [[TMP35:%.*]], [[MISMATCH_LOOP_INC:%.*]] ]
; LMUL8-NEXT:    [[TMP29:%.*]] = zext i32 [[MISMATCH_INDEX]] to i64
; LMUL8-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP29]]
; LMUL8-NEXT:    [[TMP31:%.*]] = load i8, ptr [[TMP30]], align 1
; LMUL8-NEXT:    [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP29]]
; LMUL8-NEXT:    [[TMP33:%.*]] = load i8, ptr [[TMP32]], align 1
; LMUL8-NEXT:    [[TMP34:%.*]] = icmp eq i8 [[TMP31]], [[TMP33]]
; LMUL8-NEXT:    br i1 [[TMP34]], label [[MISMATCH_LOOP_INC]], label [[MISMATCH_END]]
; LMUL8:       mismatch_loop_inc:
; LMUL8-NEXT:    [[TMP35]] = add i32 [[MISMATCH_INDEX]], 1
; LMUL8-NEXT:    [[TMP36:%.*]] = icmp eq i32 [[TMP35]], [[N]]
; LMUL8-NEXT:    br i1 [[TMP36]], label [[MISMATCH_END]], label [[MISMATCH_LOOP]]
; LMUL8:       mismatch_end:
; LMUL8-NEXT:    [[MISMATCH_RESULT:%.*]] = phi i32 [ [[N]], [[MISMATCH_LOOP_INC]] ], [ [[MISMATCH_INDEX]], [[MISMATCH_LOOP]] ], [ [[N]], [[MISMATCH_VECTOR_LOOP_INC]] ], [ [[TMP28]], [[MISMATCH_VECTOR_LOOP_FOUND]] ]
; LMUL8-NEXT:    br i1 true, label [[BYTE_COMPARE:%.*]], label [[WHILE_COND:%.*]]
; LMUL8:       while.cond:
; LMUL8-NEXT:    [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[MISMATCH_END]] ], [ [[MISMATCH_RESULT]], [[WHILE_BODY:%.*]] ]
; LMUL8-NEXT:    [[INC:%.*]] = add i32 [[LEN_ADDR]], 1
; LMUL8-NEXT:    [[CMP_NOT:%.*]] = icmp eq i32 [[MISMATCH_RESULT]], [[N]]
; LMUL8-NEXT:    br i1 [[CMP_NOT]], label [[WHILE_END_LOOPEXIT:%.*]], label [[WHILE_BODY]]
; LMUL8:       while.body:
; LMUL8-NEXT:    [[IDXPROM:%.*]] = zext i32 [[MISMATCH_RESULT]] to i64
; LMUL8-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]]
; LMUL8-NEXT:    [[TMP37:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
; LMUL8-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]]
; LMUL8-NEXT:    [[TMP38:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
; LMUL8-NEXT:    [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP37]], [[TMP38]]
; LMUL8-NEXT:    br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END_LOOPEXIT]]
; LMUL8:       byte.compare:
; LMUL8-NEXT:    br label [[WHILE_END_LOOPEXIT]]
; LMUL8:       while.end.loopexit:
; LMUL8-NEXT:    [[INC_LCSSA1:%.*]] = phi i32 [ [[MISMATCH_RESULT]], [[WHILE_COND]] ], [ [[MISMATCH_RESULT]], [[WHILE_BODY]] ], [ [[MISMATCH_RESULT]], [[BYTE_COMPARE]] ]
; LMUL8-NEXT:    br label [[WHILE_END]]
; LMUL8:       while.end:
; LMUL8-NEXT:    [[INC_LCSSA:%.*]] = phi i32 [ [[X]], [[ENTRY:%.*]] ], [ [[INC_LCSSA1]], [[WHILE_END_LOOPEXIT]] ]
; LMUL8-NEXT:    ret i32 [[INC_LCSSA]]
;
; LOOP-DEL-LABEL: define i32 @compare_bytes_extra_cmp(
; LOOP-DEL-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]], i32 [[X:%.*]]) #[[ATTR0]] {
; LOOP-DEL-NEXT:  entry:
; LOOP-DEL-NEXT:    [[CMP_X:%.*]] = icmp ult i32 [[N]], [[X]]
; LOOP-DEL-NEXT:    br i1 [[CMP_X]], label [[PH:%.*]], label [[WHILE_END:%.*]]
; LOOP-DEL:       ph:
; LOOP-DEL-NEXT:    [[TMP0:%.*]] = add i32 [[LEN]], 1
; LOOP-DEL-NEXT:    [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
; LOOP-DEL-NEXT:    [[TMP2:%.*]] = zext i32 [[N]] to i64
; LOOP-DEL-NEXT:    [[TMP3:%.*]] = icmp ule i32 [[TMP0]], [[N]]
; LOOP-DEL-NEXT:    br i1 [[TMP3]], label [[MISMATCH_MEM_CHECK:%.*]], label [[MISMATCH_LOOP_PRE:%.*]], !prof [[PROF0]]
; LOOP-DEL:       mismatch_mem_check:
; LOOP-DEL-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP1]]
; LOOP-DEL-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP1]]
; LOOP-DEL-NEXT:    [[TMP8:%.*]] = ptrtoint ptr [[TMP5]] to i64
; LOOP-DEL-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[TMP4]] to i64
; LOOP-DEL-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]]
; LOOP-DEL-NEXT:    [[TMP13:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]]
; LOOP-DEL-NEXT:    [[TMP11:%.*]] = ptrtoint ptr [[TMP10]] to i64
; LOOP-DEL-NEXT:    [[TMP14:%.*]] = ptrtoint ptr [[TMP13]] to i64
; LOOP-DEL-NEXT:    [[TMP6:%.*]] = lshr i64 [[TMP7]], 12
; LOOP-DEL-NEXT:    [[TMP9:%.*]] = lshr i64 [[TMP11]], 12
; LOOP-DEL-NEXT:    [[TMP12:%.*]] = lshr i64 [[TMP8]], 12
; LOOP-DEL-NEXT:    [[TMP15:%.*]] = lshr i64 [[TMP14]], 12
; LOOP-DEL-NEXT:    [[TMP16:%.*]] = icmp ne i64 [[TMP6]], [[TMP9]]
; LOOP-DEL-NEXT:    [[TMP17:%.*]] = icmp ne i64 [[TMP12]], [[TMP15]]
; LOOP-DEL-NEXT:    [[TMP18:%.*]] = or i1 [[TMP16]], [[TMP17]]
; LOOP-DEL-NEXT:    br i1 [[TMP18]], label [[MISMATCH_LOOP_PRE]], label [[MISMATCH_VECTOR_LOOP:%.*]], !prof [[PROF1]]
; LOOP-DEL:       mismatch_vec_loop:
; LOOP-DEL-NEXT:    [[MISMATCH_VECTOR_INDEX:%.*]] = phi i64 [ [[TMP24:%.*]], [[MISMATCH_VECTOR_LOOP_INC:%.*]] ], [ [[TMP1]], [[MISMATCH_MEM_CHECK]] ]
; LOOP-DEL-NEXT:    [[AVL:%.*]] = sub nuw nsw i64 [[TMP2]], [[MISMATCH_VECTOR_INDEX]]
; LOOP-DEL-NEXT:    [[TMP19:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 16, i1 true)
; LOOP-DEL-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[MISMATCH_VECTOR_INDEX]]
; LOOP-DEL-NEXT:    [[LHS_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr [[TMP20]], <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
; LOOP-DEL-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[MISMATCH_VECTOR_INDEX]]
; LOOP-DEL-NEXT:    [[RHS_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr [[TMP21]], <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
; LOOP-DEL-NEXT:    [[MISMATCH_CMP:%.*]] = call <vscale x 16 x i1> @llvm.vp.icmp.nxv16i8(<vscale x 16 x i8> [[LHS_LOAD]], <vscale x 16 x i8> [[RHS_LOAD]], metadata !"ne", <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
; LOOP-DEL-NEXT:    [[FIRST:%.*]] = call i32 @llvm.vp.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> [[MISMATCH_CMP]], i1 false, <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP19]])
; LOOP-DEL-NEXT:    [[TMP22:%.*]] = icmp ne i32 [[FIRST]], [[TMP19]]
; LOOP-DEL-NEXT:    br i1 [[TMP22]], label [[MISMATCH_VECTOR_LOOP_FOUND:%.*]], label [[MISMATCH_VECTOR_LOOP_INC]]
; LOOP-DEL:       mismatch_vec_loop_inc:
; LOOP-DEL-NEXT:    [[TMP23:%.*]] = zext i32 [[TMP19]] to i64
; LOOP-DEL-NEXT:    [[TMP24]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX]], [[TMP23]]
; LOOP-DEL-NEXT:    [[TMP25:%.*]] = icmp ne i64 [[TMP24]], [[TMP2]]
; LOOP-DEL-NEXT:    br i1 [[TMP25]], label [[MISMATCH_VECTOR_LOOP]], label [[WHILE_END]]
; LOOP-DEL:       mismatch_vec_loop_found:
; LOOP-DEL-NEXT:    [[FIRST2:%.*]] = phi i32 [ [[FIRST]], [[MISMATCH_VECTOR_LOOP]] ]
; LOOP-DEL-NEXT:    [[MISMATCH_VECTOR_INDEX3:%.*]] = phi i64 [ [[MISMATCH_VECTOR_INDEX]], [[MISMATCH_VECTOR_LOOP]] ]
; LOOP-DEL-NEXT:    [[TMP26:%.*]] = zext i32 [[FIRST2]] to i64
; LOOP-DEL-NEXT:    [[TMP27:%.*]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX3]], [[TMP26]]
; LOOP-DEL-NEXT:    [[TMP28:%.*]] = trunc i64 [[TMP27]] to i32
; LOOP-DEL-NEXT:    br label [[WHILE_END]]
; LOOP-DEL:       mismatch_loop_pre:
; LOOP-DEL-NEXT:    br label [[MISMATCH_LOOP:%.*]]
; LOOP-DEL:       mismatch_loop:
; LOOP-DEL-NEXT:    [[MISMATCH_INDEX:%.*]] = phi i32 [ [[TMP0]], [[MISMATCH_LOOP_PRE]] ], [ [[TMP35:%.*]], [[MISMATCH_LOOP_INC:%.*]] ]
; LOOP-DEL-NEXT:    [[TMP29:%.*]] = zext i32 [[MISMATCH_INDEX]] to i64
; LOOP-DEL-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP29]]
; LOOP-DEL-NEXT:    [[TMP31:%.*]] = load i8, ptr [[TMP30]], align 1
; LOOP-DEL-NEXT:    [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP29]]
; LOOP-DEL-NEXT:    [[TMP33:%.*]] = load i8, ptr [[TMP32]], align 1
; LOOP-DEL-NEXT:    [[TMP34:%.*]] = icmp eq i8 [[TMP31]], [[TMP33]]
; LOOP-DEL-NEXT:    br i1 [[TMP34]], label [[MISMATCH_LOOP_INC]], label [[WHILE_END]]
; LOOP-DEL:       mismatch_loop_inc:
; LOOP-DEL-NEXT:    [[TMP35]] = add i32 [[MISMATCH_INDEX]], 1
; LOOP-DEL-NEXT:    [[TMP36:%.*]] = icmp eq i32 [[TMP35]], [[N]]
; LOOP-DEL-NEXT:    br i1 [[TMP36]], label [[WHILE_END]], label [[MISMATCH_LOOP]]
; LOOP-DEL:       while.end:
; LOOP-DEL-NEXT:    [[INC_LCSSA:%.*]] = phi i32 [ [[X]], [[ENTRY:%.*]] ], [ [[N]], [[MISMATCH_LOOP_INC]] ], [ [[MISMATCH_INDEX]], [[MISMATCH_LOOP]] ], [ [[N]], [[MISMATCH_VECTOR_LOOP_INC]] ], [ [[TMP28]], [[MISMATCH_VECTOR_LOOP_FOUND]] ]
; LOOP-DEL-NEXT:    ret i32 [[INC_LCSSA]]
;
; MASKED-LABEL: define i32 @compare_bytes_extra_cmp(
; MASKED-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]], i32 [[X:%.*]]) #[[ATTR0]] {
; MASKED-NEXT:  entry:
; MASKED-NEXT:    [[CMP_X:%.*]] = icmp ult i32 [[N]], [[X]]
; MASKED-NEXT:    br i1 [[CMP_X]], label [[PH:%.*]], label [[WHILE_END:%.*]]
; MASKED:       ph:
; MASKED-NEXT:    [[TMP0:%.*]] = add i32 [[LEN]], 1
; MASKED-NEXT:    br label [[MISMATCH_MIN_IT_CHECK:%.*]]
; MASKED:       mismatch_min_it_check:
; MASKED-NEXT:    [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
; MASKED-NEXT:    [[TMP2:%.*]] = zext i32 [[N]] to i64
; MASKED-NEXT:    [[TMP3:%.*]] = icmp ule i32 [[TMP0]], [[N]]
; MASKED-NEXT:    br i1 [[TMP3]], label [[MISMATCH_MEM_CHECK:%.*]], label [[MISMATCH_LOOP_PRE:%.*]], !prof [[PROF0]]
; MASKED:       mismatch_mem_check:
; MASKED-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP1]]
; MASKED-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP1]]
; MASKED-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[TMP5]] to i64
; MASKED-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[TMP4]] to i64
; MASKED-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]]
; MASKED-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]]
; MASKED-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[TMP8]] to i64
; MASKED-NEXT:    [[TMP11:%.*]] = ptrtoint ptr [[TMP9]] to i64
; MASKED-NEXT:    [[TMP12:%.*]] = lshr i64 [[TMP7]], 12
; MASKED-NEXT:    [[TMP13:%.*]] = lshr i64 [[TMP10]], 12
; MASKED-NEXT:    [[TMP14:%.*]] = lshr i64 [[TMP6]], 12
; MASKED-NEXT:    [[TMP15:%.*]] = lshr i64 [[TMP11]], 12
; MASKED-NEXT:    [[TMP16:%.*]] = icmp ne i64 [[TMP12]], [[TMP13]]
; MASKED-NEXT:    [[TMP17:%.*]] = icmp ne i64 [[TMP14]], [[TMP15]]
; MASKED-NEXT:    [[TMP18:%.*]] = or i1 [[TMP16]], [[TMP17]]
; MASKED-NEXT:    br i1 [[TMP18]], label [[MISMATCH_LOOP_PRE]], label [[MISMATCH_VEC_LOOP_PREHEADER:%.*]], !prof [[PROF1]]
; MASKED:       mismatch_vec_loop_preheader:
; MASKED-NEXT:    [[TMP19:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP1]], i64 [[TMP2]])
; MASKED-NEXT:    [[TMP20:%.*]] = call i64 @llvm.vscale.i64()
; MASKED-NEXT:    [[TMP21:%.*]] = mul nuw nsw i64 [[TMP20]], 16
; MASKED-NEXT:    br label [[MISMATCH_VEC_LOOP:%.*]]
; MASKED:       mismatch_vec_loop:
; MASKED-NEXT:    [[MISMATCH_VEC_LOOP_PRED:%.*]] = phi <vscale x 16 x i1> [ [[TMP19]], [[MISMATCH_VEC_LOOP_PREHEADER]] ], [ [[TMP30:%.*]], [[MISMATCH_VEC_LOOP_INC:%.*]] ]
; MASKED-NEXT:    [[MISMATCH_VEC_INDEX:%.*]] = phi i64 [ [[TMP1]], [[MISMATCH_VEC_LOOP_PREHEADER]] ], [ [[TMP29:%.*]], [[MISMATCH_VEC_LOOP_INC]] ]
; MASKED-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[MISMATCH_VEC_INDEX]]
; MASKED-NEXT:    [[TMP23:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP22]], i32 1, <vscale x 16 x i1> [[MISMATCH_VEC_LOOP_PRED]], <vscale x 16 x i8> zeroinitializer)
; MASKED-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[MISMATCH_VEC_INDEX]]
; MASKED-NEXT:    [[TMP25:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP24]], i32 1, <vscale x 16 x i1> [[MISMATCH_VEC_LOOP_PRED]], <vscale x 16 x i8> zeroinitializer)
; MASKED-NEXT:    [[TMP26:%.*]] = icmp ne <vscale x 16 x i8> [[TMP23]], [[TMP25]]
; MASKED-NEXT:    [[TMP27:%.*]] = select <vscale x 16 x i1> [[MISMATCH_VEC_LOOP_PRED]], <vscale x 16 x i1> [[TMP26]], <vscale x 16 x i1> zeroinitializer
; MASKED-NEXT:    [[TMP28:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1(<vscale x 16 x i1> [[TMP27]])
; MASKED-NEXT:    br i1 [[TMP28]], label [[MISMATCH_VEC_LOOP_FOUND:%.*]], label [[MISMATCH_VEC_LOOP_INC]]
; MASKED:       mismatch_vec_loop_inc:
; MASKED-NEXT:    [[TMP29]] = add nuw nsw i64 [[MISMATCH_VEC_INDEX]], [[TMP21]]
; MASKED-NEXT:    [[TMP30]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP29]], i64 [[TMP2]])
; MASKED-NEXT:    [[TMP31:%.*]] = extractelement <vscale x 16 x i1> [[TMP30]], i64 0
; MASKED-NEXT:    br i1 [[TMP31]], label [[MISMATCH_VEC_LOOP]], label [[MISMATCH_END:%.*]]
; MASKED:       mismatch_vec_loop_found:
; MASKED-NEXT:    [[MISMATCH_VEC_FOUND_PRED:%.*]] = phi <vscale x 16 x i1> [ [[TMP27]], [[MISMATCH_VEC_LOOP]] ]
; MASKED-NEXT:    [[MISMATCH_VEC_LAST_LOOP_PRED:%.*]] = phi <vscale x 16 x i1> [ [[MISMATCH_VEC_LOOP_PRED]], [[MISMATCH_VEC_LOOP]] ]
; MASKED-NEXT:    [[MISMATCH_VEC_FOUND_INDEX:%.*]] = phi i64 [ [[MISMATCH_VEC_INDEX]], [[MISMATCH_VEC_LOOP]] ]
; MASKED-NEXT:    [[TMP32:%.*]] = and <vscale x 16 x i1> [[MISMATCH_VEC_LAST_LOOP_PRED]], [[MISMATCH_VEC_FOUND_PRED]]
; MASKED-NEXT:    [[TMP33:%.*]] = call i32 @llvm.experimental.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> [[TMP32]], i1 true)
; MASKED-NEXT:    [[TMP34:%.*]] = zext i32 [[TMP33]] to i64
; MASKED-NEXT:    [[TMP35:%.*]] = add nuw nsw i64 [[MISMATCH_VEC_FOUND_INDEX]], [[TMP34]]
; MASKED-NEXT:    [[TMP36:%.*]] = trunc i64 [[TMP35]] to i32
; MASKED-NEXT:    br label [[MISMATCH_END]]
; MASKED:       mismatch_loop_pre:
; MASKED-NEXT:    br label [[MISMATCH_LOOP:%.*]]
; MASKED:       mismatch_loop:
; MASKED-NEXT:    [[MISMATCH_INDEX:%.*]] = phi i32 [ [[TMP0]], [[MISMATCH_LOOP_PRE]] ], [ [[TMP43:%.*]], [[MISMATCH_LOOP_INC:%.*]] ]
; MASKED-NEXT:    [[TMP37:%.*]] = zext i32 [[MISMATCH_INDEX]] to i64
; MASKED-NEXT:    [[TMP38:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP37]]
; MASKED-NEXT:    [[TMP39:%.*]] = load i8, ptr [[TMP38]], align 1
; MASKED-NEXT:    [[TMP40:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP37]]
; MASKED-NEXT:    [[TMP41:%.*]] = load i8, ptr [[TMP40]], align 1
; MASKED-NEXT:    [[TMP42:%.*]] = icmp eq i8 [[TMP39]], [[TMP41]]
; MASKED-NEXT:    br i1 [[TMP42]], label [[MISMATCH_LOOP_INC]], label [[MISMATCH_END]]
; MASKED:       mismatch_loop_inc:
; MASKED-NEXT:    [[TMP43]] = add i32 [[MISMATCH_INDEX]], 1
; MASKED-NEXT:    [[TMP44:%.*]] = icmp eq i32 [[TMP43]], [[N]]
; MASKED-NEXT:    br i1 [[TMP44]], label [[MISMATCH_END]], label [[MISMATCH_LOOP]]
; MASKED:       mismatch_end:
; MASKED-NEXT:    [[MISMATCH_RESULT:%.*]] = phi i32 [ [[N]], [[MISMATCH_LOOP_INC]] ], [ [[MISMATCH_INDEX]], [[MISMATCH_LOOP]] ], [ [[N]], [[MISMATCH_VEC_LOOP_INC]] ], [ [[TMP36]], [[MISMATCH_VEC_LOOP_FOUND]] ]
; MASKED-NEXT:    br i1 true, label [[BYTE_COMPARE:%.*]], label [[WHILE_COND:%.*]]
; MASKED:       while.cond:
; MASKED-NEXT:    [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[MISMATCH_END]] ], [ [[MISMATCH_RESULT]], [[WHILE_BODY:%.*]] ]
; MASKED-NEXT:    [[INC:%.*]] = add i32 [[LEN_ADDR]], 1
; MASKED-NEXT:    [[CMP_NOT:%.*]] = icmp eq i32 [[MISMATCH_RESULT]], [[N]]
; MASKED-NEXT:    br i1 [[CMP_NOT]], label [[WHILE_END_LOOPEXIT:%.*]], label [[WHILE_BODY]]
; MASKED:       while.body:
; MASKED-NEXT:    [[IDXPROM:%.*]] = zext i32 [[MISMATCH_RESULT]] to i64
; MASKED-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]]
; MASKED-NEXT:    [[TMP45:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
; MASKED-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]]
; MASKED-NEXT:    [[TMP46:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
; MASKED-NEXT:    [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP45]], [[TMP46]]
; MASKED-NEXT:    br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END_LOOPEXIT]]
; MASKED:       byte.compare:
; MASKED-NEXT:    br label [[WHILE_END_LOOPEXIT]]
; MASKED:       while.end.loopexit:
; MASKED-NEXT:    [[INC_LCSSA1:%.*]] = phi i32 [ [[MISMATCH_RESULT]], [[WHILE_COND]] ], [ [[MISMATCH_RESULT]], [[WHILE_BODY]] ], [ [[MISMATCH_RESULT]], [[BYTE_COMPARE]] ]
; MASKED-NEXT:    br label [[WHILE_END]]
; MASKED:       while.end:
; MASKED-NEXT:    [[INC_LCSSA:%.*]] = phi i32 [ [[X]], [[ENTRY:%.*]] ], [ [[INC_LCSSA1]], [[WHILE_END_LOOPEXIT]] ]
; MASKED-NEXT:    ret i32 [[INC_LCSSA]]
;
; NO-TRANSFORM-LABEL: define i32 @compare_bytes_extra_cmp(
; NO-TRANSFORM-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]], i32 [[X:%.*]]) {
; NO-TRANSFORM-NEXT:  entry:
; NO-TRANSFORM-NEXT:    [[CMP_X:%.*]] = icmp ult i32 [[N]], [[X]]
; NO-TRANSFORM-NEXT:    br i1 [[CMP_X]], label [[PH:%.*]], label [[WHILE_END:%.*]]
; NO-TRANSFORM:       ph:
; NO-TRANSFORM-NEXT:    br label [[WHILE_COND:%.*]]
; NO-TRANSFORM:       while.cond:
; NO-TRANSFORM-NEXT:    [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[PH]] ], [ [[INC:%.*]], [[WHILE_BODY:%.*]] ]
; NO-TRANSFORM-NEXT:    [[INC]] = add i32 [[LEN_ADDR]], 1
; NO-TRANSFORM-NEXT:    [[CMP_NOT:%.*]] = icmp eq i32 [[INC]], [[N]]
; NO-TRANSFORM-NEXT:    br i1 [[CMP_NOT]], label [[WHILE_END]], label [[WHILE_BODY]]
; NO-TRANSFORM:       while.body:
; NO-TRANSFORM-NEXT:    [[IDXPROM:%.*]] = zext i32 [[INC]] to i64
; NO-TRANSFORM-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]]
; NO-TRANSFORM-NEXT:    [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
; NO-TRANSFORM-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]]
; NO-TRANSFORM-NEXT:    [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
; NO-TRANSFORM-NEXT:    [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP0]], [[TMP1]]
; NO-TRANSFORM-NEXT:    br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]]
; NO-TRANSFORM:       while.end:
; NO-TRANSFORM-NEXT:    [[INC_LCSSA:%.*]] = phi i32 [ [[INC]], [[WHILE_BODY]] ], [ [[INC]], [[WHILE_COND]] ], [ [[X]], [[ENTRY:%.*]] ]
; NO-TRANSFORM-NEXT:    ret i32 [[INC_LCSSA]]
entry:
  %cmp.x = icmp ult i32 %n, %x
  br i1 %cmp.x, label %ph, label %while.end

ph:
  br label %while.cond

while.cond:
  %len.addr = phi i32 [ %len, %ph ], [ %inc, %while.body ]
  %inc = add i32 %len.addr, 1
  %cmp.not = icmp eq i32 %inc, %n
  br i1 %cmp.not, label %while.end, label %while.body

while.body:
  %idxprom = zext i32 %inc to i64
  %arrayidx = getelementptr inbounds i8, ptr %a, i64 %idxprom
  %0 = load i8, ptr %arrayidx
  %arrayidx2 = getelementptr inbounds i8, ptr %b, i64 %idxprom
  %1 = load i8, ptr %arrayidx2
  %cmp.not2 = icmp eq i8 %0, %1
  br i1 %cmp.not2, label %while.cond, label %while.end

while.end:
  %inc.lcssa = phi i32 [ %inc, %while.body ], [ %inc, %while.cond ], [ %x, %entry ]
  ret i32 %inc.lcssa
}

define void @compare_bytes_cleanup_block(ptr %src1, ptr %src2) {
; CHECK-LABEL: define void @compare_bytes_cleanup_block(
; CHECK-SAME: ptr [[SRC1:%.*]], ptr [[SRC2:%.*]]) #[[ATTR0]] {
; CHECK-NEXT:  entry:
; CHECK-NEXT:    br label [[MISMATCH_MIN_IT_CHECK:%.*]]
; CHECK:       mismatch_min_it_check:
; CHECK-NEXT:    br i1 false, label [[MISMATCH_MEM_CHECK:%.*]], label [[MISMATCH_LOOP_PRE:%.*]], !prof [[PROF0]]
; CHECK:       mismatch_mem_check:
; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr i8, ptr [[SRC1]], i64 1
; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr [[SRC2]], i64 1
; CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[TMP1]] to i64
; CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[TMP0]] to i64
; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[SRC1]], i64 0
; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[SRC2]], i64 0
; CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[TMP3]] to i64
; CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[TMP6]] to i64
; CHECK-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP10]], 12
; CHECK-NEXT:    [[TMP5:%.*]] = lshr i64 [[TMP4]], 12
; CHECK-NEXT:    [[TMP8:%.*]] = lshr i64 [[TMP9]], 12
; CHECK-NEXT:    [[TMP11:%.*]] = lshr i64 [[TMP7]], 12
; CHECK-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP2]], [[TMP5]]
; CHECK-NEXT:    [[TMP13:%.*]] = icmp ne i64 [[TMP8]], [[TMP11]]
; CHECK-NEXT:    [[TMP14:%.*]] = or i1 [[TMP12]], [[TMP13]]
; CHECK-NEXT:    br i1 [[TMP14]], label [[MISMATCH_LOOP_PRE]], label [[MISMATCH_VECTOR_LOOP_PREHEADER:%.*]], !prof [[PROF1]]
; CHECK:       mismatch_vec_loop_preheader:
; CHECK-NEXT:    br label [[MISMATCH_VECTOR_LOOP:%.*]]
; CHECK:       mismatch_vec_loop:
; CHECK-NEXT:    [[MISMATCH_VECTOR_INDEX:%.*]] = phi i64 [ 1, [[MISMATCH_VECTOR_LOOP_PREHEADER]] ], [ [[TMP20:%.*]], [[MISMATCH_VECTOR_LOOP_INC:%.*]] ]
; CHECK-NEXT:    [[AVL:%.*]] = sub nuw nsw i64 0, [[MISMATCH_VECTOR_INDEX]]
; CHECK-NEXT:    [[TMP15:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 16, i1 true)
; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[SRC1]], i64 [[MISMATCH_VECTOR_INDEX]]
; CHECK-NEXT:    [[LHS_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr [[TMP16]], <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP15]])
; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr i8, ptr [[SRC2]], i64 [[MISMATCH_VECTOR_INDEX]]
; CHECK-NEXT:    [[RHS_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr [[TMP17]], <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP15]])
; CHECK-NEXT:    [[MISMATCH_CMP:%.*]] = call <vscale x 16 x i1> @llvm.vp.icmp.nxv16i8(<vscale x 16 x i8> [[LHS_LOAD]], <vscale x 16 x i8> [[RHS_LOAD]], metadata !"ne", <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP15]])
; CHECK-NEXT:    [[FIRST:%.*]] = call i32 @llvm.vp.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> [[MISMATCH_CMP]], i1 false, <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP15]])
; CHECK-NEXT:    [[TMP18:%.*]] = icmp ne i32 [[FIRST]], [[TMP15]]
; CHECK-NEXT:    br i1 [[TMP18]], label [[MISMATCH_VECTOR_LOOP_FOUND:%.*]], label [[MISMATCH_VECTOR_LOOP_INC]]
; CHECK:       mismatch_vec_loop_inc:
; CHECK-NEXT:    [[TMP19:%.*]] = zext i32 [[TMP15]] to i64
; CHECK-NEXT:    [[TMP20]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX]], [[TMP19]]
; CHECK-NEXT:    [[TMP21:%.*]] = icmp ne i64 [[TMP20]], 0
; CHECK-NEXT:    br i1 [[TMP21]], label [[MISMATCH_VECTOR_LOOP]], label [[MISMATCH_END:%.*]]
; CHECK:       mismatch_vec_loop_found:
; CHECK-NEXT:    [[FIRST1:%.*]] = phi i32 [ [[FIRST]], [[MISMATCH_VECTOR_LOOP]] ]
; CHECK-NEXT:    [[MISMATCH_VECTOR_INDEX2:%.*]] = phi i64 [ [[MISMATCH_VECTOR_INDEX]], [[MISMATCH_VECTOR_LOOP]] ]
; CHECK-NEXT:    [[TMP22:%.*]] = zext i32 [[FIRST1]] to i64
; CHECK-NEXT:    [[TMP23:%.*]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX2]], [[TMP22]]
; CHECK-NEXT:    [[TMP24:%.*]] = trunc i64 [[TMP23]] to i32
; CHECK-NEXT:    br label [[MISMATCH_END]]
; CHECK:       mismatch_loop_pre:
; CHECK-NEXT:    br label [[MISMATCH_LOOP:%.*]]
; CHECK:       mismatch_loop:
; CHECK-NEXT:    [[MISMATCH_INDEX:%.*]] = phi i32 [ 1, [[MISMATCH_LOOP_PRE]] ], [ [[TMP31:%.*]], [[MISMATCH_LOOP_INC:%.*]] ]
; CHECK-NEXT:    [[TMP25:%.*]] = zext i32 [[MISMATCH_INDEX]] to i64
; CHECK-NEXT:    [[TMP26:%.*]] = getelementptr i8, ptr [[SRC1]], i64 [[TMP25]]
; CHECK-NEXT:    [[TMP27:%.*]] = load i8, ptr [[TMP26]], align 1
; CHECK-NEXT:    [[TMP28:%.*]] = getelementptr i8, ptr [[SRC2]], i64 [[TMP25]]
; CHECK-NEXT:    [[TMP29:%.*]] = load i8, ptr [[TMP28]], align 1
; CHECK-NEXT:    [[TMP30:%.*]] = icmp eq i8 [[TMP27]], [[TMP29]]
; CHECK-NEXT:    br i1 [[TMP30]], label [[MISMATCH_LOOP_INC]], label [[MISMATCH_END]]
; CHECK:       mismatch_loop_inc:
; CHECK-NEXT:    [[TMP31]] = add i32 [[MISMATCH_INDEX]], 1
; CHECK-NEXT:    [[TMP32:%.*]] = icmp eq i32 [[TMP31]], 0
; CHECK-NEXT:    br i1 [[TMP32]], label [[MISMATCH_END]], label [[MISMATCH_LOOP]]
; CHECK:       mismatch_end:
; CHECK-NEXT:    [[MISMATCH_RESULT:%.*]] = phi i32 [ 0, [[MISMATCH_LOOP_INC]] ], [ [[MISMATCH_INDEX]], [[MISMATCH_LOOP]] ], [ 0, [[MISMATCH_VECTOR_LOOP_INC]] ], [ [[TMP24]], [[MISMATCH_VECTOR_LOOP_FOUND]] ]
; CHECK-NEXT:    br i1 true, label [[BYTE_COMPARE:%.*]], label [[WHILE_COND:%.*]]
; CHECK:       while.cond:
; CHECK-NEXT:    [[LEN:%.*]] = phi i32 [ [[MISMATCH_RESULT]], [[WHILE_BODY:%.*]] ], [ 0, [[MISMATCH_END]] ]
; CHECK-NEXT:    [[INC:%.*]] = add i32 [[LEN]], 1
; CHECK-NEXT:    [[CMP_NOT:%.*]] = icmp eq i32 [[MISMATCH_RESULT]], 0
; CHECK-NEXT:    br i1 [[CMP_NOT]], label [[CLEANUP_THREAD:%.*]], label [[WHILE_BODY]]
; CHECK:       while.body:
; CHECK-NEXT:    [[IDXPROM:%.*]] = zext i32 [[MISMATCH_RESULT]] to i64
; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr i8, ptr [[SRC1]], i64 [[IDXPROM]]
; CHECK-NEXT:    [[TMP33:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr i8, ptr [[SRC2]], i64 [[IDXPROM]]
; CHECK-NEXT:    [[TMP34:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
; CHECK-NEXT:    [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP33]], [[TMP34]]
; CHECK-NEXT:    br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[IF_END:%.*]]
; CHECK:       byte.compare:
; CHECK-NEXT:    [[TMP35:%.*]] = icmp eq i32 [[MISMATCH_RESULT]], 0
; CHECK-NEXT:    br i1 [[TMP35]], label [[CLEANUP_THREAD]], label [[IF_END]]
; CHECK:       cleanup.thread:
; CHECK-NEXT:    ret void
; CHECK:       if.end:
; CHECK-NEXT:    [[RES:%.*]] = phi i32 [ [[MISMATCH_RESULT]], [[WHILE_BODY]] ], [ [[MISMATCH_RESULT]], [[BYTE_COMPARE]] ]
; CHECK-NEXT:    ret void
;
; LMUL8-LABEL: define void @compare_bytes_cleanup_block(
; LMUL8-SAME: ptr [[SRC1:%.*]], ptr [[SRC2:%.*]]) #[[ATTR0]] {
; LMUL8-NEXT:  entry:
; LMUL8-NEXT:    br label [[MISMATCH_MIN_IT_CHECK:%.*]]
; LMUL8:       mismatch_min_it_check:
; LMUL8-NEXT:    br i1 false, label [[MISMATCH_MEM_CHECK:%.*]], label [[MISMATCH_LOOP_PRE:%.*]], !prof [[PROF0]]
; LMUL8:       mismatch_mem_check:
; LMUL8-NEXT:    [[TMP0:%.*]] = getelementptr i8, ptr [[SRC1]], i64 1
; LMUL8-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr [[SRC2]], i64 1
; LMUL8-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[TMP1]] to i64
; LMUL8-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[TMP0]] to i64
; LMUL8-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[SRC1]], i64 0
; LMUL8-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[SRC2]], i64 0
; LMUL8-NEXT:    [[TMP4:%.*]] = ptrtoint ptr [[TMP3]] to i64
; LMUL8-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[TMP6]] to i64
; LMUL8-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP10]], 12
; LMUL8-NEXT:    [[TMP5:%.*]] = lshr i64 [[TMP4]], 12
; LMUL8-NEXT:    [[TMP8:%.*]] = lshr i64 [[TMP9]], 12
; LMUL8-NEXT:    [[TMP11:%.*]] = lshr i64 [[TMP7]], 12
; LMUL8-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP2]], [[TMP5]]
; LMUL8-NEXT:    [[TMP13:%.*]] = icmp ne i64 [[TMP8]], [[TMP11]]
; LMUL8-NEXT:    [[TMP14:%.*]] = or i1 [[TMP12]], [[TMP13]]
; LMUL8-NEXT:    br i1 [[TMP14]], label [[MISMATCH_LOOP_PRE]], label [[MISMATCH_VECTOR_LOOP_PREHEADER:%.*]], !prof [[PROF1]]
; LMUL8:       mismatch_vec_loop_preheader:
; LMUL8-NEXT:    br label [[MISMATCH_VECTOR_LOOP:%.*]]
; LMUL8:       mismatch_vec_loop:
; LMUL8-NEXT:    [[MISMATCH_VECTOR_INDEX:%.*]] = phi i64 [ 1, [[MISMATCH_VECTOR_LOOP_PREHEADER]] ], [ [[TMP20:%.*]], [[MISMATCH_VECTOR_LOOP_INC:%.*]] ]
; LMUL8-NEXT:    [[AVL:%.*]] = sub nuw nsw i64 0, [[MISMATCH_VECTOR_INDEX]]
; LMUL8-NEXT:    [[TMP15:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 64, i1 true)
; LMUL8-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[SRC1]], i64 [[MISMATCH_VECTOR_INDEX]]
; LMUL8-NEXT:    [[LHS_LOAD:%.*]] = call <vscale x 64 x i8> @llvm.vp.load.nxv64i8.p0(ptr [[TMP16]], <vscale x 64 x i1> shufflevector (<vscale x 64 x i1> insertelement (<vscale x 64 x i1> poison, i1 true, i64 0), <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer), i32 [[TMP15]])
; LMUL8-NEXT:    [[TMP17:%.*]] = getelementptr i8, ptr [[SRC2]], i64 [[MISMATCH_VECTOR_INDEX]]
; LMUL8-NEXT:    [[RHS_LOAD:%.*]] = call <vscale x 64 x i8> @llvm.vp.load.nxv64i8.p0(ptr [[TMP17]], <vscale x 64 x i1> shufflevector (<vscale x 64 x i1> insertelement (<vscale x 64 x i1> poison, i1 true, i64 0), <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer), i32 [[TMP15]])
; LMUL8-NEXT:    [[MISMATCH_CMP:%.*]] = call <vscale x 64 x i1> @llvm.vp.icmp.nxv64i8(<vscale x 64 x i8> [[LHS_LOAD]], <vscale x 64 x i8> [[RHS_LOAD]], metadata !"ne", <vscale x 64 x i1> shufflevector (<vscale x 64 x i1> insertelement (<vscale x 64 x i1> poison, i1 true, i64 0), <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer), i32 [[TMP15]])
; LMUL8-NEXT:    [[FIRST:%.*]] = call i32 @llvm.vp.cttz.elts.i32.nxv64i1(<vscale x 64 x i1> [[MISMATCH_CMP]], i1 false, <vscale x 64 x i1> shufflevector (<vscale x 64 x i1> insertelement (<vscale x 64 x i1> poison, i1 true, i64 0), <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer), i32 [[TMP15]])
; LMUL8-NEXT:    [[TMP18:%.*]] = icmp ne i32 [[FIRST]], [[TMP15]]
; LMUL8-NEXT:    br i1 [[TMP18]], label [[MISMATCH_VECTOR_LOOP_FOUND:%.*]], label [[MISMATCH_VECTOR_LOOP_INC]]
; LMUL8:       mismatch_vec_loop_inc:
; LMUL8-NEXT:    [[TMP19:%.*]] = zext i32 [[TMP15]] to i64
; LMUL8-NEXT:    [[TMP20]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX]], [[TMP19]]
; LMUL8-NEXT:    [[TMP21:%.*]] = icmp ne i64 [[TMP20]], 0
; LMUL8-NEXT:    br i1 [[TMP21]], label [[MISMATCH_VECTOR_LOOP]], label [[MISMATCH_END:%.*]]
; LMUL8:       mismatch_vec_loop_found:
; LMUL8-NEXT:    [[FIRST1:%.*]] = phi i32 [ [[FIRST]], [[MISMATCH_VECTOR_LOOP]] ]
; LMUL8-NEXT:    [[MISMATCH_VECTOR_INDEX2:%.*]] = phi i64 [ [[MISMATCH_VECTOR_INDEX]], [[MISMATCH_VECTOR_LOOP]] ]
; LMUL8-NEXT:    [[TMP22:%.*]] = zext i32 [[FIRST1]] to i64
; LMUL8-NEXT:    [[TMP23:%.*]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX2]], [[TMP22]]
; LMUL8-NEXT:    [[TMP24:%.*]] = trunc i64 [[TMP23]] to i32
; LMUL8-NEXT:    br label [[MISMATCH_END]]
; LMUL8:       mismatch_loop_pre:
; LMUL8-NEXT:    br label [[MISMATCH_LOOP:%.*]]
; LMUL8:       mismatch_loop:
; LMUL8-NEXT:    [[MISMATCH_INDEX:%.*]] = phi i32 [ 1, [[MISMATCH_LOOP_PRE]] ], [ [[TMP31:%.*]], [[MISMATCH_LOOP_INC:%.*]] ]
; LMUL8-NEXT:    [[TMP25:%.*]] = zext i32 [[MISMATCH_INDEX]] to i64
; LMUL8-NEXT:    [[TMP26:%.*]] = getelementptr i8, ptr [[SRC1]], i64 [[TMP25]]
; LMUL8-NEXT:    [[TMP27:%.*]] = load i8, ptr [[TMP26]], align 1
; LMUL8-NEXT:    [[TMP28:%.*]] = getelementptr i8, ptr [[SRC2]], i64 [[TMP25]]
; LMUL8-NEXT:    [[TMP29:%.*]] = load i8, ptr [[TMP28]], align 1
; LMUL8-NEXT:    [[TMP30:%.*]] = icmp eq i8 [[TMP27]], [[TMP29]]
; LMUL8-NEXT:    br i1 [[TMP30]], label [[MISMATCH_LOOP_INC]], label [[MISMATCH_END]]
; LMUL8:       mismatch_loop_inc:
; LMUL8-NEXT:    [[TMP31]] = add i32 [[MISMATCH_INDEX]], 1
; LMUL8-NEXT:    [[TMP32:%.*]] = icmp eq i32 [[TMP31]], 0
; LMUL8-NEXT:    br i1 [[TMP32]], label [[MISMATCH_END]], label [[MISMATCH_LOOP]]
; LMUL8:       mismatch_end:
; LMUL8-NEXT:    [[MISMATCH_RESULT:%.*]] = phi i32 [ 0, [[MISMATCH_LOOP_INC]] ], [ [[MISMATCH_INDEX]], [[MISMATCH_LOOP]] ], [ 0, [[MISMATCH_VECTOR_LOOP_INC]] ], [ [[TMP24]], [[MISMATCH_VECTOR_LOOP_FOUND]] ]
; LMUL8-NEXT:    br i1 true, label [[BYTE_COMPARE:%.*]], label [[WHILE_COND:%.*]]
; LMUL8:       while.cond:
; LMUL8-NEXT:    [[LEN:%.*]] = phi i32 [ [[MISMATCH_RESULT]], [[WHILE_BODY:%.*]] ], [ 0, [[MISMATCH_END]] ]
; LMUL8-NEXT:    [[INC:%.*]] = add i32 [[LEN]], 1
; LMUL8-NEXT:    [[CMP_NOT:%.*]] = icmp eq i32 [[MISMATCH_RESULT]], 0
; LMUL8-NEXT:    br i1 [[CMP_NOT]], label [[CLEANUP_THREAD:%.*]], label [[WHILE_BODY]]
; LMUL8:       while.body:
; LMUL8-NEXT:    [[IDXPROM:%.*]] = zext i32 [[MISMATCH_RESULT]] to i64
; LMUL8-NEXT:    [[ARRAYIDX:%.*]] = getelementptr i8, ptr [[SRC1]], i64 [[IDXPROM]]
; LMUL8-NEXT:    [[TMP33:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
; LMUL8-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr i8, ptr [[SRC2]], i64 [[IDXPROM]]
; LMUL8-NEXT:    [[TMP34:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
; LMUL8-NEXT:    [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP33]], [[TMP34]]
; LMUL8-NEXT:    br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[IF_END:%.*]]
; LMUL8:       byte.compare:
; LMUL8-NEXT:    [[TMP35:%.*]] = icmp eq i32 [[MISMATCH_RESULT]], 0
; LMUL8-NEXT:    br i1 [[TMP35]], label [[CLEANUP_THREAD]], label [[IF_END]]
; LMUL8:       cleanup.thread:
; LMUL8-NEXT:    ret void
; LMUL8:       if.end:
; LMUL8-NEXT:    [[RES:%.*]] = phi i32 [ [[MISMATCH_RESULT]], [[WHILE_BODY]] ], [ [[MISMATCH_RESULT]], [[BYTE_COMPARE]] ]
; LMUL8-NEXT:    ret void
;
; LOOP-DEL-LABEL: define void @compare_bytes_cleanup_block(
; LOOP-DEL-SAME: ptr [[SRC1:%.*]], ptr [[SRC2:%.*]]) #[[ATTR0]] {
; LOOP-DEL-NEXT:  entry:
; LOOP-DEL-NEXT:    br label [[MISMATCH_LOOP:%.*]]
; LOOP-DEL:       mismatch_loop:
; LOOP-DEL-NEXT:    [[MISMATCH_INDEX:%.*]] = phi i32 [ 1, [[ENTRY:%.*]] ], [ [[TMP6:%.*]], [[MISMATCH_LOOP]] ]
; LOOP-DEL-NEXT:    [[TMP0:%.*]] = zext i32 [[MISMATCH_INDEX]] to i64
; LOOP-DEL-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr [[SRC1]], i64 [[TMP0]]
; LOOP-DEL-NEXT:    [[TMP2:%.*]] = load i8, ptr [[TMP1]], align 1
; LOOP-DEL-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[SRC2]], i64 [[TMP0]]
; LOOP-DEL-NEXT:    [[TMP4:%.*]] = load i8, ptr [[TMP3]], align 1
; LOOP-DEL-NEXT:    [[TMP5:%.*]] = icmp ne i8 [[TMP2]], [[TMP4]]
; LOOP-DEL-NEXT:    [[TMP6]] = add i32 [[MISMATCH_INDEX]], 1
; LOOP-DEL-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP6]], 0
; LOOP-DEL-NEXT:    [[OR_COND:%.*]] = or i1 [[TMP5]], [[TMP7]]
; LOOP-DEL-NEXT:    br i1 [[OR_COND]], label [[COMMON_RET:%.*]], label [[MISMATCH_LOOP]]
; LOOP-DEL:       common.ret:
; LOOP-DEL-NEXT:    ret void
;
; MASKED-LABEL: define void @compare_bytes_cleanup_block(
; MASKED-SAME: ptr [[SRC1:%.*]], ptr [[SRC2:%.*]]) #[[ATTR0]] {
; MASKED-NEXT:  entry:
; MASKED-NEXT:    br label [[MISMATCH_MIN_IT_CHECK:%.*]]
; MASKED:       mismatch_min_it_check:
; MASKED-NEXT:    br i1 false, label [[MISMATCH_MEM_CHECK:%.*]], label [[MISMATCH_LOOP_PRE:%.*]], !prof [[PROF0]]
; MASKED:       mismatch_mem_check:
; MASKED-NEXT:    [[TMP0:%.*]] = getelementptr i8, ptr [[SRC1]], i64 1
; MASKED-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr [[SRC2]], i64 1
; MASKED-NEXT:    [[TMP2:%.*]] = ptrtoint ptr [[TMP1]] to i64
; MASKED-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[TMP0]] to i64
; MASKED-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[SRC1]], i64 0
; MASKED-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[SRC2]], i64 0
; MASKED-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[TMP4]] to i64
; MASKED-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[TMP5]] to i64
; MASKED-NEXT:    [[TMP8:%.*]] = lshr i64 [[TMP3]], 12
; MASKED-NEXT:    [[TMP9:%.*]] = lshr i64 [[TMP6]], 12
; MASKED-NEXT:    [[TMP10:%.*]] = lshr i64 [[TMP2]], 12
; MASKED-NEXT:    [[TMP11:%.*]] = lshr i64 [[TMP7]], 12
; MASKED-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP8]], [[TMP9]]
; MASKED-NEXT:    [[TMP13:%.*]] = icmp ne i64 [[TMP10]], [[TMP11]]
; MASKED-NEXT:    [[TMP14:%.*]] = or i1 [[TMP12]], [[TMP13]]
; MASKED-NEXT:    br i1 [[TMP14]], label [[MISMATCH_LOOP_PRE]], label [[MISMATCH_VEC_LOOP_PREHEADER:%.*]], !prof [[PROF1]]
; MASKED:       mismatch_vec_loop_preheader:
; MASKED-NEXT:    [[TMP15:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 1, i64 0)
; MASKED-NEXT:    [[TMP16:%.*]] = call i64 @llvm.vscale.i64()
; MASKED-NEXT:    [[TMP17:%.*]] = mul nuw nsw i64 [[TMP16]], 16
; MASKED-NEXT:    br label [[MISMATCH_VEC_LOOP:%.*]]
; MASKED:       mismatch_vec_loop:
; MASKED-NEXT:    [[MISMATCH_VEC_LOOP_PRED:%.*]] = phi <vscale x 16 x i1> [ [[TMP15]], [[MISMATCH_VEC_LOOP_PREHEADER]] ], [ [[TMP26:%.*]], [[MISMATCH_VEC_LOOP_INC:%.*]] ]
; MASKED-NEXT:    [[MISMATCH_VEC_INDEX:%.*]] = phi i64 [ 1, [[MISMATCH_VEC_LOOP_PREHEADER]] ], [ [[TMP25:%.*]], [[MISMATCH_VEC_LOOP_INC]] ]
; MASKED-NEXT:    [[TMP18:%.*]] = getelementptr i8, ptr [[SRC1]], i64 [[MISMATCH_VEC_INDEX]]
; MASKED-NEXT:    [[TMP19:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP18]], i32 1, <vscale x 16 x i1> [[MISMATCH_VEC_LOOP_PRED]], <vscale x 16 x i8> zeroinitializer)
; MASKED-NEXT:    [[TMP20:%.*]] = getelementptr i8, ptr [[SRC2]], i64 [[MISMATCH_VEC_INDEX]]
; MASKED-NEXT:    [[TMP21:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP20]], i32 1, <vscale x 16 x i1> [[MISMATCH_VEC_LOOP_PRED]], <vscale x 16 x i8> zeroinitializer)
; MASKED-NEXT:    [[TMP22:%.*]] = icmp ne <vscale x 16 x i8> [[TMP19]], [[TMP21]]
; MASKED-NEXT:    [[TMP23:%.*]] = select <vscale x 16 x i1> [[MISMATCH_VEC_LOOP_PRED]], <vscale x 16 x i1> [[TMP22]], <vscale x 16 x i1> zeroinitializer
; MASKED-NEXT:    [[TMP24:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1(<vscale x 16 x i1> [[TMP23]])
; MASKED-NEXT:    br i1 [[TMP24]], label [[MISMATCH_VEC_LOOP_FOUND:%.*]], label [[MISMATCH_VEC_LOOP_INC]]
; MASKED:       mismatch_vec_loop_inc:
; MASKED-NEXT:    [[TMP25]] = add nuw nsw i64 [[MISMATCH_VEC_INDEX]], [[TMP17]]
; MASKED-NEXT:    [[TMP26]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP25]], i64 0)
; MASKED-NEXT:    [[TMP27:%.*]] = extractelement <vscale x 16 x i1> [[TMP26]], i64 0
; MASKED-NEXT:    br i1 [[TMP27]], label [[MISMATCH_VEC_LOOP]], label [[MISMATCH_END:%.*]]
; MASKED:       mismatch_vec_loop_found:
; MASKED-NEXT:    [[MISMATCH_VEC_FOUND_PRED:%.*]] = phi <vscale x 16 x i1> [ [[TMP23]], [[MISMATCH_VEC_LOOP]] ]
; MASKED-NEXT:    [[MISMATCH_VEC_LAST_LOOP_PRED:%.*]] = phi <vscale x 16 x i1> [ [[MISMATCH_VEC_LOOP_PRED]], [[MISMATCH_VEC_LOOP]] ]
; MASKED-NEXT:    [[MISMATCH_VEC_FOUND_INDEX:%.*]] = phi i64 [ [[MISMATCH_VEC_INDEX]], [[MISMATCH_VEC_LOOP]] ]
; MASKED-NEXT:    [[TMP28:%.*]] = and <vscale x 16 x i1> [[MISMATCH_VEC_LAST_LOOP_PRED]], [[MISMATCH_VEC_FOUND_PRED]]
; MASKED-NEXT:    [[TMP29:%.*]] = call i32 @llvm.experimental.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> [[TMP28]], i1 true)
; MASKED-NEXT:    [[TMP30:%.*]] = zext i32 [[TMP29]] to i64
; MASKED-NEXT:    [[TMP31:%.*]] = add nuw nsw i64 [[MISMATCH_VEC_FOUND_INDEX]], [[TMP30]]
; MASKED-NEXT:    [[TMP32:%.*]] = trunc i64 [[TMP31]] to i32
; MASKED-NEXT:    br label [[MISMATCH_END]]
; MASKED:       mismatch_loop_pre:
; MASKED-NEXT:    br label [[MISMATCH_LOOP:%.*]]
; MASKED:       mismatch_loop:
; MASKED-NEXT:    [[MISMATCH_INDEX:%.*]] = phi i32 [ 1, [[MISMATCH_LOOP_PRE]] ], [ [[TMP39:%.*]], [[MISMATCH_LOOP_INC:%.*]] ]
; MASKED-NEXT:    [[TMP33:%.*]] = zext i32 [[MISMATCH_INDEX]] to i64
; MASKED-NEXT:    [[TMP34:%.*]] = getelementptr i8, ptr [[SRC1]], i64 [[TMP33]]
; MASKED-NEXT:    [[TMP35:%.*]] = load i8, ptr [[TMP34]], align 1
; MASKED-NEXT:    [[TMP36:%.*]] = getelementptr i8, ptr [[SRC2]], i64 [[TMP33]]
; MASKED-NEXT:    [[TMP37:%.*]] = load i8, ptr [[TMP36]], align 1
; MASKED-NEXT:    [[TMP38:%.*]] = icmp eq i8 [[TMP35]], [[TMP37]]
; MASKED-NEXT:    br i1 [[TMP38]], label [[MISMATCH_LOOP_INC]], label [[MISMATCH_END]]
; MASKED:       mismatch_loop_inc:
; MASKED-NEXT:    [[TMP39]] = add i32 [[MISMATCH_INDEX]], 1
; MASKED-NEXT:    [[TMP40:%.*]] = icmp eq i32 [[TMP39]], 0
; MASKED-NEXT:    br i1 [[TMP40]], label [[MISMATCH_END]], label [[MISMATCH_LOOP]]
; MASKED:       mismatch_end:
; MASKED-NEXT:    [[MISMATCH_RESULT:%.*]] = phi i32 [ 0, [[MISMATCH_LOOP_INC]] ], [ [[MISMATCH_INDEX]], [[MISMATCH_LOOP]] ], [ 0, [[MISMATCH_VEC_LOOP_INC]] ], [ [[TMP32]], [[MISMATCH_VEC_LOOP_FOUND]] ]
; MASKED-NEXT:    br i1 true, label [[BYTE_COMPARE:%.*]], label [[WHILE_COND:%.*]]
; MASKED:       while.cond:
; MASKED-NEXT:    [[LEN:%.*]] = phi i32 [ [[MISMATCH_RESULT]], [[WHILE_BODY:%.*]] ], [ 0, [[MISMATCH_END]] ]
; MASKED-NEXT:    [[INC:%.*]] = add i32 [[LEN]], 1
; MASKED-NEXT:    [[CMP_NOT:%.*]] = icmp eq i32 [[MISMATCH_RESULT]], 0
; MASKED-NEXT:    br i1 [[CMP_NOT]], label [[CLEANUP_THREAD:%.*]], label [[WHILE_BODY]]
; MASKED:       while.body:
; MASKED-NEXT:    [[IDXPROM:%.*]] = zext i32 [[MISMATCH_RESULT]] to i64
; MASKED-NEXT:    [[ARRAYIDX:%.*]] = getelementptr i8, ptr [[SRC1]], i64 [[IDXPROM]]
; MASKED-NEXT:    [[TMP41:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
; MASKED-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr i8, ptr [[SRC2]], i64 [[IDXPROM]]
; MASKED-NEXT:    [[TMP42:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
; MASKED-NEXT:    [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP41]], [[TMP42]]
; MASKED-NEXT:    br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[IF_END:%.*]]
; MASKED:       byte.compare:
; MASKED-NEXT:    [[TMP43:%.*]] = icmp eq i32 [[MISMATCH_RESULT]], 0
; MASKED-NEXT:    br i1 [[TMP43]], label [[CLEANUP_THREAD]], label [[IF_END]]
; MASKED:       cleanup.thread:
; MASKED-NEXT:    ret void
; MASKED:       if.end:
; MASKED-NEXT:    [[RES:%.*]] = phi i32 [ [[MISMATCH_RESULT]], [[WHILE_BODY]] ], [ [[MISMATCH_RESULT]], [[BYTE_COMPARE]] ]
; MASKED-NEXT:    ret void
;
; NO-TRANSFORM-LABEL: define void @compare_bytes_cleanup_block(
; NO-TRANSFORM-SAME: ptr [[SRC1:%.*]], ptr [[SRC2:%.*]]) {
; NO-TRANSFORM-NEXT:  entry:
; NO-TRANSFORM-NEXT:    br label [[WHILE_COND:%.*]]
; NO-TRANSFORM:       while.cond:
; NO-TRANSFORM-NEXT:    [[LEN:%.*]] = phi i32 [ [[INC:%.*]], [[WHILE_BODY:%.*]] ], [ 0, [[ENTRY:%.*]] ]
; NO-TRANSFORM-NEXT:    [[INC]] = add i32 [[LEN]], 1
; NO-TRANSFORM-NEXT:    [[CMP_NOT:%.*]] = icmp eq i32 [[INC]], 0
; NO-TRANSFORM-NEXT:    br i1 [[CMP_NOT]], label [[CLEANUP_THREAD:%.*]], label [[WHILE_BODY]]
; NO-TRANSFORM:       while.body:
; NO-TRANSFORM-NEXT:    [[IDXPROM:%.*]] = zext i32 [[INC]] to i64
; NO-TRANSFORM-NEXT:    [[ARRAYIDX:%.*]] = getelementptr i8, ptr [[SRC1]], i64 [[IDXPROM]]
; NO-TRANSFORM-NEXT:    [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
; NO-TRANSFORM-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr i8, ptr [[SRC2]], i64 [[IDXPROM]]
; NO-TRANSFORM-NEXT:    [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
; NO-TRANSFORM-NEXT:    [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP0]], [[TMP1]]
; NO-TRANSFORM-NEXT:    br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[IF_END:%.*]]
; NO-TRANSFORM:       cleanup.thread:
; NO-TRANSFORM-NEXT:    ret void
; NO-TRANSFORM:       if.end:
; NO-TRANSFORM-NEXT:    [[RES:%.*]] = phi i32 [ [[INC]], [[WHILE_BODY]] ]
; NO-TRANSFORM-NEXT:    ret void
entry:
  br label %while.cond

while.cond:
  %len = phi i32 [ %inc, %while.body ], [ 0, %entry ]
  %inc = add i32 %len, 1
  %cmp.not = icmp eq i32 %inc, 0
  br i1 %cmp.not, label %cleanup.thread, label %while.body

while.body:
  %idxprom = zext i32 %inc to i64
  %arrayidx = getelementptr i8, ptr %src1, i64 %idxprom
  %0 = load i8, ptr %arrayidx, align 1
  %arrayidx2 = getelementptr i8, ptr %src2, i64 %idxprom
  %1 = load i8, ptr %arrayidx2, align 1
  %cmp.not2 = icmp eq i8 %0, %1
  br i1 %cmp.not2, label %while.cond, label %if.end

cleanup.thread:
  ret void

if.end:
  %res = phi i32 [ %inc, %while.body ]
  ret void
}

;
; NEGATIVE TESTS
;

; Similar to @compare_bytes_simple, except in the while.end block we have an extra PHI
; with unique values for each incoming block from the loop.
define i32 @compare_bytes_simple2(ptr %a, ptr %b, ptr %c, ptr %d, i32 %len, i32 %n) {
; CHECK-LABEL: define i32 @compare_bytes_simple2(
; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], ptr [[D:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
; CHECK-NEXT:  entry:
; CHECK-NEXT:    br label [[WHILE_COND:%.*]]
; CHECK:       while.cond:
; CHECK-NEXT:    [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[ENTRY:%.*]] ], [ [[INC:%.*]], [[WHILE_BODY:%.*]] ]
; CHECK-NEXT:    [[INC]] = add i32 [[LEN_ADDR]], 1
; CHECK-NEXT:    [[CMP_NOT:%.*]] = icmp eq i32 [[INC]], [[N]]
; CHECK-NEXT:    br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]]
; CHECK:       while.body:
; CHECK-NEXT:    [[IDXPROM:%.*]] = zext i32 [[INC]] to i64
; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]]
; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]]
; CHECK-NEXT:    [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
; CHECK-NEXT:    [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP0]], [[TMP1]]
; CHECK-NEXT:    br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]]
; CHECK:       while.end:
; CHECK-NEXT:    [[INC_LCSSA:%.*]] = phi i32 [ [[INC]], [[WHILE_BODY]] ], [ [[INC]], [[WHILE_COND]] ]
; CHECK-NEXT:    [[FINAL_PTR:%.*]] = phi ptr [ [[C]], [[WHILE_BODY]] ], [ [[D]], [[WHILE_COND]] ]
; CHECK-NEXT:    store i32 [[INC_LCSSA]], ptr [[FINAL_PTR]], align 4
; CHECK-NEXT:    ret i32 [[INC_LCSSA]]
;
; LMUL8-LABEL: define i32 @compare_bytes_simple2(
; LMUL8-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], ptr [[D:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
; LMUL8-NEXT:  entry:
; LMUL8-NEXT:    br label [[WHILE_COND:%.*]]
; LMUL8:       while.cond:
; LMUL8-NEXT:    [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[ENTRY:%.*]] ], [ [[INC:%.*]], [[WHILE_BODY:%.*]] ]
; LMUL8-NEXT:    [[INC]] = add i32 [[LEN_ADDR]], 1
; LMUL8-NEXT:    [[CMP_NOT:%.*]] = icmp eq i32 [[INC]], [[N]]
; LMUL8-NEXT:    br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]]
; LMUL8:       while.body:
; LMUL8-NEXT:    [[IDXPROM:%.*]] = zext i32 [[INC]] to i64
; LMUL8-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]]
; LMUL8-NEXT:    [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
; LMUL8-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]]
; LMUL8-NEXT:    [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
; LMUL8-NEXT:    [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP0]], [[TMP1]]
; LMUL8-NEXT:    br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]]
; LMUL8:       while.end:
; LMUL8-NEXT:    [[INC_LCSSA:%.*]] = phi i32 [ [[INC]], [[WHILE_BODY]] ], [ [[INC]], [[WHILE_COND]] ]
; LMUL8-NEXT:    [[FINAL_PTR:%.*]] = phi ptr [ [[C]], [[WHILE_BODY]] ], [ [[D]], [[WHILE_COND]] ]
; LMUL8-NEXT:    store i32 [[INC_LCSSA]], ptr [[FINAL_PTR]], align 4
; LMUL8-NEXT:    ret i32 [[INC_LCSSA]]
;
; LOOP-DEL-LABEL: define i32 @compare_bytes_simple2(
; LOOP-DEL-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], ptr [[D:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
; LOOP-DEL-NEXT:  entry:
; LOOP-DEL-NEXT:    br label [[WHILE_COND:%.*]]
; LOOP-DEL:       while.cond:
; LOOP-DEL-NEXT:    [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[ENTRY:%.*]] ], [ [[INC:%.*]], [[WHILE_BODY:%.*]] ]
; LOOP-DEL-NEXT:    [[INC]] = add i32 [[LEN_ADDR]], 1
; LOOP-DEL-NEXT:    [[CMP_NOT:%.*]] = icmp eq i32 [[INC]], [[N]]
; LOOP-DEL-NEXT:    br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]]
; LOOP-DEL:       while.body:
; LOOP-DEL-NEXT:    [[IDXPROM:%.*]] = zext i32 [[INC]] to i64
; LOOP-DEL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]]
; LOOP-DEL-NEXT:    [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
; LOOP-DEL-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]]
; LOOP-DEL-NEXT:    [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
; LOOP-DEL-NEXT:    [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP0]], [[TMP1]]
; LOOP-DEL-NEXT:    br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]]
; LOOP-DEL:       while.end:
; LOOP-DEL-NEXT:    [[INC_LCSSA:%.*]] = phi i32 [ [[INC]], [[WHILE_BODY]] ], [ [[INC]], [[WHILE_COND]] ]
; LOOP-DEL-NEXT:    [[FINAL_PTR:%.*]] = phi ptr [ [[C]], [[WHILE_BODY]] ], [ [[D]], [[WHILE_COND]] ]
; LOOP-DEL-NEXT:    store i32 [[INC_LCSSA]], ptr [[FINAL_PTR]], align 4
; LOOP-DEL-NEXT:    ret i32 [[INC_LCSSA]]
;
; MASKED-LABEL: define i32 @compare_bytes_simple2(
; MASKED-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], ptr [[D:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
; MASKED-NEXT:  entry:
; MASKED-NEXT:    br label [[WHILE_COND:%.*]]
; MASKED:       while.cond:
; MASKED-NEXT:    [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[ENTRY:%.*]] ], [ [[INC:%.*]], [[WHILE_BODY:%.*]] ]
; MASKED-NEXT:    [[INC]] = add i32 [[LEN_ADDR]], 1
; MASKED-NEXT:    [[CMP_NOT:%.*]] = icmp eq i32 [[INC]], [[N]]
; MASKED-NEXT:    br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]]
; MASKED:       while.body:
; MASKED-NEXT:    [[IDXPROM:%.*]] = zext i32 [[INC]] to i64
; MASKED-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]]
; MASKED-NEXT:    [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
; MASKED-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]]
; MASKED-NEXT:    [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
; MASKED-NEXT:    [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP0]], [[TMP1]]
; MASKED-NEXT:    br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]]
; MASKED:       while.end:
; MASKED-NEXT:    [[INC_LCSSA:%.*]] = phi i32 [ [[INC]], [[WHILE_BODY]] ], [ [[INC]], [[WHILE_COND]] ]
; MASKED-NEXT:    [[FINAL_PTR:%.*]] = phi ptr [ [[C]], [[WHILE_BODY]] ], [ [[D]], [[WHILE_COND]] ]
; MASKED-NEXT:    store i32 [[INC_LCSSA]], ptr [[FINAL_PTR]], align 4
; MASKED-NEXT:    ret i32 [[INC_LCSSA]]
;
entry:
  br label %while.cond

while.cond:
  %len.addr = phi i32 [ %len, %entry ], [ %inc, %while.body ]
  %inc = add i32 %len.addr, 1
  %cmp.not = icmp eq i32 %inc, %n
  br i1 %cmp.not, label %while.end, label %while.body

while.body:
  %idxprom = zext i32 %inc to i64
  %arrayidx = getelementptr inbounds i8, ptr %a, i64 %idxprom
  %0 = load i8, ptr %arrayidx
  %arrayidx2 = getelementptr inbounds i8, ptr %b, i64 %idxprom
  %1 = load i8, ptr %arrayidx2
  %cmp.not2 = icmp eq i8 %0, %1
  br i1 %cmp.not2, label %while.cond, label %while.end

while.end:
  %inc.lcssa = phi i32 [ %inc, %while.body ], [ %inc, %while.cond ]
  %final_ptr = phi ptr [ %c, %while.body ], [ %d, %while.cond ]
  store i32 %inc.lcssa, ptr %final_ptr
  ret i32 %inc.lcssa
}

define i32 @compare_bytes_simple3(ptr %a, ptr %b, ptr %c, i32 %d, i32 %len, i32 %n) {
; CHECK-LABEL: define i32 @compare_bytes_simple3(
; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[D:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
; CHECK-NEXT:  entry:
; CHECK-NEXT:    br label [[WHILE_COND:%.*]]
; CHECK:       while.cond:
; CHECK-NEXT:    [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[ENTRY:%.*]] ], [ [[INC:%.*]], [[WHILE_BODY:%.*]] ]
; CHECK-NEXT:    [[INC]] = add i32 [[LEN_ADDR]], 1
; CHECK-NEXT:    [[CMP_NOT:%.*]] = icmp eq i32 [[INC]], [[N]]
; CHECK-NEXT:    br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]]
; CHECK:       while.body:
; CHECK-NEXT:    [[IDXPROM:%.*]] = zext i32 [[INC]] to i64
; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]]
; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]]
; CHECK-NEXT:    [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
; CHECK-NEXT:    [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP0]], [[TMP1]]
; CHECK-NEXT:    br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]]
; CHECK:       while.end:
; CHECK-NEXT:    [[FINAL_VAL:%.*]] = phi i32 [ [[D]], [[WHILE_BODY]] ], [ [[INC]], [[WHILE_COND]] ]
; CHECK-NEXT:    store i32 [[FINAL_VAL]], ptr [[C]], align 4
; CHECK-NEXT:    ret i32 [[FINAL_VAL]]
;
; LMUL8-LABEL: define i32 @compare_bytes_simple3(
; LMUL8-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[D:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
; LMUL8-NEXT:  entry:
; LMUL8-NEXT:    br label [[WHILE_COND:%.*]]
; LMUL8:       while.cond:
; LMUL8-NEXT:    [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[ENTRY:%.*]] ], [ [[INC:%.*]], [[WHILE_BODY:%.*]] ]
; LMUL8-NEXT:    [[INC]] = add i32 [[LEN_ADDR]], 1
; LMUL8-NEXT:    [[CMP_NOT:%.*]] = icmp eq i32 [[INC]], [[N]]
; LMUL8-NEXT:    br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]]
; LMUL8:       while.body:
; LMUL8-NEXT:    [[IDXPROM:%.*]] = zext i32 [[INC]] to i64
; LMUL8-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]]
; LMUL8-NEXT:    [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
; LMUL8-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]]
; LMUL8-NEXT:    [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
; LMUL8-NEXT:    [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP0]], [[TMP1]]
; LMUL8-NEXT:    br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]]
; LMUL8:       while.end:
; LMUL8-NEXT:    [[FINAL_VAL:%.*]] = phi i32 [ [[D]], [[WHILE_BODY]] ], [ [[INC]], [[WHILE_COND]] ]
; LMUL8-NEXT:    store i32 [[FINAL_VAL]], ptr [[C]], align 4
; LMUL8-NEXT:    ret i32 [[FINAL_VAL]]
;
; LOOP-DEL-LABEL: define i32 @compare_bytes_simple3(
; LOOP-DEL-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[D:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
; LOOP-DEL-NEXT:  entry:
; LOOP-DEL-NEXT:    br label [[WHILE_COND:%.*]]
; LOOP-DEL:       while.cond:
; LOOP-DEL-NEXT:    [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[ENTRY:%.*]] ], [ [[INC:%.*]], [[WHILE_BODY:%.*]] ]
; LOOP-DEL-NEXT:    [[INC]] = add i32 [[LEN_ADDR]], 1
; LOOP-DEL-NEXT:    [[CMP_NOT:%.*]] = icmp eq i32 [[INC]], [[N]]
; LOOP-DEL-NEXT:    br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]]
; LOOP-DEL:       while.body:
; LOOP-DEL-NEXT:    [[IDXPROM:%.*]] = zext i32 [[INC]] to i64
; LOOP-DEL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]]
; LOOP-DEL-NEXT:    [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
; LOOP-DEL-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]]
; LOOP-DEL-NEXT:    [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
; LOOP-DEL-NEXT:    [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP0]], [[TMP1]]
; LOOP-DEL-NEXT:    br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]]
; LOOP-DEL:       while.end:
; LOOP-DEL-NEXT:    [[FINAL_VAL:%.*]] = phi i32 [ [[D]], [[WHILE_BODY]] ], [ [[INC]], [[WHILE_COND]] ]
; LOOP-DEL-NEXT:    store i32 [[FINAL_VAL]], ptr [[C]], align 4
; LOOP-DEL-NEXT:    ret i32 [[FINAL_VAL]]
;
; MASKED-LABEL: define i32 @compare_bytes_simple3(
; MASKED-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[D:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
; MASKED-NEXT:  entry:
; MASKED-NEXT:    br label [[WHILE_COND:%.*]]
; MASKED:       while.cond:
; MASKED-NEXT:    [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[ENTRY:%.*]] ], [ [[INC:%.*]], [[WHILE_BODY:%.*]] ]
; MASKED-NEXT:    [[INC]] = add i32 [[LEN_ADDR]], 1
; MASKED-NEXT:    [[CMP_NOT:%.*]] = icmp eq i32 [[INC]], [[N]]
; MASKED-NEXT:    br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]]
; MASKED:       while.body:
; MASKED-NEXT:    [[IDXPROM:%.*]] = zext i32 [[INC]] to i64
; MASKED-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]]
; MASKED-NEXT:    [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
; MASKED-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]]
; MASKED-NEXT:    [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
; MASKED-NEXT:    [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP0]], [[TMP1]]
; MASKED-NEXT:    br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]]
; MASKED:       while.end:
; MASKED-NEXT:    [[FINAL_VAL:%.*]] = phi i32 [ [[D]], [[WHILE_BODY]] ], [ [[INC]], [[WHILE_COND]] ]
; MASKED-NEXT:    store i32 [[FINAL_VAL]], ptr [[C]], align 4
; MASKED-NEXT:    ret i32 [[FINAL_VAL]]
;
  entry:
  br label %while.cond

  while.cond:
  %len.addr = phi i32 [ %len, %entry ], [ %inc, %while.body ]
  %inc = add i32 %len.addr, 1
  %cmp.not = icmp eq i32 %inc, %n
  br i1 %cmp.not, label %while.end, label %while.body

  while.body:
  %idxprom = zext i32 %inc to i64
  %arrayidx = getelementptr inbounds i8, ptr %a, i64 %idxprom
  %0 = load i8, ptr %arrayidx
  %arrayidx2 = getelementptr inbounds i8, ptr %b, i64 %idxprom
  %1 = load i8, ptr %arrayidx2
  %cmp.not2 = icmp eq i8 %0, %1
  br i1 %cmp.not2, label %while.cond, label %while.end

  while.end:
  %final_val = phi i32 [ %d, %while.body ], [ %inc, %while.cond ]
  store i32 %final_val, ptr %c
  ret i32 %final_val
}

; Disable the optimization when noimplicitfloat is present.
define i32 @no_implicit_float(ptr %a, ptr %b, i32 %len, i32 %n) noimplicitfloat {
; CHECK-LABEL: define i32 @no_implicit_float(
; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) #[[ATTR1:[0-9]+]] {
; CHECK-NEXT:  entry:
; CHECK-NEXT:    br label [[WHILE_COND:%.*]]
; CHECK:       while.cond:
; CHECK-NEXT:    [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[ENTRY:%.*]] ], [ [[INC:%.*]], [[WHILE_BODY:%.*]] ]
; CHECK-NEXT:    [[INC]] = add i32 [[LEN_ADDR]], 1
; CHECK-NEXT:    [[CMP_NOT:%.*]] = icmp eq i32 [[INC]], [[N]]
; CHECK-NEXT:    br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]]
; CHECK:       while.body:
; CHECK-NEXT:    [[IDXPROM:%.*]] = zext i32 [[INC]] to i64
; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]]
; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]]
; CHECK-NEXT:    [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
; CHECK-NEXT:    [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP0]], [[TMP1]]
; CHECK-NEXT:    br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]]
; CHECK:       while.end:
; CHECK-NEXT:    [[INC_LCSSA:%.*]] = phi i32 [ [[INC]], [[WHILE_BODY]] ], [ [[INC]], [[WHILE_COND]] ]
; CHECK-NEXT:    ret i32 [[INC_LCSSA]]
;
; LMUL8-LABEL: define i32 @no_implicit_float(
; LMUL8-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) #[[ATTR1:[0-9]+]] {
; LMUL8-NEXT:  entry:
; LMUL8-NEXT:    br label [[WHILE_COND:%.*]]
; LMUL8:       while.cond:
; LMUL8-NEXT:    [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[ENTRY:%.*]] ], [ [[INC:%.*]], [[WHILE_BODY:%.*]] ]
; LMUL8-NEXT:    [[INC]] = add i32 [[LEN_ADDR]], 1
; LMUL8-NEXT:    [[CMP_NOT:%.*]] = icmp eq i32 [[INC]], [[N]]
; LMUL8-NEXT:    br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]]
; LMUL8:       while.body:
; LMUL8-NEXT:    [[IDXPROM:%.*]] = zext i32 [[INC]] to i64
; LMUL8-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]]
; LMUL8-NEXT:    [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
; LMUL8-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]]
; LMUL8-NEXT:    [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
; LMUL8-NEXT:    [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP0]], [[TMP1]]
; LMUL8-NEXT:    br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]]
; LMUL8:       while.end:
; LMUL8-NEXT:    [[INC_LCSSA:%.*]] = phi i32 [ [[INC]], [[WHILE_BODY]] ], [ [[INC]], [[WHILE_COND]] ]
; LMUL8-NEXT:    ret i32 [[INC_LCSSA]]
;
; LOOP-DEL-LABEL: define i32 @no_implicit_float(
; LOOP-DEL-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) #[[ATTR1:[0-9]+]] {
; LOOP-DEL-NEXT:  entry:
; LOOP-DEL-NEXT:    br label [[WHILE_COND:%.*]]
; LOOP-DEL:       while.cond:
; LOOP-DEL-NEXT:    [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[ENTRY:%.*]] ], [ [[INC:%.*]], [[WHILE_BODY:%.*]] ]
; LOOP-DEL-NEXT:    [[INC]] = add i32 [[LEN_ADDR]], 1
; LOOP-DEL-NEXT:    [[CMP_NOT:%.*]] = icmp eq i32 [[INC]], [[N]]
; LOOP-DEL-NEXT:    br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]]
; LOOP-DEL:       while.body:
; LOOP-DEL-NEXT:    [[IDXPROM:%.*]] = zext i32 [[INC]] to i64
; LOOP-DEL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]]
; LOOP-DEL-NEXT:    [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
; LOOP-DEL-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]]
; LOOP-DEL-NEXT:    [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
; LOOP-DEL-NEXT:    [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP0]], [[TMP1]]
; LOOP-DEL-NEXT:    br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]]
; LOOP-DEL:       while.end:
; LOOP-DEL-NEXT:    [[INC_LCSSA:%.*]] = phi i32 [ [[INC]], [[WHILE_BODY]] ], [ [[INC]], [[WHILE_COND]] ]
; LOOP-DEL-NEXT:    ret i32 [[INC_LCSSA]]
;
; MASKED-LABEL: define i32 @no_implicit_float(
; MASKED-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) #[[ATTR1:[0-9]+]] {
; MASKED-NEXT:  entry:
; MASKED-NEXT:    br label [[WHILE_COND:%.*]]
; MASKED:       while.cond:
; MASKED-NEXT:    [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[ENTRY:%.*]] ], [ [[INC:%.*]], [[WHILE_BODY:%.*]] ]
; MASKED-NEXT:    [[INC]] = add i32 [[LEN_ADDR]], 1
; MASKED-NEXT:    [[CMP_NOT:%.*]] = icmp eq i32 [[INC]], [[N]]
; MASKED-NEXT:    br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]]
; MASKED:       while.body:
; MASKED-NEXT:    [[IDXPROM:%.*]] = zext i32 [[INC]] to i64
; MASKED-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]]
; MASKED-NEXT:    [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
; MASKED-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]]
; MASKED-NEXT:    [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
; MASKED-NEXT:    [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP0]], [[TMP1]]
; MASKED-NEXT:    br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]]
; MASKED:       while.end:
; MASKED-NEXT:    [[INC_LCSSA:%.*]] = phi i32 [ [[INC]], [[WHILE_BODY]] ], [ [[INC]], [[WHILE_COND]] ]
; MASKED-NEXT:    ret i32 [[INC_LCSSA]]
;
entry:
  br label %while.cond

while.cond:
  %len.addr = phi i32 [ %len, %entry ], [ %inc, %while.body ]
  %inc = add i32 %len.addr, 1
  %cmp.not = icmp eq i32 %inc, %n
  br i1 %cmp.not, label %while.end, label %while.body

while.body:
  %idxprom = zext i32 %inc to i64
  %arrayidx = getelementptr inbounds i8, ptr %a, i64 %idxprom
  %0 = load i8, ptr %arrayidx
  %arrayidx2 = getelementptr inbounds i8, ptr %b, i64 %idxprom
  %1 = load i8, ptr %arrayidx2
  %cmp.not2 = icmp eq i8 %0, %1
  br i1 %cmp.not2, label %while.cond, label %while.end

while.end:
  %inc.lcssa = phi i32 [ %inc, %while.body ], [ %inc, %while.cond ]
  ret i32 %inc.lcssa
}