llvm/llvm/test/Transforms/LoopVectorize/X86/strided_load_cost.ll

; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -passes=loop-vectorize -S -o - | FileCheck %s
; RUN: opt < %s -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -o - | FileCheck --check-prefix=MAX-BW %s

target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"

; This test checks that the given loop still beneficial for vecotization
; even if it contains scalarized load (gather on AVX2)

; Function Attrs: norecurse nounwind readonly uwtable
define i32 @matrix_row_col(ptr nocapture readonly %data, i32 %i, i32 %j) local_unnamed_addr #0 {
; CHECK-LABEL: @matrix_row_col(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[IDXPROM:%.*]] = sext i32 [[I:%.*]] to i64
; CHECK-NEXT:    [[IDXPROM5:%.*]] = sext i32 [[J:%.*]] to i64
; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK:       vector.ph:
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP144:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[VEC_PHI1:%.*]] = phi <8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP145:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[VEC_PHI2:%.*]] = phi <8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP146:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[VEC_PHI3:%.*]] = phi <8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP147:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 1
; CHECK-NEXT:    [[TMP2:%.*]] = add i64 [[INDEX]], 2
; CHECK-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 3
; CHECK-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 4
; CHECK-NEXT:    [[TMP5:%.*]] = add i64 [[INDEX]], 5
; CHECK-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 6
; CHECK-NEXT:    [[TMP7:%.*]] = add i64 [[INDEX]], 7
; CHECK-NEXT:    [[TMP8:%.*]] = add i64 [[INDEX]], 8
; CHECK-NEXT:    [[TMP9:%.*]] = add i64 [[INDEX]], 9
; CHECK-NEXT:    [[TMP10:%.*]] = add i64 [[INDEX]], 10
; CHECK-NEXT:    [[TMP11:%.*]] = add i64 [[INDEX]], 11
; CHECK-NEXT:    [[TMP12:%.*]] = add i64 [[INDEX]], 12
; CHECK-NEXT:    [[TMP13:%.*]] = add i64 [[INDEX]], 13
; CHECK-NEXT:    [[TMP14:%.*]] = add i64 [[INDEX]], 14
; CHECK-NEXT:    [[TMP15:%.*]] = add i64 [[INDEX]], 15
; CHECK-NEXT:    [[TMP16:%.*]] = add i64 [[INDEX]], 16
; CHECK-NEXT:    [[TMP17:%.*]] = add i64 [[INDEX]], 17
; CHECK-NEXT:    [[TMP18:%.*]] = add i64 [[INDEX]], 18
; CHECK-NEXT:    [[TMP19:%.*]] = add i64 [[INDEX]], 19
; CHECK-NEXT:    [[TMP20:%.*]] = add i64 [[INDEX]], 20
; CHECK-NEXT:    [[TMP21:%.*]] = add i64 [[INDEX]], 21
; CHECK-NEXT:    [[TMP22:%.*]] = add i64 [[INDEX]], 22
; CHECK-NEXT:    [[TMP23:%.*]] = add i64 [[INDEX]], 23
; CHECK-NEXT:    [[TMP24:%.*]] = add i64 [[INDEX]], 24
; CHECK-NEXT:    [[TMP25:%.*]] = add i64 [[INDEX]], 25
; CHECK-NEXT:    [[TMP26:%.*]] = add i64 [[INDEX]], 26
; CHECK-NEXT:    [[TMP27:%.*]] = add i64 [[INDEX]], 27
; CHECK-NEXT:    [[TMP28:%.*]] = add i64 [[INDEX]], 28
; CHECK-NEXT:    [[TMP29:%.*]] = add i64 [[INDEX]], 29
; CHECK-NEXT:    [[TMP30:%.*]] = add i64 [[INDEX]], 30
; CHECK-NEXT:    [[TMP31:%.*]] = add i64 [[INDEX]], 31
; CHECK-NEXT:    [[TMP32:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA:%.*]], i64 [[IDXPROM]], i64 [[TMP0]]
; CHECK-NEXT:    [[TMP36:%.*]] = getelementptr inbounds i32, ptr [[TMP32]], i32 0
; CHECK-NEXT:    [[TMP37:%.*]] = getelementptr inbounds i32, ptr [[TMP32]], i32 8
; CHECK-NEXT:    [[TMP38:%.*]] = getelementptr inbounds i32, ptr [[TMP32]], i32 16
; CHECK-NEXT:    [[TMP39:%.*]] = getelementptr inbounds i32, ptr [[TMP32]], i32 24
; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i32>, ptr [[TMP36]], align 4, !tbaa [[TBAA1:![0-9]+]]
; CHECK-NEXT:    [[WIDE_LOAD4:%.*]] = load <8 x i32>, ptr [[TMP37]], align 4, !tbaa [[TBAA1]]
; CHECK-NEXT:    [[WIDE_LOAD5:%.*]] = load <8 x i32>, ptr [[TMP38]], align 4, !tbaa [[TBAA1]]
; CHECK-NEXT:    [[WIDE_LOAD6:%.*]] = load <8 x i32>, ptr [[TMP39]], align 4, !tbaa [[TBAA1]]
; CHECK-NEXT:    [[TMP40:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP0]], i64 [[IDXPROM5]]
; CHECK-NEXT:    [[TMP41:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP1]], i64 [[IDXPROM5]]
; CHECK-NEXT:    [[TMP42:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP2]], i64 [[IDXPROM5]]
; CHECK-NEXT:    [[TMP43:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP3]], i64 [[IDXPROM5]]
; CHECK-NEXT:    [[TMP44:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP4]], i64 [[IDXPROM5]]
; CHECK-NEXT:    [[TMP45:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP5]], i64 [[IDXPROM5]]
; CHECK-NEXT:    [[TMP46:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP6]], i64 [[IDXPROM5]]
; CHECK-NEXT:    [[TMP47:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP7]], i64 [[IDXPROM5]]
; CHECK-NEXT:    [[TMP48:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP8]], i64 [[IDXPROM5]]
; CHECK-NEXT:    [[TMP49:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP9]], i64 [[IDXPROM5]]
; CHECK-NEXT:    [[TMP50:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP10]], i64 [[IDXPROM5]]
; CHECK-NEXT:    [[TMP51:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP11]], i64 [[IDXPROM5]]
; CHECK-NEXT:    [[TMP52:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP12]], i64 [[IDXPROM5]]
; CHECK-NEXT:    [[TMP53:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP13]], i64 [[IDXPROM5]]
; CHECK-NEXT:    [[TMP54:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP14]], i64 [[IDXPROM5]]
; CHECK-NEXT:    [[TMP55:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP15]], i64 [[IDXPROM5]]
; CHECK-NEXT:    [[TMP56:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP16]], i64 [[IDXPROM5]]
; CHECK-NEXT:    [[TMP57:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP17]], i64 [[IDXPROM5]]
; CHECK-NEXT:    [[TMP58:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP18]], i64 [[IDXPROM5]]
; CHECK-NEXT:    [[TMP59:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP19]], i64 [[IDXPROM5]]
; CHECK-NEXT:    [[TMP60:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP20]], i64 [[IDXPROM5]]
; CHECK-NEXT:    [[TMP61:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP21]], i64 [[IDXPROM5]]
; CHECK-NEXT:    [[TMP62:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP22]], i64 [[IDXPROM5]]
; CHECK-NEXT:    [[TMP63:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP23]], i64 [[IDXPROM5]]
; CHECK-NEXT:    [[TMP64:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP24]], i64 [[IDXPROM5]]
; CHECK-NEXT:    [[TMP65:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP25]], i64 [[IDXPROM5]]
; CHECK-NEXT:    [[TMP66:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP26]], i64 [[IDXPROM5]]
; CHECK-NEXT:    [[TMP67:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP27]], i64 [[IDXPROM5]]
; CHECK-NEXT:    [[TMP68:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP28]], i64 [[IDXPROM5]]
; CHECK-NEXT:    [[TMP69:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP29]], i64 [[IDXPROM5]]
; CHECK-NEXT:    [[TMP70:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP30]], i64 [[IDXPROM5]]
; CHECK-NEXT:    [[TMP71:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP31]], i64 [[IDXPROM5]]
; CHECK-NEXT:    [[TMP72:%.*]] = load i32, ptr [[TMP40]], align 4, !tbaa [[TBAA1]]
; CHECK-NEXT:    [[TMP73:%.*]] = load i32, ptr [[TMP41]], align 4, !tbaa [[TBAA1]]
; CHECK-NEXT:    [[TMP74:%.*]] = load i32, ptr [[TMP42]], align 4, !tbaa [[TBAA1]]
; CHECK-NEXT:    [[TMP75:%.*]] = load i32, ptr [[TMP43]], align 4, !tbaa [[TBAA1]]
; CHECK-NEXT:    [[TMP76:%.*]] = load i32, ptr [[TMP44]], align 4, !tbaa [[TBAA1]]
; CHECK-NEXT:    [[TMP77:%.*]] = load i32, ptr [[TMP45]], align 4, !tbaa [[TBAA1]]
; CHECK-NEXT:    [[TMP78:%.*]] = load i32, ptr [[TMP46]], align 4, !tbaa [[TBAA1]]
; CHECK-NEXT:    [[TMP79:%.*]] = load i32, ptr [[TMP47]], align 4, !tbaa [[TBAA1]]
; CHECK-NEXT:    [[TMP80:%.*]] = insertelement <8 x i32> poison, i32 [[TMP72]], i32 0
; CHECK-NEXT:    [[TMP81:%.*]] = insertelement <8 x i32> [[TMP80]], i32 [[TMP73]], i32 1
; CHECK-NEXT:    [[TMP82:%.*]] = insertelement <8 x i32> [[TMP81]], i32 [[TMP74]], i32 2
; CHECK-NEXT:    [[TMP83:%.*]] = insertelement <8 x i32> [[TMP82]], i32 [[TMP75]], i32 3
; CHECK-NEXT:    [[TMP84:%.*]] = insertelement <8 x i32> [[TMP83]], i32 [[TMP76]], i32 4
; CHECK-NEXT:    [[TMP85:%.*]] = insertelement <8 x i32> [[TMP84]], i32 [[TMP77]], i32 5
; CHECK-NEXT:    [[TMP86:%.*]] = insertelement <8 x i32> [[TMP85]], i32 [[TMP78]], i32 6
; CHECK-NEXT:    [[TMP87:%.*]] = insertelement <8 x i32> [[TMP86]], i32 [[TMP79]], i32 7
; CHECK-NEXT:    [[TMP88:%.*]] = load i32, ptr [[TMP48]], align 4, !tbaa [[TBAA1]]
; CHECK-NEXT:    [[TMP89:%.*]] = load i32, ptr [[TMP49]], align 4, !tbaa [[TBAA1]]
; CHECK-NEXT:    [[TMP90:%.*]] = load i32, ptr [[TMP50]], align 4, !tbaa [[TBAA1]]
; CHECK-NEXT:    [[TMP91:%.*]] = load i32, ptr [[TMP51]], align 4, !tbaa [[TBAA1]]
; CHECK-NEXT:    [[TMP92:%.*]] = load i32, ptr [[TMP52]], align 4, !tbaa [[TBAA1]]
; CHECK-NEXT:    [[TMP93:%.*]] = load i32, ptr [[TMP53]], align 4, !tbaa [[TBAA1]]
; CHECK-NEXT:    [[TMP94:%.*]] = load i32, ptr [[TMP54]], align 4, !tbaa [[TBAA1]]
; CHECK-NEXT:    [[TMP95:%.*]] = load i32, ptr [[TMP55]], align 4, !tbaa [[TBAA1]]
; CHECK-NEXT:    [[TMP96:%.*]] = insertelement <8 x i32> poison, i32 [[TMP88]], i32 0
; CHECK-NEXT:    [[TMP97:%.*]] = insertelement <8 x i32> [[TMP96]], i32 [[TMP89]], i32 1
; CHECK-NEXT:    [[TMP98:%.*]] = insertelement <8 x i32> [[TMP97]], i32 [[TMP90]], i32 2
; CHECK-NEXT:    [[TMP99:%.*]] = insertelement <8 x i32> [[TMP98]], i32 [[TMP91]], i32 3
; CHECK-NEXT:    [[TMP100:%.*]] = insertelement <8 x i32> [[TMP99]], i32 [[TMP92]], i32 4
; CHECK-NEXT:    [[TMP101:%.*]] = insertelement <8 x i32> [[TMP100]], i32 [[TMP93]], i32 5
; CHECK-NEXT:    [[TMP102:%.*]] = insertelement <8 x i32> [[TMP101]], i32 [[TMP94]], i32 6
; CHECK-NEXT:    [[TMP103:%.*]] = insertelement <8 x i32> [[TMP102]], i32 [[TMP95]], i32 7
; CHECK-NEXT:    [[TMP104:%.*]] = load i32, ptr [[TMP56]], align 4, !tbaa [[TBAA1]]
; CHECK-NEXT:    [[TMP105:%.*]] = load i32, ptr [[TMP57]], align 4, !tbaa [[TBAA1]]
; CHECK-NEXT:    [[TMP106:%.*]] = load i32, ptr [[TMP58]], align 4, !tbaa [[TBAA1]]
; CHECK-NEXT:    [[TMP107:%.*]] = load i32, ptr [[TMP59]], align 4, !tbaa [[TBAA1]]
; CHECK-NEXT:    [[TMP108:%.*]] = load i32, ptr [[TMP60]], align 4, !tbaa [[TBAA1]]
; CHECK-NEXT:    [[TMP109:%.*]] = load i32, ptr [[TMP61]], align 4, !tbaa [[TBAA1]]
; CHECK-NEXT:    [[TMP110:%.*]] = load i32, ptr [[TMP62]], align 4, !tbaa [[TBAA1]]
; CHECK-NEXT:    [[TMP111:%.*]] = load i32, ptr [[TMP63]], align 4, !tbaa [[TBAA1]]
; CHECK-NEXT:    [[TMP112:%.*]] = insertelement <8 x i32> poison, i32 [[TMP104]], i32 0
; CHECK-NEXT:    [[TMP113:%.*]] = insertelement <8 x i32> [[TMP112]], i32 [[TMP105]], i32 1
; CHECK-NEXT:    [[TMP114:%.*]] = insertelement <8 x i32> [[TMP113]], i32 [[TMP106]], i32 2
; CHECK-NEXT:    [[TMP115:%.*]] = insertelement <8 x i32> [[TMP114]], i32 [[TMP107]], i32 3
; CHECK-NEXT:    [[TMP116:%.*]] = insertelement <8 x i32> [[TMP115]], i32 [[TMP108]], i32 4
; CHECK-NEXT:    [[TMP117:%.*]] = insertelement <8 x i32> [[TMP116]], i32 [[TMP109]], i32 5
; CHECK-NEXT:    [[TMP118:%.*]] = insertelement <8 x i32> [[TMP117]], i32 [[TMP110]], i32 6
; CHECK-NEXT:    [[TMP119:%.*]] = insertelement <8 x i32> [[TMP118]], i32 [[TMP111]], i32 7
; CHECK-NEXT:    [[TMP120:%.*]] = load i32, ptr [[TMP64]], align 4, !tbaa [[TBAA1]]
; CHECK-NEXT:    [[TMP121:%.*]] = load i32, ptr [[TMP65]], align 4, !tbaa [[TBAA1]]
; CHECK-NEXT:    [[TMP122:%.*]] = load i32, ptr [[TMP66]], align 4, !tbaa [[TBAA1]]
; CHECK-NEXT:    [[TMP123:%.*]] = load i32, ptr [[TMP67]], align 4, !tbaa [[TBAA1]]
; CHECK-NEXT:    [[TMP124:%.*]] = load i32, ptr [[TMP68]], align 4, !tbaa [[TBAA1]]
; CHECK-NEXT:    [[TMP125:%.*]] = load i32, ptr [[TMP69]], align 4, !tbaa [[TBAA1]]
; CHECK-NEXT:    [[TMP126:%.*]] = load i32, ptr [[TMP70]], align 4, !tbaa [[TBAA1]]
; CHECK-NEXT:    [[TMP127:%.*]] = load i32, ptr [[TMP71]], align 4, !tbaa [[TBAA1]]
; CHECK-NEXT:    [[TMP128:%.*]] = insertelement <8 x i32> poison, i32 [[TMP120]], i32 0
; CHECK-NEXT:    [[TMP129:%.*]] = insertelement <8 x i32> [[TMP128]], i32 [[TMP121]], i32 1
; CHECK-NEXT:    [[TMP130:%.*]] = insertelement <8 x i32> [[TMP129]], i32 [[TMP122]], i32 2
; CHECK-NEXT:    [[TMP131:%.*]] = insertelement <8 x i32> [[TMP130]], i32 [[TMP123]], i32 3
; CHECK-NEXT:    [[TMP132:%.*]] = insertelement <8 x i32> [[TMP131]], i32 [[TMP124]], i32 4
; CHECK-NEXT:    [[TMP133:%.*]] = insertelement <8 x i32> [[TMP132]], i32 [[TMP125]], i32 5
; CHECK-NEXT:    [[TMP134:%.*]] = insertelement <8 x i32> [[TMP133]], i32 [[TMP126]], i32 6
; CHECK-NEXT:    [[TMP135:%.*]] = insertelement <8 x i32> [[TMP134]], i32 [[TMP127]], i32 7
; CHECK-NEXT:    [[TMP136:%.*]] = mul nsw <8 x i32> [[TMP87]], [[WIDE_LOAD]]
; CHECK-NEXT:    [[TMP137:%.*]] = mul nsw <8 x i32> [[TMP103]], [[WIDE_LOAD4]]
; CHECK-NEXT:    [[TMP138:%.*]] = mul nsw <8 x i32> [[TMP119]], [[WIDE_LOAD5]]
; CHECK-NEXT:    [[TMP139:%.*]] = mul nsw <8 x i32> [[TMP135]], [[WIDE_LOAD6]]
; CHECK-NEXT:    [[TMP140:%.*]] = add <8 x i32> [[VEC_PHI]], <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
; CHECK-NEXT:    [[TMP141:%.*]] = add <8 x i32> [[VEC_PHI1]], <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
; CHECK-NEXT:    [[TMP142:%.*]] = add <8 x i32> [[VEC_PHI2]], <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
; CHECK-NEXT:    [[TMP143:%.*]] = add <8 x i32> [[VEC_PHI3]], <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
; CHECK-NEXT:    [[TMP144]] = add <8 x i32> [[TMP140]], [[TMP136]]
; CHECK-NEXT:    [[TMP145]] = add <8 x i32> [[TMP141]], [[TMP137]]
; CHECK-NEXT:    [[TMP146]] = add <8 x i32> [[TMP142]], [[TMP138]]
; CHECK-NEXT:    [[TMP147]] = add <8 x i32> [[TMP143]], [[TMP139]]
; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
; CHECK-NEXT:    [[TMP148:%.*]] = icmp eq i64 [[INDEX_NEXT]], 96
; CHECK-NEXT:    br i1 [[TMP148]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
; CHECK:       middle.block:
; CHECK-NEXT:    [[BIN_RDX:%.*]] = add <8 x i32> [[TMP145]], [[TMP144]]
; CHECK-NEXT:    [[BIN_RDX7:%.*]] = add <8 x i32> [[TMP146]], [[BIN_RDX]]
; CHECK-NEXT:    [[BIN_RDX8:%.*]] = add <8 x i32> [[TMP147]], [[BIN_RDX7]]
; CHECK-NEXT:    [[TMP149:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[BIN_RDX8]])
; CHECK-NEXT:    br i1 false, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
; CHECK:       scalar.ph:
; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 96, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP149]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       for.cond.cleanup:
; CHECK-NEXT:    [[ADD7_LCSSA:%.*]] = phi i32 [ [[ADD7:%.*]], [[FOR_BODY]] ], [ [[TMP149]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT:    ret i32 [[ADD7_LCSSA]]
; CHECK:       for.body:
; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT:    [[SUM_015:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD7]], [[FOR_BODY]] ]
; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[IDXPROM]], i64 [[INDVARS_IV]]
; CHECK-NEXT:    [[TMP150:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4, !tbaa [[TBAA1]]
; CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[INDVARS_IV]], i64 [[IDXPROM5]]
; CHECK-NEXT:    [[TMP151:%.*]] = load i32, ptr [[ARRAYIDX6]], align 4, !tbaa [[TBAA1]]
; CHECK-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP151]], [[TMP150]]
; CHECK-NEXT:    [[ADD:%.*]] = add i32 [[SUM_015]], 4
; CHECK-NEXT:    [[ADD7]] = add i32 [[ADD]], [[MUL]]
; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 100
; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
;
; MAX-BW-LABEL: @matrix_row_col(
; MAX-BW-NEXT:  entry:
; MAX-BW-NEXT:    [[IDXPROM:%.*]] = sext i32 [[I:%.*]] to i64
; MAX-BW-NEXT:    [[IDXPROM5:%.*]] = sext i32 [[J:%.*]] to i64
; MAX-BW-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; MAX-BW:       vector.ph:
; MAX-BW-NEXT:    br label [[VECTOR_BODY:%.*]]
; MAX-BW:       vector.body:
; MAX-BW-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; MAX-BW-NEXT:    [[VEC_PHI:%.*]] = phi <8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP144:%.*]], [[VECTOR_BODY]] ]
; MAX-BW-NEXT:    [[VEC_PHI1:%.*]] = phi <8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP145:%.*]], [[VECTOR_BODY]] ]
; MAX-BW-NEXT:    [[VEC_PHI2:%.*]] = phi <8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP146:%.*]], [[VECTOR_BODY]] ]
; MAX-BW-NEXT:    [[VEC_PHI3:%.*]] = phi <8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP147:%.*]], [[VECTOR_BODY]] ]
; MAX-BW-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
; MAX-BW-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 1
; MAX-BW-NEXT:    [[TMP2:%.*]] = add i64 [[INDEX]], 2
; MAX-BW-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 3
; MAX-BW-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 4
; MAX-BW-NEXT:    [[TMP5:%.*]] = add i64 [[INDEX]], 5
; MAX-BW-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 6
; MAX-BW-NEXT:    [[TMP7:%.*]] = add i64 [[INDEX]], 7
; MAX-BW-NEXT:    [[TMP8:%.*]] = add i64 [[INDEX]], 8
; MAX-BW-NEXT:    [[TMP9:%.*]] = add i64 [[INDEX]], 9
; MAX-BW-NEXT:    [[TMP10:%.*]] = add i64 [[INDEX]], 10
; MAX-BW-NEXT:    [[TMP11:%.*]] = add i64 [[INDEX]], 11
; MAX-BW-NEXT:    [[TMP12:%.*]] = add i64 [[INDEX]], 12
; MAX-BW-NEXT:    [[TMP13:%.*]] = add i64 [[INDEX]], 13
; MAX-BW-NEXT:    [[TMP14:%.*]] = add i64 [[INDEX]], 14
; MAX-BW-NEXT:    [[TMP15:%.*]] = add i64 [[INDEX]], 15
; MAX-BW-NEXT:    [[TMP16:%.*]] = add i64 [[INDEX]], 16
; MAX-BW-NEXT:    [[TMP17:%.*]] = add i64 [[INDEX]], 17
; MAX-BW-NEXT:    [[TMP18:%.*]] = add i64 [[INDEX]], 18
; MAX-BW-NEXT:    [[TMP19:%.*]] = add i64 [[INDEX]], 19
; MAX-BW-NEXT:    [[TMP20:%.*]] = add i64 [[INDEX]], 20
; MAX-BW-NEXT:    [[TMP21:%.*]] = add i64 [[INDEX]], 21
; MAX-BW-NEXT:    [[TMP22:%.*]] = add i64 [[INDEX]], 22
; MAX-BW-NEXT:    [[TMP23:%.*]] = add i64 [[INDEX]], 23
; MAX-BW-NEXT:    [[TMP24:%.*]] = add i64 [[INDEX]], 24
; MAX-BW-NEXT:    [[TMP25:%.*]] = add i64 [[INDEX]], 25
; MAX-BW-NEXT:    [[TMP26:%.*]] = add i64 [[INDEX]], 26
; MAX-BW-NEXT:    [[TMP27:%.*]] = add i64 [[INDEX]], 27
; MAX-BW-NEXT:    [[TMP28:%.*]] = add i64 [[INDEX]], 28
; MAX-BW-NEXT:    [[TMP29:%.*]] = add i64 [[INDEX]], 29
; MAX-BW-NEXT:    [[TMP30:%.*]] = add i64 [[INDEX]], 30
; MAX-BW-NEXT:    [[TMP31:%.*]] = add i64 [[INDEX]], 31
; MAX-BW-NEXT:    [[TMP32:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA:%.*]], i64 [[IDXPROM]], i64 [[TMP0]]
; MAX-BW-NEXT:    [[TMP36:%.*]] = getelementptr inbounds i32, ptr [[TMP32]], i32 0
; MAX-BW-NEXT:    [[TMP37:%.*]] = getelementptr inbounds i32, ptr [[TMP32]], i32 8
; MAX-BW-NEXT:    [[TMP38:%.*]] = getelementptr inbounds i32, ptr [[TMP32]], i32 16
; MAX-BW-NEXT:    [[TMP39:%.*]] = getelementptr inbounds i32, ptr [[TMP32]], i32 24
; MAX-BW-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i32>, ptr [[TMP36]], align 4, !tbaa [[TBAA1:![0-9]+]]
; MAX-BW-NEXT:    [[WIDE_LOAD4:%.*]] = load <8 x i32>, ptr [[TMP37]], align 4, !tbaa [[TBAA1]]
; MAX-BW-NEXT:    [[WIDE_LOAD5:%.*]] = load <8 x i32>, ptr [[TMP38]], align 4, !tbaa [[TBAA1]]
; MAX-BW-NEXT:    [[WIDE_LOAD6:%.*]] = load <8 x i32>, ptr [[TMP39]], align 4, !tbaa [[TBAA1]]
; MAX-BW-NEXT:    [[TMP40:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP0]], i64 [[IDXPROM5]]
; MAX-BW-NEXT:    [[TMP41:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP1]], i64 [[IDXPROM5]]
; MAX-BW-NEXT:    [[TMP42:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP2]], i64 [[IDXPROM5]]
; MAX-BW-NEXT:    [[TMP43:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP3]], i64 [[IDXPROM5]]
; MAX-BW-NEXT:    [[TMP44:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP4]], i64 [[IDXPROM5]]
; MAX-BW-NEXT:    [[TMP45:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP5]], i64 [[IDXPROM5]]
; MAX-BW-NEXT:    [[TMP46:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP6]], i64 [[IDXPROM5]]
; MAX-BW-NEXT:    [[TMP47:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP7]], i64 [[IDXPROM5]]
; MAX-BW-NEXT:    [[TMP48:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP8]], i64 [[IDXPROM5]]
; MAX-BW-NEXT:    [[TMP49:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP9]], i64 [[IDXPROM5]]
; MAX-BW-NEXT:    [[TMP50:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP10]], i64 [[IDXPROM5]]
; MAX-BW-NEXT:    [[TMP51:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP11]], i64 [[IDXPROM5]]
; MAX-BW-NEXT:    [[TMP52:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP12]], i64 [[IDXPROM5]]
; MAX-BW-NEXT:    [[TMP53:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP13]], i64 [[IDXPROM5]]
; MAX-BW-NEXT:    [[TMP54:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP14]], i64 [[IDXPROM5]]
; MAX-BW-NEXT:    [[TMP55:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP15]], i64 [[IDXPROM5]]
; MAX-BW-NEXT:    [[TMP56:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP16]], i64 [[IDXPROM5]]
; MAX-BW-NEXT:    [[TMP57:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP17]], i64 [[IDXPROM5]]
; MAX-BW-NEXT:    [[TMP58:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP18]], i64 [[IDXPROM5]]
; MAX-BW-NEXT:    [[TMP59:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP19]], i64 [[IDXPROM5]]
; MAX-BW-NEXT:    [[TMP60:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP20]], i64 [[IDXPROM5]]
; MAX-BW-NEXT:    [[TMP61:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP21]], i64 [[IDXPROM5]]
; MAX-BW-NEXT:    [[TMP62:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP22]], i64 [[IDXPROM5]]
; MAX-BW-NEXT:    [[TMP63:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP23]], i64 [[IDXPROM5]]
; MAX-BW-NEXT:    [[TMP64:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP24]], i64 [[IDXPROM5]]
; MAX-BW-NEXT:    [[TMP65:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP25]], i64 [[IDXPROM5]]
; MAX-BW-NEXT:    [[TMP66:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP26]], i64 [[IDXPROM5]]
; MAX-BW-NEXT:    [[TMP67:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP27]], i64 [[IDXPROM5]]
; MAX-BW-NEXT:    [[TMP68:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP28]], i64 [[IDXPROM5]]
; MAX-BW-NEXT:    [[TMP69:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP29]], i64 [[IDXPROM5]]
; MAX-BW-NEXT:    [[TMP70:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP30]], i64 [[IDXPROM5]]
; MAX-BW-NEXT:    [[TMP71:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[TMP31]], i64 [[IDXPROM5]]
; MAX-BW-NEXT:    [[TMP72:%.*]] = load i32, ptr [[TMP40]], align 4, !tbaa [[TBAA1]]
; MAX-BW-NEXT:    [[TMP73:%.*]] = load i32, ptr [[TMP41]], align 4, !tbaa [[TBAA1]]
; MAX-BW-NEXT:    [[TMP74:%.*]] = load i32, ptr [[TMP42]], align 4, !tbaa [[TBAA1]]
; MAX-BW-NEXT:    [[TMP75:%.*]] = load i32, ptr [[TMP43]], align 4, !tbaa [[TBAA1]]
; MAX-BW-NEXT:    [[TMP76:%.*]] = load i32, ptr [[TMP44]], align 4, !tbaa [[TBAA1]]
; MAX-BW-NEXT:    [[TMP77:%.*]] = load i32, ptr [[TMP45]], align 4, !tbaa [[TBAA1]]
; MAX-BW-NEXT:    [[TMP78:%.*]] = load i32, ptr [[TMP46]], align 4, !tbaa [[TBAA1]]
; MAX-BW-NEXT:    [[TMP79:%.*]] = load i32, ptr [[TMP47]], align 4, !tbaa [[TBAA1]]
; MAX-BW-NEXT:    [[TMP80:%.*]] = insertelement <8 x i32> poison, i32 [[TMP72]], i32 0
; MAX-BW-NEXT:    [[TMP81:%.*]] = insertelement <8 x i32> [[TMP80]], i32 [[TMP73]], i32 1
; MAX-BW-NEXT:    [[TMP82:%.*]] = insertelement <8 x i32> [[TMP81]], i32 [[TMP74]], i32 2
; MAX-BW-NEXT:    [[TMP83:%.*]] = insertelement <8 x i32> [[TMP82]], i32 [[TMP75]], i32 3
; MAX-BW-NEXT:    [[TMP84:%.*]] = insertelement <8 x i32> [[TMP83]], i32 [[TMP76]], i32 4
; MAX-BW-NEXT:    [[TMP85:%.*]] = insertelement <8 x i32> [[TMP84]], i32 [[TMP77]], i32 5
; MAX-BW-NEXT:    [[TMP86:%.*]] = insertelement <8 x i32> [[TMP85]], i32 [[TMP78]], i32 6
; MAX-BW-NEXT:    [[TMP87:%.*]] = insertelement <8 x i32> [[TMP86]], i32 [[TMP79]], i32 7
; MAX-BW-NEXT:    [[TMP88:%.*]] = load i32, ptr [[TMP48]], align 4, !tbaa [[TBAA1]]
; MAX-BW-NEXT:    [[TMP89:%.*]] = load i32, ptr [[TMP49]], align 4, !tbaa [[TBAA1]]
; MAX-BW-NEXT:    [[TMP90:%.*]] = load i32, ptr [[TMP50]], align 4, !tbaa [[TBAA1]]
; MAX-BW-NEXT:    [[TMP91:%.*]] = load i32, ptr [[TMP51]], align 4, !tbaa [[TBAA1]]
; MAX-BW-NEXT:    [[TMP92:%.*]] = load i32, ptr [[TMP52]], align 4, !tbaa [[TBAA1]]
; MAX-BW-NEXT:    [[TMP93:%.*]] = load i32, ptr [[TMP53]], align 4, !tbaa [[TBAA1]]
; MAX-BW-NEXT:    [[TMP94:%.*]] = load i32, ptr [[TMP54]], align 4, !tbaa [[TBAA1]]
; MAX-BW-NEXT:    [[TMP95:%.*]] = load i32, ptr [[TMP55]], align 4, !tbaa [[TBAA1]]
; MAX-BW-NEXT:    [[TMP96:%.*]] = insertelement <8 x i32> poison, i32 [[TMP88]], i32 0
; MAX-BW-NEXT:    [[TMP97:%.*]] = insertelement <8 x i32> [[TMP96]], i32 [[TMP89]], i32 1
; MAX-BW-NEXT:    [[TMP98:%.*]] = insertelement <8 x i32> [[TMP97]], i32 [[TMP90]], i32 2
; MAX-BW-NEXT:    [[TMP99:%.*]] = insertelement <8 x i32> [[TMP98]], i32 [[TMP91]], i32 3
; MAX-BW-NEXT:    [[TMP100:%.*]] = insertelement <8 x i32> [[TMP99]], i32 [[TMP92]], i32 4
; MAX-BW-NEXT:    [[TMP101:%.*]] = insertelement <8 x i32> [[TMP100]], i32 [[TMP93]], i32 5
; MAX-BW-NEXT:    [[TMP102:%.*]] = insertelement <8 x i32> [[TMP101]], i32 [[TMP94]], i32 6
; MAX-BW-NEXT:    [[TMP103:%.*]] = insertelement <8 x i32> [[TMP102]], i32 [[TMP95]], i32 7
; MAX-BW-NEXT:    [[TMP104:%.*]] = load i32, ptr [[TMP56]], align 4, !tbaa [[TBAA1]]
; MAX-BW-NEXT:    [[TMP105:%.*]] = load i32, ptr [[TMP57]], align 4, !tbaa [[TBAA1]]
; MAX-BW-NEXT:    [[TMP106:%.*]] = load i32, ptr [[TMP58]], align 4, !tbaa [[TBAA1]]
; MAX-BW-NEXT:    [[TMP107:%.*]] = load i32, ptr [[TMP59]], align 4, !tbaa [[TBAA1]]
; MAX-BW-NEXT:    [[TMP108:%.*]] = load i32, ptr [[TMP60]], align 4, !tbaa [[TBAA1]]
; MAX-BW-NEXT:    [[TMP109:%.*]] = load i32, ptr [[TMP61]], align 4, !tbaa [[TBAA1]]
; MAX-BW-NEXT:    [[TMP110:%.*]] = load i32, ptr [[TMP62]], align 4, !tbaa [[TBAA1]]
; MAX-BW-NEXT:    [[TMP111:%.*]] = load i32, ptr [[TMP63]], align 4, !tbaa [[TBAA1]]
; MAX-BW-NEXT:    [[TMP112:%.*]] = insertelement <8 x i32> poison, i32 [[TMP104]], i32 0
; MAX-BW-NEXT:    [[TMP113:%.*]] = insertelement <8 x i32> [[TMP112]], i32 [[TMP105]], i32 1
; MAX-BW-NEXT:    [[TMP114:%.*]] = insertelement <8 x i32> [[TMP113]], i32 [[TMP106]], i32 2
; MAX-BW-NEXT:    [[TMP115:%.*]] = insertelement <8 x i32> [[TMP114]], i32 [[TMP107]], i32 3
; MAX-BW-NEXT:    [[TMP116:%.*]] = insertelement <8 x i32> [[TMP115]], i32 [[TMP108]], i32 4
; MAX-BW-NEXT:    [[TMP117:%.*]] = insertelement <8 x i32> [[TMP116]], i32 [[TMP109]], i32 5
; MAX-BW-NEXT:    [[TMP118:%.*]] = insertelement <8 x i32> [[TMP117]], i32 [[TMP110]], i32 6
; MAX-BW-NEXT:    [[TMP119:%.*]] = insertelement <8 x i32> [[TMP118]], i32 [[TMP111]], i32 7
; MAX-BW-NEXT:    [[TMP120:%.*]] = load i32, ptr [[TMP64]], align 4, !tbaa [[TBAA1]]
; MAX-BW-NEXT:    [[TMP121:%.*]] = load i32, ptr [[TMP65]], align 4, !tbaa [[TBAA1]]
; MAX-BW-NEXT:    [[TMP122:%.*]] = load i32, ptr [[TMP66]], align 4, !tbaa [[TBAA1]]
; MAX-BW-NEXT:    [[TMP123:%.*]] = load i32, ptr [[TMP67]], align 4, !tbaa [[TBAA1]]
; MAX-BW-NEXT:    [[TMP124:%.*]] = load i32, ptr [[TMP68]], align 4, !tbaa [[TBAA1]]
; MAX-BW-NEXT:    [[TMP125:%.*]] = load i32, ptr [[TMP69]], align 4, !tbaa [[TBAA1]]
; MAX-BW-NEXT:    [[TMP126:%.*]] = load i32, ptr [[TMP70]], align 4, !tbaa [[TBAA1]]
; MAX-BW-NEXT:    [[TMP127:%.*]] = load i32, ptr [[TMP71]], align 4, !tbaa [[TBAA1]]
; MAX-BW-NEXT:    [[TMP128:%.*]] = insertelement <8 x i32> poison, i32 [[TMP120]], i32 0
; MAX-BW-NEXT:    [[TMP129:%.*]] = insertelement <8 x i32> [[TMP128]], i32 [[TMP121]], i32 1
; MAX-BW-NEXT:    [[TMP130:%.*]] = insertelement <8 x i32> [[TMP129]], i32 [[TMP122]], i32 2
; MAX-BW-NEXT:    [[TMP131:%.*]] = insertelement <8 x i32> [[TMP130]], i32 [[TMP123]], i32 3
; MAX-BW-NEXT:    [[TMP132:%.*]] = insertelement <8 x i32> [[TMP131]], i32 [[TMP124]], i32 4
; MAX-BW-NEXT:    [[TMP133:%.*]] = insertelement <8 x i32> [[TMP132]], i32 [[TMP125]], i32 5
; MAX-BW-NEXT:    [[TMP134:%.*]] = insertelement <8 x i32> [[TMP133]], i32 [[TMP126]], i32 6
; MAX-BW-NEXT:    [[TMP135:%.*]] = insertelement <8 x i32> [[TMP134]], i32 [[TMP127]], i32 7
; MAX-BW-NEXT:    [[TMP136:%.*]] = mul nsw <8 x i32> [[TMP87]], [[WIDE_LOAD]]
; MAX-BW-NEXT:    [[TMP137:%.*]] = mul nsw <8 x i32> [[TMP103]], [[WIDE_LOAD4]]
; MAX-BW-NEXT:    [[TMP138:%.*]] = mul nsw <8 x i32> [[TMP119]], [[WIDE_LOAD5]]
; MAX-BW-NEXT:    [[TMP139:%.*]] = mul nsw <8 x i32> [[TMP135]], [[WIDE_LOAD6]]
; MAX-BW-NEXT:    [[TMP140:%.*]] = add <8 x i32> [[VEC_PHI]], <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
; MAX-BW-NEXT:    [[TMP141:%.*]] = add <8 x i32> [[VEC_PHI1]], <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
; MAX-BW-NEXT:    [[TMP142:%.*]] = add <8 x i32> [[VEC_PHI2]], <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
; MAX-BW-NEXT:    [[TMP143:%.*]] = add <8 x i32> [[VEC_PHI3]], <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
; MAX-BW-NEXT:    [[TMP144]] = add <8 x i32> [[TMP140]], [[TMP136]]
; MAX-BW-NEXT:    [[TMP145]] = add <8 x i32> [[TMP141]], [[TMP137]]
; MAX-BW-NEXT:    [[TMP146]] = add <8 x i32> [[TMP142]], [[TMP138]]
; MAX-BW-NEXT:    [[TMP147]] = add <8 x i32> [[TMP143]], [[TMP139]]
; MAX-BW-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
; MAX-BW-NEXT:    [[TMP148:%.*]] = icmp eq i64 [[INDEX_NEXT]], 96
; MAX-BW-NEXT:    br i1 [[TMP148]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
; MAX-BW:       middle.block:
; MAX-BW-NEXT:    [[BIN_RDX:%.*]] = add <8 x i32> [[TMP145]], [[TMP144]]
; MAX-BW-NEXT:    [[BIN_RDX7:%.*]] = add <8 x i32> [[TMP146]], [[BIN_RDX]]
; MAX-BW-NEXT:    [[BIN_RDX8:%.*]] = add <8 x i32> [[TMP147]], [[BIN_RDX7]]
; MAX-BW-NEXT:    [[TMP149:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[BIN_RDX8]])
; MAX-BW-NEXT:    br i1 false, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
; MAX-BW:       scalar.ph:
; MAX-BW-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 96, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; MAX-BW-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP149]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
; MAX-BW-NEXT:    br label [[FOR_BODY:%.*]]
; MAX-BW:       for.cond.cleanup:
; MAX-BW-NEXT:    [[ADD7_LCSSA:%.*]] = phi i32 [ [[ADD7:%.*]], [[FOR_BODY]] ], [ [[TMP149]], [[MIDDLE_BLOCK]] ]
; MAX-BW-NEXT:    ret i32 [[ADD7_LCSSA]]
; MAX-BW:       for.body:
; MAX-BW-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
; MAX-BW-NEXT:    [[SUM_015:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD7]], [[FOR_BODY]] ]
; MAX-BW-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[IDXPROM]], i64 [[INDVARS_IV]]
; MAX-BW-NEXT:    [[TMP150:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4, !tbaa [[TBAA1]]
; MAX-BW-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[INDVARS_IV]], i64 [[IDXPROM5]]
; MAX-BW-NEXT:    [[TMP151:%.*]] = load i32, ptr [[ARRAYIDX6]], align 4, !tbaa [[TBAA1]]
; MAX-BW-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP151]], [[TMP150]]
; MAX-BW-NEXT:    [[ADD:%.*]] = add i32 [[SUM_015]], 4
; MAX-BW-NEXT:    [[ADD7]] = add i32 [[ADD]], [[MUL]]
; MAX-BW-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; MAX-BW-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 100
; MAX-BW-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
;
entry:
  %idxprom = sext i32 %i to i64
  %idxprom5 = sext i32 %j to i64
  br label %for.body

  for.cond.cleanup:                                 ; preds = %for.body
  ret i32 %add7

  for.body:                                         ; preds = %for.body, %entry
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
  %sum.015 = phi i32 [ 0, %entry ], [ %add7, %for.body ]
  ; first consecutive load as vector load
  %arrayidx2 = getelementptr inbounds [100 x i32], ptr %data, i64 %idxprom, i64 %indvars.iv
  %0 = load i32, ptr %arrayidx2, align 4, !tbaa !1
  ; second strided load scalarized
  %arrayidx6 = getelementptr inbounds [100 x i32], ptr %data, i64 %indvars.iv, i64 %idxprom5
  %1 = load i32, ptr %arrayidx6, align 4, !tbaa !1
  %mul = mul nsw i32 %1, %0
  %add = add i32 %sum.015, 4
  %add7 = add i32 %add, %mul
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond = icmp eq i64 %indvars.iv.next, 100
  br i1 %exitcond, label %for.cond.cleanup, label %for.body
}

define void @test(ptr %A, ptr noalias %B) #0 {
; CHECK-LABEL: @test(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK:       vector.ph:
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 2
; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0
; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], 2
; CHECK-NEXT:    [[TMP2:%.*]] = add i64 [[OFFSET_IDX]], 4
; CHECK-NEXT:    [[TMP3:%.*]] = add i64 [[OFFSET_IDX]], 6
; CHECK-NEXT:    [[TMP4:%.*]] = add i64 [[OFFSET_IDX]], 8
; CHECK-NEXT:    [[TMP5:%.*]] = add i64 [[OFFSET_IDX]], 10
; CHECK-NEXT:    [[TMP6:%.*]] = add i64 [[OFFSET_IDX]], 12
; CHECK-NEXT:    [[TMP7:%.*]] = add i64 [[OFFSET_IDX]], 14
; CHECK-NEXT:    [[TMP8:%.*]] = add nuw nsw i64 [[TMP0]], 0
; CHECK-NEXT:    [[TMP9:%.*]] = add nuw nsw i64 [[TMP1]], 0
; CHECK-NEXT:    [[TMP10:%.*]] = add nuw nsw i64 [[TMP2]], 0
; CHECK-NEXT:    [[TMP11:%.*]] = add nuw nsw i64 [[TMP3]], 0
; CHECK-NEXT:    [[TMP12:%.*]] = add nuw nsw i64 [[TMP4]], 0
; CHECK-NEXT:    [[TMP13:%.*]] = add nuw nsw i64 [[TMP5]], 0
; CHECK-NEXT:    [[TMP14:%.*]] = add nuw nsw i64 [[TMP6]], 0
; CHECK-NEXT:    [[TMP15:%.*]] = add nuw nsw i64 [[TMP7]], 0
; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [1024 x i32], ptr [[A:%.*]], i64 0, i64 [[TMP8]]
; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i32 0
; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <16 x i32>, ptr [[TMP17]], align 4
; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
; CHECK-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
; CHECK-NEXT:    [[TMP18:%.*]] = add <8 x i32> [[STRIDED_VEC]], [[STRIDED_VEC1]]
; CHECK-NEXT:    [[TMP19:%.*]] = trunc <8 x i32> [[TMP18]] to <8 x i8>
; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B:%.*]], i64 0, i64 [[TMP8]]
; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP9]]
; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP10]]
; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP11]]
; CHECK-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP12]]
; CHECK-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP13]]
; CHECK-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP14]]
; CHECK-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP15]]
; CHECK-NEXT:    [[TMP28:%.*]] = extractelement <8 x i8> [[TMP19]], i32 0
; CHECK-NEXT:    store i8 [[TMP28]], ptr [[TMP20]], align 1
; CHECK-NEXT:    [[TMP29:%.*]] = extractelement <8 x i8> [[TMP19]], i32 1
; CHECK-NEXT:    store i8 [[TMP29]], ptr [[TMP21]], align 1
; CHECK-NEXT:    [[TMP30:%.*]] = extractelement <8 x i8> [[TMP19]], i32 2
; CHECK-NEXT:    store i8 [[TMP30]], ptr [[TMP22]], align 1
; CHECK-NEXT:    [[TMP31:%.*]] = extractelement <8 x i8> [[TMP19]], i32 3
; CHECK-NEXT:    store i8 [[TMP31]], ptr [[TMP23]], align 1
; CHECK-NEXT:    [[TMP32:%.*]] = extractelement <8 x i8> [[TMP19]], i32 4
; CHECK-NEXT:    store i8 [[TMP32]], ptr [[TMP24]], align 1
; CHECK-NEXT:    [[TMP33:%.*]] = extractelement <8 x i8> [[TMP19]], i32 5
; CHECK-NEXT:    store i8 [[TMP33]], ptr [[TMP25]], align 1
; CHECK-NEXT:    [[TMP34:%.*]] = extractelement <8 x i8> [[TMP19]], i32 6
; CHECK-NEXT:    store i8 [[TMP34]], ptr [[TMP26]], align 1
; CHECK-NEXT:    [[TMP35:%.*]] = extractelement <8 x i8> [[TMP19]], i32 7
; CHECK-NEXT:    store i8 [[TMP35]], ptr [[TMP27]], align 1
; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; CHECK-NEXT:    [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT]], 512
; CHECK-NEXT:    br i1 [[TMP36]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
; CHECK:       middle.block:
; CHECK-NEXT:    br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
; CHECK:       scalar.ph:
; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       for.body:
; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT:    [[IV_0:%.*]] = add nuw nsw i64 [[IV]], 0
; CHECK-NEXT:    [[IV_1:%.*]] = add nuw nsw i64 [[IV]], 1
; CHECK-NEXT:    [[IN0:%.*]] = getelementptr inbounds [1024 x i32], ptr [[A]], i64 0, i64 [[IV_0]]
; CHECK-NEXT:    [[IN1:%.*]] = getelementptr inbounds [1024 x i32], ptr [[A]], i64 0, i64 [[IV_1]]
; CHECK-NEXT:    [[V0:%.*]] = load i32, ptr [[IN0]], align 4
; CHECK-NEXT:    [[V1:%.*]] = load i32, ptr [[IN1]], align 4
; CHECK-NEXT:    [[REDUCE_ADD_0:%.*]] = add i32 [[V0]], [[V1]]
; CHECK-NEXT:    [[REDUCE_ADD_0_NARROW:%.*]] = trunc i32 [[REDUCE_ADD_0]] to i8
; CHECK-NEXT:    [[OUT:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[IV_0]]
; CHECK-NEXT:    store i8 [[REDUCE_ADD_0_NARROW]], ptr [[OUT]], align 1
; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV_0]], 2
; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i64 [[IV_NEXT]], 1024
; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP10:![0-9]+]]
; CHECK:       for.cond.cleanup:
; CHECK-NEXT:    ret void
;
; MAX-BW-LABEL: @test(
; MAX-BW-NEXT:  entry:
; MAX-BW-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; MAX-BW:       vector.ph:
; MAX-BW-NEXT:    br label [[VECTOR_BODY:%.*]]
; MAX-BW:       vector.body:
; MAX-BW-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; MAX-BW-NEXT:    [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 2
; MAX-BW-NEXT:    [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0
; MAX-BW-NEXT:    [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], 2
; MAX-BW-NEXT:    [[TMP2:%.*]] = add i64 [[OFFSET_IDX]], 4
; MAX-BW-NEXT:    [[TMP3:%.*]] = add i64 [[OFFSET_IDX]], 6
; MAX-BW-NEXT:    [[TMP4:%.*]] = add i64 [[OFFSET_IDX]], 8
; MAX-BW-NEXT:    [[TMP5:%.*]] = add i64 [[OFFSET_IDX]], 10
; MAX-BW-NEXT:    [[TMP6:%.*]] = add i64 [[OFFSET_IDX]], 12
; MAX-BW-NEXT:    [[TMP7:%.*]] = add i64 [[OFFSET_IDX]], 14
; MAX-BW-NEXT:    [[TMP8:%.*]] = add i64 [[OFFSET_IDX]], 16
; MAX-BW-NEXT:    [[TMP9:%.*]] = add i64 [[OFFSET_IDX]], 18
; MAX-BW-NEXT:    [[TMP10:%.*]] = add i64 [[OFFSET_IDX]], 20
; MAX-BW-NEXT:    [[TMP11:%.*]] = add i64 [[OFFSET_IDX]], 22
; MAX-BW-NEXT:    [[TMP12:%.*]] = add i64 [[OFFSET_IDX]], 24
; MAX-BW-NEXT:    [[TMP13:%.*]] = add i64 [[OFFSET_IDX]], 26
; MAX-BW-NEXT:    [[TMP14:%.*]] = add i64 [[OFFSET_IDX]], 28
; MAX-BW-NEXT:    [[TMP15:%.*]] = add i64 [[OFFSET_IDX]], 30
; MAX-BW-NEXT:    [[TMP16:%.*]] = add nuw nsw i64 [[TMP0]], 0
; MAX-BW-NEXT:    [[TMP17:%.*]] = add nuw nsw i64 [[TMP1]], 0
; MAX-BW-NEXT:    [[TMP18:%.*]] = add nuw nsw i64 [[TMP2]], 0
; MAX-BW-NEXT:    [[TMP19:%.*]] = add nuw nsw i64 [[TMP3]], 0
; MAX-BW-NEXT:    [[TMP20:%.*]] = add nuw nsw i64 [[TMP4]], 0
; MAX-BW-NEXT:    [[TMP21:%.*]] = add nuw nsw i64 [[TMP5]], 0
; MAX-BW-NEXT:    [[TMP22:%.*]] = add nuw nsw i64 [[TMP6]], 0
; MAX-BW-NEXT:    [[TMP23:%.*]] = add nuw nsw i64 [[TMP7]], 0
; MAX-BW-NEXT:    [[TMP24:%.*]] = add nuw nsw i64 [[TMP8]], 0
; MAX-BW-NEXT:    [[TMP25:%.*]] = add nuw nsw i64 [[TMP9]], 0
; MAX-BW-NEXT:    [[TMP26:%.*]] = add nuw nsw i64 [[TMP10]], 0
; MAX-BW-NEXT:    [[TMP27:%.*]] = add nuw nsw i64 [[TMP11]], 0
; MAX-BW-NEXT:    [[TMP28:%.*]] = add nuw nsw i64 [[TMP12]], 0
; MAX-BW-NEXT:    [[TMP29:%.*]] = add nuw nsw i64 [[TMP13]], 0
; MAX-BW-NEXT:    [[TMP30:%.*]] = add nuw nsw i64 [[TMP14]], 0
; MAX-BW-NEXT:    [[TMP31:%.*]] = add nuw nsw i64 [[TMP15]], 0
; MAX-BW-NEXT:    [[TMP32:%.*]] = getelementptr inbounds [1024 x i32], ptr [[A:%.*]], i64 0, i64 [[TMP16]]
; MAX-BW-NEXT:    [[TMP33:%.*]] = getelementptr inbounds i32, ptr [[TMP32]], i32 0
; MAX-BW-NEXT:    [[WIDE_VEC:%.*]] = load <32 x i32>, ptr [[TMP33]], align 4
; MAX-BW-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <32 x i32> [[WIDE_VEC]], <32 x i32> poison, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
; MAX-BW-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <32 x i32> [[WIDE_VEC]], <32 x i32> poison, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
; MAX-BW-NEXT:    [[TMP34:%.*]] = add <16 x i32> [[STRIDED_VEC]], [[STRIDED_VEC1]]
; MAX-BW-NEXT:    [[TMP35:%.*]] = trunc <16 x i32> [[TMP34]] to <16 x i8>
; MAX-BW-NEXT:    [[TMP36:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B:%.*]], i64 0, i64 [[TMP16]]
; MAX-BW-NEXT:    [[TMP37:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP17]]
; MAX-BW-NEXT:    [[TMP38:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP18]]
; MAX-BW-NEXT:    [[TMP39:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP19]]
; MAX-BW-NEXT:    [[TMP40:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP20]]
; MAX-BW-NEXT:    [[TMP41:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP21]]
; MAX-BW-NEXT:    [[TMP42:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP22]]
; MAX-BW-NEXT:    [[TMP43:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP23]]
; MAX-BW-NEXT:    [[TMP44:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP24]]
; MAX-BW-NEXT:    [[TMP45:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP25]]
; MAX-BW-NEXT:    [[TMP46:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP26]]
; MAX-BW-NEXT:    [[TMP47:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP27]]
; MAX-BW-NEXT:    [[TMP48:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP28]]
; MAX-BW-NEXT:    [[TMP49:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP29]]
; MAX-BW-NEXT:    [[TMP50:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP30]]
; MAX-BW-NEXT:    [[TMP51:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP31]]
; MAX-BW-NEXT:    [[TMP52:%.*]] = extractelement <16 x i8> [[TMP35]], i32 0
; MAX-BW-NEXT:    store i8 [[TMP52]], ptr [[TMP36]], align 1
; MAX-BW-NEXT:    [[TMP53:%.*]] = extractelement <16 x i8> [[TMP35]], i32 1
; MAX-BW-NEXT:    store i8 [[TMP53]], ptr [[TMP37]], align 1
; MAX-BW-NEXT:    [[TMP54:%.*]] = extractelement <16 x i8> [[TMP35]], i32 2
; MAX-BW-NEXT:    store i8 [[TMP54]], ptr [[TMP38]], align 1
; MAX-BW-NEXT:    [[TMP55:%.*]] = extractelement <16 x i8> [[TMP35]], i32 3
; MAX-BW-NEXT:    store i8 [[TMP55]], ptr [[TMP39]], align 1
; MAX-BW-NEXT:    [[TMP56:%.*]] = extractelement <16 x i8> [[TMP35]], i32 4
; MAX-BW-NEXT:    store i8 [[TMP56]], ptr [[TMP40]], align 1
; MAX-BW-NEXT:    [[TMP57:%.*]] = extractelement <16 x i8> [[TMP35]], i32 5
; MAX-BW-NEXT:    store i8 [[TMP57]], ptr [[TMP41]], align 1
; MAX-BW-NEXT:    [[TMP58:%.*]] = extractelement <16 x i8> [[TMP35]], i32 6
; MAX-BW-NEXT:    store i8 [[TMP58]], ptr [[TMP42]], align 1
; MAX-BW-NEXT:    [[TMP59:%.*]] = extractelement <16 x i8> [[TMP35]], i32 7
; MAX-BW-NEXT:    store i8 [[TMP59]], ptr [[TMP43]], align 1
; MAX-BW-NEXT:    [[TMP60:%.*]] = extractelement <16 x i8> [[TMP35]], i32 8
; MAX-BW-NEXT:    store i8 [[TMP60]], ptr [[TMP44]], align 1
; MAX-BW-NEXT:    [[TMP61:%.*]] = extractelement <16 x i8> [[TMP35]], i32 9
; MAX-BW-NEXT:    store i8 [[TMP61]], ptr [[TMP45]], align 1
; MAX-BW-NEXT:    [[TMP62:%.*]] = extractelement <16 x i8> [[TMP35]], i32 10
; MAX-BW-NEXT:    store i8 [[TMP62]], ptr [[TMP46]], align 1
; MAX-BW-NEXT:    [[TMP63:%.*]] = extractelement <16 x i8> [[TMP35]], i32 11
; MAX-BW-NEXT:    store i8 [[TMP63]], ptr [[TMP47]], align 1
; MAX-BW-NEXT:    [[TMP64:%.*]] = extractelement <16 x i8> [[TMP35]], i32 12
; MAX-BW-NEXT:    store i8 [[TMP64]], ptr [[TMP48]], align 1
; MAX-BW-NEXT:    [[TMP65:%.*]] = extractelement <16 x i8> [[TMP35]], i32 13
; MAX-BW-NEXT:    store i8 [[TMP65]], ptr [[TMP49]], align 1
; MAX-BW-NEXT:    [[TMP66:%.*]] = extractelement <16 x i8> [[TMP35]], i32 14
; MAX-BW-NEXT:    store i8 [[TMP66]], ptr [[TMP50]], align 1
; MAX-BW-NEXT:    [[TMP67:%.*]] = extractelement <16 x i8> [[TMP35]], i32 15
; MAX-BW-NEXT:    store i8 [[TMP67]], ptr [[TMP51]], align 1
; MAX-BW-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; MAX-BW-NEXT:    [[TMP68:%.*]] = icmp eq i64 [[INDEX_NEXT]], 512
; MAX-BW-NEXT:    br i1 [[TMP68]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
; MAX-BW:       middle.block:
; MAX-BW-NEXT:    br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
; MAX-BW:       scalar.ph:
; MAX-BW-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; MAX-BW-NEXT:    br label [[FOR_BODY:%.*]]
; MAX-BW:       for.body:
; MAX-BW-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
; MAX-BW-NEXT:    [[IV_0:%.*]] = add nuw nsw i64 [[IV]], 0
; MAX-BW-NEXT:    [[IV_1:%.*]] = add nuw nsw i64 [[IV]], 1
; MAX-BW-NEXT:    [[IN0:%.*]] = getelementptr inbounds [1024 x i32], ptr [[A]], i64 0, i64 [[IV_0]]
; MAX-BW-NEXT:    [[IN1:%.*]] = getelementptr inbounds [1024 x i32], ptr [[A]], i64 0, i64 [[IV_1]]
; MAX-BW-NEXT:    [[V0:%.*]] = load i32, ptr [[IN0]], align 4
; MAX-BW-NEXT:    [[V1:%.*]] = load i32, ptr [[IN1]], align 4
; MAX-BW-NEXT:    [[REDUCE_ADD_0:%.*]] = add i32 [[V0]], [[V1]]
; MAX-BW-NEXT:    [[REDUCE_ADD_0_NARROW:%.*]] = trunc i32 [[REDUCE_ADD_0]] to i8
; MAX-BW-NEXT:    [[OUT:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[IV_0]]
; MAX-BW-NEXT:    store i8 [[REDUCE_ADD_0_NARROW]], ptr [[OUT]], align 1
; MAX-BW-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV_0]], 2
; MAX-BW-NEXT:    [[CMP:%.*]] = icmp ult i64 [[IV_NEXT]], 1024
; MAX-BW-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP10:![0-9]+]]
; MAX-BW:       for.cond.cleanup:
; MAX-BW-NEXT:    ret void
;
entry:
  br label %for.body

for.body:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]

  %iv.0 = add nuw nsw i64 %iv, 0
  %iv.1 = add nuw nsw i64 %iv, 1

  %in0 = getelementptr inbounds [1024 x i32], ptr %A, i64 0, i64 %iv.0
  %in1 = getelementptr inbounds [1024 x i32], ptr %A, i64 0, i64 %iv.1

  %v0 = load i32, ptr %in0
  %v1 = load i32, ptr %in1

  %reduce.add.0 = add i32 %v0, %v1

  %reduce.add.0.narrow = trunc i32 %reduce.add.0 to i8

  %out = getelementptr inbounds [1024 x i8], ptr %B, i64 0, i64 %iv.0
  store i8 %reduce.add.0.narrow, ptr %out

  %iv.next = add nuw nsw i64 %iv.0, 2
  %cmp = icmp ult i64 %iv.next, 1024
  br i1 %cmp, label %for.body, label %for.cond.cleanup

for.cond.cleanup:
  ret void
}

attributes #0 = { "target-cpu"="core-avx2" "target-features"="+avx,+avx2,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3" }

!llvm.ident = !{!0}

!0 = !{!"clang version 4.0.0 (cfe/trunk 284570)"}
!1 = !{!2, !2, i64 0}
!2 = !{!"int", !3, i64 0}
!3 = !{!"omnipotent char", !4, i64 0}
!4 = !{!"Simple C/C++ TBAA"}