llvm/test/Transforms/VectorCombine/AArch64/shrink-types.ll

; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -passes=vector-combine -S %s | FileCheck %s

target triple = "aarch64"
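
; These tests exercise VectorCombine's type-shrinking fold: when one operand
; of a vector binary op is a zext from a narrower element type and the other
; operand fits (or is known to fit) in that narrower type, the op is performed
; at the narrow type and a single zext of the result replaces the original
; extend.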

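; For 'and', the high bits of the zext'd operand are zero, so the result's
; high bits are zero regardless of %a: truncating %a and performing the 'and'
; at i8 is always lossless, and one zext of the result feeds the reduction.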
define i32 @test_and(<16 x i32> %a, ptr %b) {
; CHECK-LABEL: @test_and(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[B:%.*]], align 1
; CHECK-NEXT:    [[TMP0:%.*]] = trunc <16 x i32> [[A:%.*]] to <16 x i8>
; CHECK-NEXT:    [[TMP1:%.*]] = and <16 x i8> [[WIDE_LOAD]], [[TMP0]]
; CHECK-NEXT:    [[TMP2:%.*]] = zext <16 x i8> [[TMP1]] to <16 x i32>
; CHECK-NEXT:    [[TMP3:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP2]])
; CHECK-NEXT:    ret i32 [[TMP3]]
;
entry:
  %wide.load = load <16 x i8>, ptr %b, align 1
  %0 = zext <16 x i8> %wide.load to <16 x i32>
  %1 = and <16 x i32> %0, %a
  %2 = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %1)
  ret i32 %2
}

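; Unlike 'and', 'or' can propagate the wide operand's high bits into the
; result, so the op can only be narrowed when those bits are known zero. Here
; %a is pre-masked with 16, which fits in i8, so the fold still fires.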
define i32 @test_mask_or(<16 x i32> %a, ptr %b) {
; CHECK-LABEL: @test_mask_or(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[B:%.*]], align 1
; CHECK-NEXT:    [[A_MASKED:%.*]] = and <16 x i32> [[A:%.*]], <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
; CHECK-NEXT:    [[TMP0:%.*]] = trunc <16 x i32> [[A_MASKED]] to <16 x i8>
; CHECK-NEXT:    [[TMP1:%.*]] = or <16 x i8> [[WIDE_LOAD]], [[TMP0]]
; CHECK-NEXT:    [[TMP2:%.*]] = zext <16 x i8> [[TMP1]] to <16 x i32>
; CHECK-NEXT:    [[TMP3:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP2]])
; CHECK-NEXT:    ret i32 [[TMP3]]
;
entry:
  %wide.load = load <16 x i8>, ptr %b, align 1
  %a.masked = and <16 x i32> %a, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %0 = zext <16 x i8> %wide.load to <16 x i32>
  %1 = or <16 x i32> %0, %a.masked
  %2 = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %1)
  ret i32 %2
}

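; The zext'd load feeds two chains (an lshr and an and-with-15). Each 'or' is
; narrowed independently: the lshr and the mask move to i8, each masked
; argument gets its own trunc and each result its own zext, so the original
; zext of the load disappears while the 'add nuw nsw' stays at i32.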
define i32 @multiuse(<16 x i32> %u, <16 x i32> %v, ptr %b) {
; CHECK-LABEL: @multiuse(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[U_MASKED:%.*]] = and <16 x i32> [[U:%.*]], <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
; CHECK-NEXT:    [[V_MASKED:%.*]] = and <16 x i32> [[V:%.*]], <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[B:%.*]], align 1
; CHECK-NEXT:    [[TMP0:%.*]] = lshr <16 x i8> [[WIDE_LOAD]], <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
; CHECK-NEXT:    [[TMP1:%.*]] = trunc <16 x i32> [[V_MASKED]] to <16 x i8>
; CHECK-NEXT:    [[TMP2:%.*]] = or <16 x i8> [[TMP0]], [[TMP1]]
; CHECK-NEXT:    [[TMP3:%.*]] = zext <16 x i8> [[TMP2]] to <16 x i32>
; CHECK-NEXT:    [[TMP4:%.*]] = and <16 x i8> [[WIDE_LOAD]], <i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15>
; CHECK-NEXT:    [[TMP5:%.*]] = trunc <16 x i32> [[U_MASKED]] to <16 x i8>
; CHECK-NEXT:    [[TMP6:%.*]] = or <16 x i8> [[TMP4]], [[TMP5]]
; CHECK-NEXT:    [[TMP7:%.*]] = zext <16 x i8> [[TMP6]] to <16 x i32>
; CHECK-NEXT:    [[TMP8:%.*]] = add nuw nsw <16 x i32> [[TMP3]], [[TMP7]]
; CHECK-NEXT:    [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP8]])
; CHECK-NEXT:    ret i32 [[TMP9]]
;
entry:
  %u.masked = and <16 x i32> %u, <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
  %v.masked = and <16 x i32> %v, <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
  %wide.load = load <16 x i8>, ptr %b, align 1
  %0 = zext <16 x i8> %wide.load to <16 x i32>
  %1 = lshr <16 x i32> %0, <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
  %2 = or <16 x i32> %1, %v.masked
  %3 = and <16 x i32> %0, <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
  %4 = or <16 x i32> %3, %u.masked
  %5 = add nuw nsw <16 x i32> %2, %4
  %6 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %5)
  ret i32 %6
}

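; The zext's source arrives through a phi. The checks pin the fold to the
; correct values: the narrowed 'and' must use the i8 phi of the load directly
; and truncate the other phi (%a.phi), guarding against a previously-fixed
; bug (hence the name) in how the operands were picked.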
define i32 @phi_bug(<16 x i32> %a, ptr %b) {
; CHECK-LABEL: @phi_bug(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[B:%.*]], align 1
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[A_PHI:%.*]] = phi <16 x i32> [ [[A:%.*]], [[ENTRY:%.*]] ]
; CHECK-NEXT:    [[WIDE_LOAD_PHI:%.*]] = phi <16 x i8> [ [[WIDE_LOAD]], [[ENTRY]] ]
; CHECK-NEXT:    [[TMP0:%.*]] = trunc <16 x i32> [[A_PHI]] to <16 x i8>
; CHECK-NEXT:    [[TMP1:%.*]] = and <16 x i8> [[WIDE_LOAD_PHI]], [[TMP0]]
; CHECK-NEXT:    [[TMP2:%.*]] = zext <16 x i8> [[TMP1]] to <16 x i32>
; CHECK-NEXT:    [[TMP3:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP2]])
; CHECK-NEXT:    ret i32 [[TMP3]]
;
entry:
  %wide.load = load <16 x i8>, ptr %b, align 1
  br label %vector.body

vector.body:
  %a.phi = phi <16 x i32> [ %a, %entry ]
  %wide.load.phi = phi <16 x i8> [ %wide.load, %entry ]
  %0 = zext <16 x i8> %wide.load.phi to <16 x i32>
  %1 = and <16 x i32> %0, %a.phi
  %2 = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %1)
  ret i32 %2
}

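; Negative test for issue #108698 (the function name records the issue
; number): the zext source is <2 x i1>, and performing the lshr at i1 would
; change semantics, since any shift amount of 1 or more is out of range for a
; 1-bit type. The IR must be left untouched.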
define <2 x i32> @pr108698(<2 x i64> %x, <2 x i32> %y) {
; CHECK-LABEL: @pr108698(
; CHECK-NEXT:    [[CMP:%.*]] = icmp eq <2 x i64> [[X:%.*]], zeroinitializer
; CHECK-NEXT:    [[EXT:%.*]] = zext <2 x i1> [[CMP]] to <2 x i32>
; CHECK-NEXT:    [[LSHR:%.*]] = lshr <2 x i32> [[EXT]], [[Y:%.*]]
; CHECK-NEXT:    ret <2 x i32> [[LSHR]]
;
  %cmp = icmp eq <2 x i64> %x, zeroinitializer
  %ext = zext <2 x i1> %cmp to <2 x i32>
  %lshr = lshr <2 x i32> %ext, %y
  ret <2 x i32> %lshr
}

declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>)