llvm/test/CodeGen/AArch64/sve-ptest-removal-cmple.ll

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve < %s | FileCheck %s

target triple = "aarch64-unknown-linux-gnu"

;
; Immediate Compares
;

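; Note the IR computes CMPGE(PG, 0, A), i.e. 0 >= A, which is A <= 0; this
; is expected to select the immediate form of the compare (cmple ..., #0).
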
define i32 @cmple_imm_nxv16i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %a) {
; CHECK-LABEL: cmple_imm_nxv16i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    cmple p0.b, p0/z, z0.b, #0
; CHECK-NEXT:    cset w0, ne
; CHECK-NEXT:    ret
  %1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.cmpge.nxv16i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> zeroinitializer, <vscale x 16 x i8> %a)
  %2 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
  %3 = tail call i1 @llvm.aarch64.sve.ptest.any.nxv16i1(<vscale x 16 x i1> %2, <vscale x 16 x i1> %1)
  %conv = zext i1 %3 to i32
  ret i32 %conv
}

;
; Wide Compares
;

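; The wide forms compare each element of the first operand against the
; 64-bit element of the second operand that overlaps it, so a single .d
; vector serves as the right-hand side for any narrower element size.
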
define i32 @cmple_wide_nxv16i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %a, <vscale x 2 x i64> %b) {
; CHECK-LABEL: cmple_wide_nxv16i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    cmple p0.b, p0/z, z0.b, z1.d
; CHECK-NEXT:    cset w0, ne
; CHECK-NEXT:    ret
  %1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.cmple.wide.nxv16i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %a, <vscale x 2 x i64> %b)
  %2 = tail call i1 @llvm.aarch64.sve.ptest.any.nxv16i1(<vscale x 16 x i1> %pg, <vscale x 16 x i1> %1)
  %conv = zext i1 %2 to i32
  ret i32 %conv
}

define i32 @cmple_wide_nxv8i16(<vscale x 16 x i1> %pg, <vscale x 8 x i16> %a, <vscale x 2 x i64> %b) {
; CHECK-LABEL: cmple_wide_nxv8i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    cmple p0.h, p0/z, z0.h, z1.d
; CHECK-NEXT:    cset w0, ne
; CHECK-NEXT:    ret
  %1 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %pg)
  %2 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.cmple.wide.nxv8i16(<vscale x 8 x i1> %1, <vscale x 8 x i16> %a, <vscale x 2 x i64> %b)
  %3 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1> %2)
  %4 = tail call i1 @llvm.aarch64.sve.ptest.any.nxv16i1(<vscale x 16 x i1> %pg, <vscale x 16 x i1> %3)
  %conv = zext i1 %4 to i32
  ret i32 %conv
}

define i32 @cmple_wide_nxv4i32(<vscale x 16 x i1> %pg, <vscale x 4 x i32> %a, <vscale x 2 x i64> %b) {
; CHECK-LABEL: cmple_wide_nxv4i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    cmple p0.s, p0/z, z0.s, z1.d
; CHECK-NEXT:    cset w0, ne
; CHECK-NEXT:    ret
  %1 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %pg)
  %2 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.cmple.wide.nxv4i32(<vscale x 4 x i1> %1, <vscale x 4 x i32> %a, <vscale x 2 x i64> %b)
  %3 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %2)
  %4 = tail call i1 @llvm.aarch64.sve.ptest.any.nxv16i1(<vscale x 16 x i1> %pg, <vscale x 16 x i1> %3)
  %conv = zext i1 %4 to i32
  ret i32 %conv
}

; ==============================================================================
; PTEST_OP(PG, CMP(PG, ...))
; ==============================================================================

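; SVE predicated compares are flag setting: they perform an implicit PTEST
; of their result against the governing predicate PG. When the explicit
; PTEST uses the same mask and element size as the compare, it is redundant.
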
;
; PTEST_FIRST(PG, CMP8(PG, A, B)). PTEST is redundant.
;
define i1 @cmp8_ptest_first_px(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
; CHECK-LABEL: cmp8_ptest_first_px:
; CHECK:       // %bb.0:
; CHECK-NEXT:    cmpge p0.b, p0/z, z0.b, z1.b
; CHECK-NEXT:    cset w0, mi
; CHECK-NEXT:    ret
  %1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.cmpge.nxv16i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b)
  %2 = tail call i1 @llvm.aarch64.sve.ptest.first.nxv16i1(<vscale x 16 x i1> %pg, <vscale x 16 x i1> %1)
  ret i1 %2
}

;
; PTEST_LAST(PG, CMP8(PG, A, B)). PTEST is redundant.
;
define i1 @cmp8_ptest_last_px(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
; CHECK-LABEL: cmp8_ptest_last_px:
; CHECK:       // %bb.0:
; CHECK-NEXT:    cmpge p0.b, p0/z, z0.b, z1.b
; CHECK-NEXT:    cset w0, lo
; CHECK-NEXT:    ret
  %1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.cmpge.nxv16i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b)
  %2 = tail call i1 @llvm.aarch64.sve.ptest.last.nxv16i1(<vscale x 16 x i1> %pg, <vscale x 16 x i1> %1)
  ret i1 %2
}

;
; PTEST_ANY(PG, CMP8(PG, A, B)). PTEST is redundant.
;
define i1 @cmp8_ptest_any_px(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
; CHECK-LABEL: cmp8_ptest_any_px:
; CHECK:       // %bb.0:
; CHECK-NEXT:    cmpge p0.b, p0/z, z0.b, z1.b
; CHECK-NEXT:    cset w0, ne
; CHECK-NEXT:    ret
  %1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.cmpge.nxv16i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b)
  %2 = tail call i1 @llvm.aarch64.sve.ptest.any.nxv16i1(<vscale x 16 x i1> %pg, <vscale x 16 x i1> %1)
  ret i1 %2
}

;
; Same as above except PG = incorrectly sized PTRUE
;
define i1 @cmp8_ptest_any_px_bad_ptrue(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
; CHECK-LABEL: cmp8_ptest_any_px_bad_ptrue:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s
; CHECK-NEXT:    cmpge p0.b, p0/z, z0.b, z1.b
; CHECK-NEXT:    cset w0, ne
; CHECK-NEXT:    ret
  %1 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
  %2 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %1)
  %3 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.cmpge.nxv16i8(<vscale x 16 x i1> %2, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b)
  %4 = tail call i1 @llvm.aarch64.sve.ptest.any.nxv16i1(<vscale x 16 x i1> %2, <vscale x 16 x i1> %3)
  ret i1 %4
}

;
; PTEST_FIRST(PG, CMP32(PG, A, B)). Can't remove PTEST since PTEST.B vs CMP.S.
;
define i1 @cmp32_ptest_first_px(<vscale x 16 x i1> %pg, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b) {
; CHECK-LABEL: cmp32_ptest_first_px:
; CHECK:       // %bb.0:
; CHECK-NEXT:    cmpge p1.s, p0/z, z0.s, z1.s
; CHECK-NEXT:    ptest p0, p1.b
; CHECK-NEXT:    cset w0, mi
; CHECK-NEXT:    ret
  %1 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %pg)
  %2 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.cmpge.nxv4i32(<vscale x 4 x i1> %1, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b)
  %3 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %2)
  %4 = tail call i1 @llvm.aarch64.sve.ptest.first.nxv16i1(<vscale x 16 x i1> %pg, <vscale x 16 x i1> %3)
  ret i1 %4
}

;
; PTEST_LAST(PG, CMP32(PG, A, B)). Can't remove PTEST since PTEST.B vs CMP.S.
;
define i1 @cmp32_ptest_last_px(<vscale x 16 x i1> %pg, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b) {
; CHECK-LABEL: cmp32_ptest_last_px:
; CHECK:       // %bb.0:
; CHECK-NEXT:    cmpge p1.s, p0/z, z0.s, z1.s
; CHECK-NEXT:    ptest p0, p1.b
; CHECK-NEXT:    cset w0, lo
; CHECK-NEXT:    ret
  %1 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %pg)
  %2 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.cmpge.nxv4i32(<vscale x 4 x i1> %1, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b)
  %3 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %2)
  %4 = tail call i1 @llvm.aarch64.sve.ptest.last.nxv16i1(<vscale x 16 x i1> %pg, <vscale x 16 x i1> %3)
  ret i1 %4
}

;
; PTEST_ANY(PG, CMP32(PG, A, B)). PTEST is redundant.
;
define i1 @cmp32_ptest_any_px(<vscale x 16 x i1> %pg, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b) {
; CHECK-LABEL: cmp32_ptest_any_px:
; CHECK:       // %bb.0:
; CHECK-NEXT:    cmpge p0.s, p0/z, z0.s, z1.s
; CHECK-NEXT:    cset w0, ne
; CHECK-NEXT:    ret
  %1 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %pg)
  %2 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.cmpge.nxv4i32(<vscale x 4 x i1> %1, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b)
  %3 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %2)
  %4 = tail call i1 @llvm.aarch64.sve.ptest.any.nxv16i1(<vscale x 16 x i1> %pg, <vscale x 16 x i1> %3)
  ret i1 %4
}

;
; Same as above except PG = incorrectly sized PTRUE
;
define i1 @cmp32_ptest_any_px_bad_ptrue(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) {
; CHECK-LABEL: cmp32_ptest_any_px_bad_ptrue:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b
; CHECK-NEXT:    cmpge p0.s, p0/z, z0.s, z1.s
; CHECK-NEXT:    cset w0, ne
; CHECK-NEXT:    ret
  %1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
  %2 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %1)
  %3 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.cmpge.nxv4i32(<vscale x 4 x i1> %2, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b)
  %4 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %3)
  %5 = tail call i1 @llvm.aarch64.sve.ptest.any.nxv16i1(<vscale x 16 x i1> %1, <vscale x 16 x i1> %4)
  ret i1 %5
}

; ==============================================================================
; PTEST_OP(X=CMP(PG, ...), X)
; ==============================================================================

;
; PTEST_FIRST(X=CMP8(PG, A, B), X). PTEST is redundant if the condition is
; changed to any.
;
; Can't remove PTEST and keep the same condition (first), since the mask for
; the implicit PTEST (PG) performed by the compare differs from the mask
; specified to the explicit PTEST and could have a different result.
;
; For example, consider
;
;   PG=<1, 1, x, x>
;   Z0=<2, 1, x, x>
;   Z1=<1, 2, x, x>
;
;   X=CMPLE(PG, Z0, Z1)
;    =<0, 1, x, x>       NZCV=0xxx
;   PTEST(X, X),         NZCV=1xxx
;
; where the first active flag (bit 'N' in NZCV) is set by the explicit PTEST,
; but not by the implicit PTEST as part of the compare. However, given the
; PTEST mask and source are the same, first is equivalent to any. The same
; applies to last active.
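;
; For last active, exchange the values of Z0 and Z1 above instead, giving
;
;   X=CMPLE(PG, Z0, Z1)
;    =<1, 0, x, x>       NZCV=xx1x
;   PTEST(X, X),         NZCV=xx0x
;
; where the explicit PTEST reports the last active element as true ('C'
; clear in NZCV) but the implicit PTEST does not ('C' set).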
;
define i1 @cmp8_ptest_first_xx(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
; CHECK-LABEL: cmp8_ptest_first_xx:
; CHECK:       // %bb.0:
; CHECK-NEXT:    cmpge p0.b, p0/z, z0.b, z1.b
; CHECK-NEXT:    cset w0, ne
; CHECK-NEXT:    ret
  %1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.cmpge.nxv16i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b)
  %2 = tail call i1 @llvm.aarch64.sve.ptest.first.nxv16i1(<vscale x 16 x i1> %1, <vscale x 16 x i1> %1)
  ret i1 %2
}

;
; PTEST_LAST(X=CMP8(PG, A, B), X). PTEST is redundant if the condition is
; changed to any.
;
define i1 @cmp8_ptest_last_xx(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
; CHECK-LABEL: cmp8_ptest_last_xx:
; CHECK:       // %bb.0:
; CHECK-NEXT:    cmpge p0.b, p0/z, z0.b, z1.b
; CHECK-NEXT:    cset w0, ne
; CHECK-NEXT:    ret
  %1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.cmpge.nxv16i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b)
  %2 = tail call i1 @llvm.aarch64.sve.ptest.last.nxv16i1(<vscale x 16 x i1> %1, <vscale x 16 x i1> %1)
  ret i1 %2
}

;
; PTEST_ANY(X=CMP8(PG, A, B), X). PTEST is redundant.
;
define i1 @cmp8_ptest_any_xx(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
; CHECK-LABEL: cmp8_ptest_any_xx:
; CHECK:       // %bb.0:
; CHECK-NEXT:    cmpge p0.b, p0/z, z0.b, z1.b
; CHECK-NEXT:    cset w0, ne
; CHECK-NEXT:    ret
  %1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.cmpge.nxv16i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b)
  %2 = tail call i1 @llvm.aarch64.sve.ptest.any.nxv16i1(<vscale x 16 x i1> %1, <vscale x 16 x i1> %1)
  ret i1 %2
}

;
; PTEST_FIRST(X=CMP32(PG, A, B), X). PTEST is redundant if the condition is
; changed to any.
;
define i1 @cmp32_ptest_first_xx(<vscale x 16 x i1> %pg, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b) {
; CHECK-LABEL: cmp32_ptest_first_xx:
; CHECK:       // %bb.0:
; CHECK-NEXT:    cmpge p0.s, p0/z, z0.s, z1.s
; CHECK-NEXT:    cset w0, ne
; CHECK-NEXT:    ret
  %1 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %pg)
  %2 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.cmpge.nxv4i32(<vscale x 4 x i1> %1, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b)
  %3 = tail call i1 @llvm.aarch64.sve.ptest.first.nxv4i1(<vscale x 4 x i1> %2, <vscale x 4 x i1> %2)
  ret i1 %3
}

;
; PTEST_LAST(X=CMP32(PG, A, B), X). PTEST is redundant if the condition is
; changed to any.
;
define i1 @cmp32_ptest_last_xx(<vscale x 16 x i1> %pg, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b) {
; CHECK-LABEL: cmp32_ptest_last_xx:
; CHECK:       // %bb.0:
; CHECK-NEXT:    cmpge p0.s, p0/z, z0.s, z1.s
; CHECK-NEXT:    cset w0, ne
; CHECK-NEXT:    ret
  %1 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %pg)
  %2 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.cmpge.nxv4i32(<vscale x 4 x i1> %1, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b)
  %3 = tail call i1 @llvm.aarch64.sve.ptest.last.nxv4i1(<vscale x 4 x i1> %2, <vscale x 4 x i1> %2)
  ret i1 %3
}

;
; PTEST_ANY(X=CMP32(PG, A, B), X). PTEST is redundant.
;
define i1 @cmp32_ptest_any_xx(<vscale x 16 x i1> %pg, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b) {
; CHECK-LABEL: cmp32_ptest_any_xx:
; CHECK:       // %bb.0:
; CHECK-NEXT:    cmpge p0.s, p0/z, z0.s, z1.s
; CHECK-NEXT:    cset w0, ne
; CHECK-NEXT:    ret
  %1 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %pg)
  %2 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.cmpge.nxv4i32(<vscale x 4 x i1> %1, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b)
  %3 = tail call i1 @llvm.aarch64.sve.ptest.any.nxv4i1(<vscale x 4 x i1> %2, <vscale x 4 x i1> %2)
  ret i1 %3
}

; ==============================================================================
; PTEST_OP(PTRUE_ALL, CMP(PG, ...))
; ==============================================================================

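; The compare zeroes all lanes outside PG, so a PTEST_ANY with a wider
; PTRUE_ALL mask sees exactly the same set of true lanes as the compare's
; implicit PTEST and can be removed. PTEST_FIRST and PTEST_LAST cannot,
; since with a PTRUE_ALL mask they are evaluated at the first and last
; lanes of the whole vector, which need not be active in PG.
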
;
; PTEST_FIRST(PTRUE_ALL, CMP8(PG, A, B)). Can't remove PTEST since mask is
; different.
;
define i1 @cmp8_ptest_first_ax(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
; CHECK-LABEL: cmp8_ptest_first_ax:
; CHECK:       // %bb.0:
; CHECK-NEXT:    cmpge p0.b, p0/z, z0.b, z1.b
; CHECK-NEXT:    ptrue p1.b
; CHECK-NEXT:    ptest p1, p0.b
; CHECK-NEXT:    cset w0, mi
; CHECK-NEXT:    ret
  %1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.cmpge.nxv16i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b)
  %2 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
  %3 = tail call i1 @llvm.aarch64.sve.ptest.first.nxv16i1(<vscale x 16 x i1> %2, <vscale x 16 x i1> %1)
  ret i1 %3
}

;
; PTEST_LAST(PTRUE_ALL, CMP8(PG, A, B)). Can't remove PTEST since mask is
; different.
;
define i1 @cmp8_ptest_last_ax(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
; CHECK-LABEL: cmp8_ptest_last_ax:
; CHECK:       // %bb.0:
; CHECK-NEXT:    cmpge p0.b, p0/z, z0.b, z1.b
; CHECK-NEXT:    ptrue p1.b
; CHECK-NEXT:    ptest p1, p0.b
; CHECK-NEXT:    cset w0, lo
; CHECK-NEXT:    ret
  %1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.cmpge.nxv16i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b)
  %2 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
  %3 = tail call i1 @llvm.aarch64.sve.ptest.last.nxv16i1(<vscale x 16 x i1> %2, <vscale x 16 x i1> %1)
  ret i1 %3
}

;
; PTEST_ANY(PTRUE_ALL, CMP8(PG, A, B)). PTEST is redundant.
;
define i1 @cmp8_ptest_any_ax(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
; CHECK-LABEL: cmp8_ptest_any_ax:
; CHECK:       // %bb.0:
; CHECK-NEXT:    cmpge p0.b, p0/z, z0.b, z1.b
; CHECK-NEXT:    cset w0, ne
; CHECK-NEXT:    ret
  %1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.cmpge.nxv16i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b)
  %2 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
  %3 = tail call i1 @llvm.aarch64.sve.ptest.any.nxv16i1(<vscale x 16 x i1> %2, <vscale x 16 x i1> %1)
  ret i1 %3
}

;
; PTEST_FIRST(PTRUE_ALL, CMP32(PG, A, B)). Can't remove PTEST since mask is
; different.
;
define i1 @cmp32_ptest_first_ax(<vscale x 16 x i1> %pg, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b) {
; CHECK-LABEL: cmp32_ptest_first_ax:
; CHECK:       // %bb.0:
; CHECK-NEXT:    cmpge p0.s, p0/z, z0.s, z1.s
; CHECK-NEXT:    ptrue p1.s
; CHECK-NEXT:    ptest p1, p0.b
; CHECK-NEXT:    cset w0, mi
; CHECK-NEXT:    ret
  %1 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %pg)
  %2 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.cmpge.nxv4i32(<vscale x 4 x i1> %1, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b)
  %3 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
  %4 = tail call i1 @llvm.aarch64.sve.ptest.first.nxv4i1(<vscale x 4 x i1> %3, <vscale x 4 x i1> %2)
  ret i1 %4
}

;
; PTEST_LAST(PTRUE_ALL, CMP32(PG, A, B)). Can't remove PTEST since mask is
; different.
;
define i1 @cmp32_ptest_last_ax(<vscale x 16 x i1> %pg, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b) {
; CHECK-LABEL: cmp32_ptest_last_ax:
; CHECK:       // %bb.0:
; CHECK-NEXT:    cmpge p0.s, p0/z, z0.s, z1.s
; CHECK-NEXT:    ptrue p1.s
; CHECK-NEXT:    ptest p1, p0.b
; CHECK-NEXT:    cset w0, lo
; CHECK-NEXT:    ret
  %1 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %pg)
  %2 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.cmpge.nxv4i32(<vscale x 4 x i1> %1, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b)
  %3 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
  %4 = tail call i1 @llvm.aarch64.sve.ptest.last.nxv4i1(<vscale x 4 x i1> %3, <vscale x 4 x i1> %2)
  ret i1 %4
}

;
; PTEST_ANY(PTRUE_ALL, CMP32(PG, A, B)). PTEST is redundant.
;
define i1 @cmp32_ptest_any_ax(<vscale x 16 x i1> %pg, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b) {
; CHECK-LABEL: cmp32_ptest_any_ax:
; CHECK:       // %bb.0:
; CHECK-NEXT:    cmpge p0.s, p0/z, z0.s, z1.s
; CHECK-NEXT:    cset w0, ne
; CHECK-NEXT:    ret
  %1 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %pg)
  %2 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.cmpge.nxv4i32(<vscale x 4 x i1> %1, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b)
  %3 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
  %4 = tail call i1 @llvm.aarch64.sve.ptest.any.nxv4i1(<vscale x 4 x i1> %3, <vscale x 4 x i1> %2)
  ret i1 %4
}

; ==============================================================================
; PTEST_OP(PTRUE_ALL, CMP(PTRUE_ALL, ...))
; ==============================================================================

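; With identical PTRUE_ALL masks on both the compare and the PTEST, the
; compare's implicit PTEST computes exactly the same flags, so the explicit
; PTEST is removable for all three conditions.
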
;
; PTEST_FIRST(PTRUE_ALL, CMP8(PTRUE_ALL, A, B)). PTEST is redundant.
;
define i1 @cmp8_ptest_first_aa(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
; CHECK-LABEL: cmp8_ptest_first_aa:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b
; CHECK-NEXT:    cmpge p0.b, p0/z, z0.b, z1.b
; CHECK-NEXT:    cset w0, mi
; CHECK-NEXT:    ret
  %1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
  %2 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.cmpge.nxv16i8(<vscale x 16 x i1> %1, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b)
  %3 = tail call i1 @llvm.aarch64.sve.ptest.first.nxv16i1(<vscale x 16 x i1> %1, <vscale x 16 x i1> %2)
  ret i1 %3
}

;
; PTEST_LAST(PTRUE_ALL, CMP8(PTRUE_ALL, A, B)). PTEST is redundant.
;
define i1 @cmp8_ptest_last_aa(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
; CHECK-LABEL: cmp8_ptest_last_aa:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b
; CHECK-NEXT:    cmpge p0.b, p0/z, z0.b, z1.b
; CHECK-NEXT:    cset w0, lo
; CHECK-NEXT:    ret
  %1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
  %2 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.cmpge.nxv16i8(<vscale x 16 x i1> %1, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b)
  %3 = tail call i1 @llvm.aarch64.sve.ptest.last.nxv16i1(<vscale x 16 x i1> %1, <vscale x 16 x i1> %2)
  ret i1 %3
}

;
; PTEST_ANY(PTRUE_ALL, CMP8(PTRUE_ALL, A, B)). PTEST is redundant.
;
define i1 @cmp8_ptest_any_aa(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
; CHECK-LABEL: cmp8_ptest_any_aa:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b
; CHECK-NEXT:    cmpge p0.b, p0/z, z0.b, z1.b
; CHECK-NEXT:    cset w0, ne
; CHECK-NEXT:    ret
  %1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
  %2 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.cmpge.nxv16i8(<vscale x 16 x i1> %1, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b)
  %3 = tail call i1 @llvm.aarch64.sve.ptest.any.nxv16i1(<vscale x 16 x i1> %1, <vscale x 16 x i1> %2)
  ret i1 %3
}

;
; PTEST_FIRST(PTRUE_ALL, CMP32(PTRUE_ALL, A, B)). PTEST is redundant.
;
define i1 @cmp32_ptest_first_aa(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) {
; CHECK-LABEL: cmp32_ptest_first_aa:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s
; CHECK-NEXT:    cmpge p0.s, p0/z, z0.s, z1.s
; CHECK-NEXT:    cset w0, mi
; CHECK-NEXT:    ret
  %1 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
  %2 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.cmpge.nxv4i32(<vscale x 4 x i1> %1, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b)
  %3 = tail call i1 @llvm.aarch64.sve.ptest.first.nxv4i1(<vscale x 4 x i1> %1, <vscale x 4 x i1> %2)
  ret i1 %3
}

;
; PTEST_LAST(PTRUE_ALL, CMP32(PTRUE_ALL, A, B)). PTEST is redundant.
;
define i1 @cmp32_ptest_last_aa(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) {
; CHECK-LABEL: cmp32_ptest_last_aa:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s
; CHECK-NEXT:    cmpge p0.s, p0/z, z0.s, z1.s
; CHECK-NEXT:    cset w0, lo
; CHECK-NEXT:    ret
  %1 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
  %2 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.cmpge.nxv4i32(<vscale x 4 x i1> %1, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b)
  %3 = tail call i1 @llvm.aarch64.sve.ptest.last.nxv4i1(<vscale x 4 x i1> %1, <vscale x 4 x i1> %2)
  ret i1 %3
}

;
; PTEST_ANY(PTRUE_ALL, CMP32(PTRUE_ALL, A, B)). PTEST is redundant.
;
define i1 @cmp32_ptest_any_aa(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) {
; CHECK-LABEL: cmp32_ptest_any_aa:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s
; CHECK-NEXT:    cmpge p0.s, p0/z, z0.s, z1.s
; CHECK-NEXT:    cset w0, ne
; CHECK-NEXT:    ret
  %1 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
  %2 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.cmpge.nxv4i32(<vscale x 4 x i1> %1, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b)
  %3 = tail call i1 @llvm.aarch64.sve.ptest.any.nxv4i1(<vscale x 4 x i1> %1, <vscale x 4 x i1> %2)
  ret i1 %3
}

declare <vscale x 16 x i1> @llvm.aarch64.sve.cmpge.nxv16i8(<vscale x 16 x i1>, <vscale x 16 x i8>, <vscale x 16 x i8>)
declare <vscale x 4 x i1> @llvm.aarch64.sve.cmpge.nxv4i32(<vscale x 4 x i1>, <vscale x 4 x i32>, <vscale x 4 x i32>)
declare <vscale x 16 x i1> @llvm.aarch64.sve.cmple.wide.nxv16i8(<vscale x 16 x i1>, <vscale x 16 x i8>, <vscale x 2 x i64>)
declare <vscale x 8 x i1> @llvm.aarch64.sve.cmple.wide.nxv8i16(<vscale x 8 x i1>, <vscale x 8 x i16>, <vscale x 2 x i64>)
declare <vscale x 4 x i1> @llvm.aarch64.sve.cmple.wide.nxv4i32(<vscale x 4 x i1>, <vscale x 4 x i32>, <vscale x 2 x i64>)

declare i1 @llvm.aarch64.sve.ptest.any.nxv16i1(<vscale x 16 x i1>, <vscale x 16 x i1>)
declare i1 @llvm.aarch64.sve.ptest.first.nxv16i1(<vscale x 16 x i1>, <vscale x 16 x i1>)
declare i1 @llvm.aarch64.sve.ptest.last.nxv16i1(<vscale x 16 x i1>, <vscale x 16 x i1>)

declare i1 @llvm.aarch64.sve.ptest.any.nxv4i1(<vscale x 4 x i1>, <vscale x 4 x i1>)
declare i1 @llvm.aarch64.sve.ptest.first.nxv4i1(<vscale x 4 x i1>, <vscale x 4 x i1>)
declare i1 @llvm.aarch64.sve.ptest.last.nxv4i1(<vscale x 4 x i1>, <vscale x 4 x i1>)

declare <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32)
declare <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32)

declare <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1>)
declare <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1>)

declare <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1>)
declare <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1>)