; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
; RUN: opt -S --passes='require<profile-summary>,function(codegenprepare)' < %s | FileCheck %s

target triple = "aarch64-unknown-linux-gnu"
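
; The tests below check that CodeGenPrepare sinks vector GEPs, and the
; sign/zero-extends feeding them, into the block containing the masked
; gather/scatter so that instruction selection can form SVE's scalar-base plus
; vector-offset addressing modes, for example (illustrative only, not checked
; by this test):
;   ld1w { z0.s }, p0/z, [x0, z1.s, sxtw #2]   ; gather, sign-extended offsets
;   st1w { z0.s }, p0, [x0, z1.s, uxtw #2]     ; scatter, zero-extended offsets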

; Sink the GEP to make use of scalar+vector addressing modes.
define <vscale x 4 x float> @gather_offsets_sink_gep(ptr %base, <vscale x 4 x i32> %indices, <vscale x 4 x i1> %mask, i1 %cond) {
; CHECK-LABEL: define <vscale x 4 x float> @gather_offsets_sink_gep(
; CHECK-SAME: ptr [[BASE:%.*]], <vscale x 4 x i32> [[INDICES:%.*]], <vscale x 4 x i1> [[MASK:%.*]], i1 [[COND:%.*]]) {
; CHECK-NEXT:  entry:
; CHECK-NEXT:    br i1 [[COND]], label [[COND_BLOCK:%.*]], label [[EXIT:%.*]]
; CHECK:       cond.block:
; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr float, ptr [[BASE]], <vscale x 4 x i32> [[INDICES]]
; CHECK-NEXT:    [[LOAD:%.*]] = tail call <vscale x 4 x float> @llvm.masked.gather.nxv4f32.nxv4p0(<vscale x 4 x ptr> [[TMP0]], i32 4, <vscale x 4 x i1> [[MASK]], <vscale x 4 x float> poison)
; CHECK-NEXT:    ret <vscale x 4 x float> [[LOAD]]
; CHECK:       exit:
; CHECK-NEXT:    ret <vscale x 4 x float> zeroinitializer
;
entry:
  %ptrs = getelementptr float, ptr %base, <vscale x 4 x i32> %indices
  br i1 %cond, label %cond.block, label %exit

cond.block:
  %load = tail call <vscale x 4 x float> @llvm.masked.gather.nxv4f32(<vscale x 4 x ptr> %ptrs, i32 4, <vscale x 4 x i1> %mask, <vscale x 4 x float> poison)
  br label %exit

exit:
  %ret = phi <vscale x 4 x float> [ zeroinitializer, %entry ], [ %load, %cond.block ]
  ret <vscale x 4 x float> %ret
}

; Sink sext to make use of scalar+sxtw(vector) addressing modes.
define <vscale x 4 x float> @gather_offsets_sink_sext(ptr %base, <vscale x 4 x i32> %indices, <vscale x 4 x i1> %mask, i1 %cond) {
; CHECK-LABEL: define <vscale x 4 x float> @gather_offsets_sink_sext(
; CHECK-SAME: ptr [[BASE:%.*]], <vscale x 4 x i32> [[INDICES:%.*]], <vscale x 4 x i1> [[MASK:%.*]], i1 [[COND:%.*]]) {
; CHECK-NEXT:  entry:
; CHECK-NEXT:    br i1 [[COND]], label [[COND_BLOCK:%.*]], label [[EXIT:%.*]]
; CHECK:       cond.block:
; CHECK-NEXT:    [[TMP0:%.*]] = sext <vscale x 4 x i32> [[INDICES]] to <vscale x 4 x i64>
; CHECK-NEXT:    [[PTRS:%.*]] = getelementptr float, ptr [[BASE]], <vscale x 4 x i64> [[TMP0]]
; CHECK-NEXT:    [[LOAD:%.*]] = tail call <vscale x 4 x float> @llvm.masked.gather.nxv4f32.nxv4p0(<vscale x 4 x ptr> [[PTRS]], i32 4, <vscale x 4 x i1> [[MASK]], <vscale x 4 x float> poison)
; CHECK-NEXT:    ret <vscale x 4 x float> [[LOAD]]
; CHECK:       exit:
; CHECK-NEXT:    ret <vscale x 4 x float> zeroinitializer
;
entry:
  %indices.sext = sext <vscale x 4 x i32> %indices to <vscale x 4 x i64>
  br i1 %cond, label %cond.block, label %exit

cond.block:
  %ptrs = getelementptr float, ptr %base, <vscale x 4 x i64> %indices.sext
  %load = tail call <vscale x 4 x float> @llvm.masked.gather.nxv4f32(<vscale x 4 x ptr> %ptrs, i32 4, <vscale x 4 x i1> %mask, <vscale x 4 x float> poison)
  br label %exit

exit:
  %ret = phi <vscale x 4 x float> [ zeroinitializer, %entry ], [ %load, %cond.block ]
  ret <vscale x 4 x float> %ret
}

; As above, but ensure both the GEP and the sext are sunk.
define <vscale x 4 x float> @gather_offsets_sink_sext_get(ptr %base, <vscale x 4 x i32> %indices, <vscale x 4 x i1> %mask, i1 %cond) {
; CHECK-LABEL: define <vscale x 4 x float> @gather_offsets_sink_sext_get(
; CHECK-SAME: ptr [[BASE:%.*]], <vscale x 4 x i32> [[INDICES:%.*]], <vscale x 4 x i1> [[MASK:%.*]], i1 [[COND:%.*]]) {
; CHECK-NEXT:  entry:
; CHECK-NEXT:    br i1 [[COND]], label [[COND_BLOCK:%.*]], label [[EXIT:%.*]]
; CHECK:       cond.block:
; CHECK-NEXT:    [[TMP0:%.*]] = sext <vscale x 4 x i32> [[INDICES]] to <vscale x 4 x i64>
; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr float, ptr [[BASE]], <vscale x 4 x i64> [[TMP0]]
; CHECK-NEXT:    [[LOAD:%.*]] = tail call <vscale x 4 x float> @llvm.masked.gather.nxv4f32.nxv4p0(<vscale x 4 x ptr> [[TMP1]], i32 4, <vscale x 4 x i1> [[MASK]], <vscale x 4 x float> poison)
; CHECK-NEXT:    ret <vscale x 4 x float> [[LOAD]]
; CHECK:       exit:
; CHECK-NEXT:    ret <vscale x 4 x float> zeroinitializer
;
entry:
  %indices.sext = sext <vscale x 4 x i32> %indices to <vscale x 4 x i64>
  %ptrs = getelementptr float, ptr %base, <vscale x 4 x i64> %indices.sext
  br i1 %cond, label %cond.block, label %exit

cond.block:
  %load = tail call <vscale x 4 x float> @llvm.masked.gather.nxv4f32(<vscale x 4 x ptr> %ptrs, i32 4, <vscale x 4 x i1> %mask, <vscale x 4 x float> poison)
  br label %exit

exit:
  %ret = phi <vscale x 4 x float> [ zeroinitializer, %entry ], [ %load, %cond.block ]
  ret <vscale x 4 x float> %ret
}

; Don't sink GEPs that cannot benefit from SVE's scalar+vector addressing modes.
define <vscale x 4 x float> @gather_no_scalar_base(<vscale x 4 x ptr> %bases, <vscale x 4 x i32> %indices, <vscale x 4 x i1> %mask, i1 %cond) {
; CHECK-LABEL: define <vscale x 4 x float> @gather_no_scalar_base(
; CHECK-SAME: <vscale x 4 x ptr> [[BASES:%.*]], <vscale x 4 x i32> [[INDICES:%.*]], <vscale x 4 x i1> [[MASK:%.*]], i1 [[COND:%.*]]) {
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[PTRS:%.*]] = getelementptr float, <vscale x 4 x ptr> [[BASES]], <vscale x 4 x i32> [[INDICES]]
; CHECK-NEXT:    br i1 [[COND]], label [[COND_BLOCK:%.*]], label [[EXIT:%.*]]
; CHECK:       cond.block:
; CHECK-NEXT:    [[LOAD:%.*]] = tail call <vscale x 4 x float> @llvm.masked.gather.nxv4f32.nxv4p0(<vscale x 4 x ptr> [[PTRS]], i32 4, <vscale x 4 x i1> [[MASK]], <vscale x 4 x float> poison)
; CHECK-NEXT:    ret <vscale x 4 x float> [[LOAD]]
; CHECK:       exit:
; CHECK-NEXT:    ret <vscale x 4 x float> zeroinitializer
;
entry:
  %ptrs = getelementptr float, <vscale x 4 x ptr> %bases, <vscale x 4 x i32> %indices
  br i1 %cond, label %cond.block, label %exit

cond.block:
  %load = tail call <vscale x 4 x float> @llvm.masked.gather.nxv4f32(<vscale x 4 x ptr> %ptrs, i32 4, <vscale x 4 x i1> %mask, <vscale x 4 x float> poison)
  br label %exit

exit:
  %ret = phi <vscale x 4 x float> [ zeroinitializer, %entry ], [ %load, %cond.block ]
  ret <vscale x 4 x float> %ret
}

; Don't sink extends whose result type is already favourable for SVE's sxtw/uxtw addressing modes.
; NOTE: We still want to sink the GEP.
define <vscale x 4 x float> @gather_offset_type_too_small(ptr %base, <vscale x 4 x i8> %indices, <vscale x 4 x i1> %mask, i1 %cond) {
; CHECK-LABEL: define <vscale x 4 x float> @gather_offset_type_too_small(
; CHECK-SAME: ptr [[BASE:%.*]], <vscale x 4 x i8> [[INDICES:%.*]], <vscale x 4 x i1> [[MASK:%.*]], i1 [[COND:%.*]]) {
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[INDICES_SEXT:%.*]] = sext <vscale x 4 x i8> [[INDICES]] to <vscale x 4 x i32>
; CHECK-NEXT:    br i1 [[COND]], label [[COND_BLOCK:%.*]], label [[EXIT:%.*]]
; CHECK:       cond.block:
; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr float, ptr [[BASE]], <vscale x 4 x i32> [[INDICES_SEXT]]
; CHECK-NEXT:    [[LOAD:%.*]] = tail call <vscale x 4 x float> @llvm.masked.gather.nxv4f32.nxv4p0(<vscale x 4 x ptr> [[TMP0]], i32 4, <vscale x 4 x i1> [[MASK]], <vscale x 4 x float> poison)
; CHECK-NEXT:    ret <vscale x 4 x float> [[LOAD]]
; CHECK:       exit:
; CHECK-NEXT:    ret <vscale x 4 x float> zeroinitializer
;
entry:
  %indices.sext = sext <vscale x 4 x i8> %indices to <vscale x 4 x i32>
  %ptrs = getelementptr float, ptr %base, <vscale x 4 x i32> %indices.sext
  br i1 %cond, label %cond.block, label %exit

cond.block:
  %load = tail call <vscale x 4 x float> @llvm.masked.gather.nxv4f32(<vscale x 4 x ptr> %ptrs, i32 4, <vscale x 4 x i1> %mask, <vscale x 4 x float> poison)
  br label %exit

exit:
  %ret = phi <vscale x 4 x float> [ zeroinitializer, %entry ], [ %load, %cond.block ]
  ret <vscale x 4 x float> %ret
}

; Don't sink extends that cannot benefit from SVE's sxtw/uxtw addressing modes.
; NOTE: We still want to sink the GEP.
define <vscale x 4 x float> @gather_offset_type_too_big(ptr %base, <vscale x 4 x i48> %indices, <vscale x 4 x i1> %mask, i1 %cond) {
; CHECK-LABEL: define <vscale x 4 x float> @gather_offset_type_too_big(
; CHECK-SAME: ptr [[BASE:%.*]], <vscale x 4 x i48> [[INDICES:%.*]], <vscale x 4 x i1> [[MASK:%.*]], i1 [[COND:%.*]]) {
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[INDICES_SEXT:%.*]] = sext <vscale x 4 x i48> [[INDICES]] to <vscale x 4 x i64>
; CHECK-NEXT:    br i1 [[COND]], label [[COND_BLOCK:%.*]], label [[EXIT:%.*]]
; CHECK:       cond.block:
; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr float, ptr [[BASE]], <vscale x 4 x i64> [[INDICES_SEXT]]
; CHECK-NEXT:    [[LOAD:%.*]] = tail call <vscale x 4 x float> @llvm.masked.gather.nxv4f32.nxv4p0(<vscale x 4 x ptr> [[TMP0]], i32 4, <vscale x 4 x i1> [[MASK]], <vscale x 4 x float> poison)
; CHECK-NEXT:    ret <vscale x 4 x float> [[LOAD]]
; CHECK:       exit:
; CHECK-NEXT:    ret <vscale x 4 x float> zeroinitializer
;
entry:
  %indices.sext = sext <vscale x 4 x i48> %indices to <vscale x 4 x i64>
  %ptrs = getelementptr float, ptr %base, <vscale x 4 x i64> %indices.sext
  br i1 %cond, label %cond.block, label %exit

cond.block:
  %load = tail call <vscale x 4 x float> @llvm.masked.gather.nxv4f32(<vscale x 4 x ptr> %ptrs, i32 4, <vscale x 4 x i1> %mask, <vscale x 4 x float> poison)
  br label %exit

exit:
  %ret = phi <vscale x 4 x float> [ zeroinitializer, %entry ], [ %load, %cond.block ]
  ret <vscale x 4 x float> %ret
}

; Sink zext to make use of scalar+uxtw(vector) addressing modes.
; TODO: There's an argument here to split the extend into i8->i32 and i32->i64,
; which would be especially useful if the i8s are the result of a load because
; it would maintain the use of zero-extending loads.
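; A possible split (sketch only, using hypothetical values %lo/%hi; not what
; this test currently checks):
;   %lo = zext <vscale x 4 x i8> %indices to <vscale x 4 x i32>  ; could fold into a zero-extending load (e.g. ld1b)
;   %hi = zext <vscale x 4 x i32> %lo to <vscale x 4 x i64>      ; covered by the uxtw addressing mode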
define <vscale x 4 x float> @gather_offset_sink_zext(ptr %base, <vscale x 4 x i8> %indices, <vscale x 4 x i1> %mask, i1 %cond) {
; CHECK-LABEL: define <vscale x 4 x float> @gather_offset_sink_zext(
; CHECK-SAME: ptr [[BASE:%.*]], <vscale x 4 x i8> [[INDICES:%.*]], <vscale x 4 x i1> [[MASK:%.*]], i1 [[COND:%.*]]) {
; CHECK-NEXT:  entry:
; CHECK-NEXT:    br i1 [[COND]], label [[COND_BLOCK:%.*]], label [[EXIT:%.*]]
; CHECK:       cond.block:
; CHECK-NEXT:    [[TMP0:%.*]] = zext <vscale x 4 x i8> [[INDICES]] to <vscale x 4 x i64>
; CHECK-NEXT:    [[PTRS:%.*]] = getelementptr float, ptr [[BASE]], <vscale x 4 x i64> [[TMP0]]
; CHECK-NEXT:    [[LOAD:%.*]] = tail call <vscale x 4 x float> @llvm.masked.gather.nxv4f32.nxv4p0(<vscale x 4 x ptr> [[PTRS]], i32 4, <vscale x 4 x i1> [[MASK]], <vscale x 4 x float> poison)
; CHECK-NEXT:    ret <vscale x 4 x float> [[LOAD]]
; CHECK:       exit:
; CHECK-NEXT:    ret <vscale x 4 x float> zeroinitializer
;
entry:
  %indices.zext = zext <vscale x 4 x i8> %indices to <vscale x 4 x i64>
  br i1 %cond, label %cond.block, label %exit

cond.block:
  %ptrs = getelementptr float, ptr %base, <vscale x 4 x i64> %indices.zext
  %load = tail call <vscale x 4 x float> @llvm.masked.gather.nxv4f32(<vscale x 4 x ptr> %ptrs, i32 4, <vscale x 4 x i1> %mask, <vscale x 4 x float> poison)
  br label %exit

exit:
  %ret = phi <vscale x 4 x float> [ zeroinitializer, %entry ], [ %load, %cond.block ]
  ret <vscale x 4 x float> %ret
}

; Ensure we support scatters as well as gathers.
define void @scatter_offsets_sink_sext_get(<vscale x 4 x float> %data, ptr %base, <vscale x 4 x i32> %indices, <vscale x 4 x i1> %mask, i1 %cond) {
; CHECK-LABEL: define void @scatter_offsets_sink_sext_get(
; CHECK-SAME: <vscale x 4 x float> [[DATA:%.*]], ptr [[BASE:%.*]], <vscale x 4 x i32> [[INDICES:%.*]], <vscale x 4 x i1> [[MASK:%.*]], i1 [[COND:%.*]]) {
; CHECK-NEXT:  entry:
; CHECK-NEXT:    br i1 [[COND]], label [[COND_BLOCK:%.*]], label [[EXIT:%.*]]
; CHECK:       cond.block:
; CHECK-NEXT:    [[TMP0:%.*]] = sext <vscale x 4 x i32> [[INDICES]] to <vscale x 4 x i64>
; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr float, ptr [[BASE]], <vscale x 4 x i64> [[TMP0]]
; CHECK-NEXT:    tail call void @llvm.masked.scatter.nxv4f32.nxv4p0(<vscale x 4 x float> [[DATA]], <vscale x 4 x ptr> [[TMP1]], i32 4, <vscale x 4 x i1> [[MASK]])
; CHECK-NEXT:    ret void
; CHECK:       exit:
; CHECK-NEXT:    ret void
;
entry:
  %indices.sext = sext <vscale x 4 x i32> %indices to <vscale x 4 x i64>
  %ptrs = getelementptr float, ptr %base, <vscale x 4 x i64> %indices.sext
  br i1 %cond, label %cond.block, label %exit

cond.block:
  tail call void @llvm.masked.scatter.nxv4f32(<vscale x 4 x float> %data, <vscale x 4 x ptr> %ptrs, i32 4, <vscale x 4 x i1> %mask)
  br label %exit

exit:
  ret void
}

declare <vscale x 4 x float> @llvm.masked.gather.nxv4f32(<vscale x 4 x ptr>, i32, <vscale x 4 x i1>, <vscale x 4 x float>)
declare void @llvm.masked.scatter.nxv4f32(<vscale x 4 x float>, <vscale x 4 x ptr>, i32, <vscale x 4 x i1>)