; llvm/llvm/test/CodeGen/AArch64/sme-streaming-interface.ll

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve -mattr=+sme -verify-machineinstrs < %s | FileCheck %s

; This file tests the following combinations related to streaming-enabled functions:
; [ ] N  ->  S    (Normal -> Streaming)
; [ ] S  ->  N    (Streaming -> Normal)
; [ ] S  ->  S    (Streaming -> Streaming)
; [ ] S  ->  SC   (Streaming -> Streaming-compatible)
;
; The following combination is tested in sme-streaming-compatible-interface.ll
; [ ] SC ->  S    (Streaming-compatible -> Streaming)

; External callees used by the tests below, one per streaming mode:
; no streaming attribute, "aarch64_pstate_sm_enabled" and
; "aarch64_pstate_sm_compatible".
declare void @normal_callee()
declare void @streaming_callee() "aarch64_pstate_sm_enabled"
declare void @streaming_compatible_callee() "aarch64_pstate_sm_compatible"

; [x] N  ->  S
; [ ] S  ->  N
; [ ] S  ->  S
; [ ] S  ->  SC
;
; A caller without any streaming attribute calls @streaming_callee, which is
; declared "aarch64_pstate_sm_enabled". The expected assembly wraps the call in
; `smstart sm` / `smstop sm`, and the prologue/epilogue save and restore d8-d15.
; The prologue also stores x30 together with the `cntd` result held in x9.
define void @normal_caller_streaming_callee() nounwind {
; CHECK-LABEL: normal_caller_streaming_callee:
; CHECK:       // %bb.0:
; CHECK-NEXT:    stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
; CHECK-NEXT:    cntd x9
; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
; CHECK-NEXT:    stp x30, x9, [sp, #64] // 16-byte Folded Spill
; CHECK-NEXT:    smstart sm
; CHECK-NEXT:    bl streaming_callee
; CHECK-NEXT:    smstop sm
; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT:    ldr x30, [sp, #64] // 8-byte Folded Reload
; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
; CHECK-NEXT:    ldp d15, d14, [sp], #80 // 16-byte Folded Reload
; CHECK-NEXT:    ret
  call void @streaming_callee()
  ret void;
}

; [ ] N  ->  S
; [x] S  ->  N
; [ ] S  ->  S
; [ ] S  ->  SC
;
; A caller marked "aarch64_pstate_sm_enabled" calls @normal_callee, which has
; no streaming attribute. This is the mirror image of the N -> S case: the
; expected assembly issues `smstop sm` before the call and `smstart sm` after
; it, with d8-d15 saved and restored in the prologue/epilogue.
define void @streaming_caller_normal_callee() nounwind "aarch64_pstate_sm_enabled" {
; CHECK-LABEL: streaming_caller_normal_callee:
; CHECK:       // %bb.0:
; CHECK-NEXT:    stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
; CHECK-NEXT:    cntd x9
; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
; CHECK-NEXT:    stp x30, x9, [sp, #64] // 16-byte Folded Spill
; CHECK-NEXT:    smstop sm
; CHECK-NEXT:    bl normal_callee
; CHECK-NEXT:    smstart sm
; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT:    ldr x30, [sp, #64] // 8-byte Folded Reload
; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
; CHECK-NEXT:    ldp d15, d14, [sp], #80 // 16-byte Folded Reload
; CHECK-NEXT:    ret
  call void @normal_callee()
  ret void;
}

; [ ] N  ->  S
; [ ] S  ->  N
; [x] S  ->  S
; [ ] S  ->  SC
;
; Caller and callee are both "aarch64_pstate_sm_enabled". The expected assembly
; contains no `smstart`/`smstop`; only x30 is saved and restored around the
; plain `bl`.
define void @streaming_caller_streaming_callee() nounwind "aarch64_pstate_sm_enabled" {
; CHECK-LABEL: streaming_caller_streaming_callee:
; CHECK:       // %bb.0:
; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    bl streaming_callee
; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    ret
  call void @streaming_callee()
  ret void;
}

; [ ] N  ->  S
; [ ] S  ->  N
; [ ] S  ->  S
; [x] S  ->  SC
;
; An "aarch64_pstate_sm_enabled" caller calls a callee declared
; "aarch64_pstate_sm_compatible". As in the S -> S case, the expected assembly
; has no `smstart`/`smstop`; only x30 is saved and restored around the `bl`.
define void @streaming_caller_streaming_compatible_callee() nounwind "aarch64_pstate_sm_enabled" {
; CHECK-LABEL: streaming_caller_streaming_compatible_callee:
; CHECK:       // %bb.0:
; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    bl streaming_compatible_callee
; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    ret
  call void @streaming_compatible_callee()
  ret void;
}

;
; Handle special cases here.
;

; Call to function-pointer (with attribute)
;
; The callee is the pointer argument %p, so the streaming mode comes from the
; "aarch64_pstate_sm_enabled" attribute on the call site rather than from a
; declaration. The caller has no streaming attribute, and the expected assembly
; wraps the indirect `blr x0` in `smstart sm` / `smstop sm`.
define void @call_to_function_pointer_streaming_enabled(ptr %p) nounwind {
; CHECK-LABEL: call_to_function_pointer_streaming_enabled:
; CHECK:       // %bb.0:
; CHECK-NEXT:    stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
; CHECK-NEXT:    cntd x9
; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
; CHECK-NEXT:    stp x30, x9, [sp, #64] // 16-byte Folded Spill
; CHECK-NEXT:    smstart sm
; CHECK-NEXT:    blr x0
; CHECK-NEXT:    smstop sm
; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT:    ldr x30, [sp, #64] // 8-byte Folded Reload
; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
; CHECK-NEXT:    ldp d15, d14, [sp], #80 // 16-byte Folded Reload
; CHECK-NEXT:    ret
  call void %p() "aarch64_pstate_sm_enabled"
  ret void
}

; Ensure NEON registers are preserved correctly.
;
; The <4 x i32> argument %x is returned unchanged after a call to
; @streaming_callee from a non-streaming caller, so its value must survive the
; mode switches. The expected assembly spills q0 to the stack before
; `smstart sm` and reloads it after `smstop sm`.
define <4 x i32> @smstart_clobber_simdfp(<4 x i32> %x) nounwind {
; CHECK-LABEL: smstart_clobber_simdfp:
; CHECK:       // %bb.0:
; CHECK-NEXT:    sub sp, sp, #96
; CHECK-NEXT:    cntd x9
; CHECK-NEXT:    stp d15, d14, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT:    stp d13, d12, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT:    stp d11, d10, [sp, #48] // 16-byte Folded Spill
; CHECK-NEXT:    stp d9, d8, [sp, #64] // 16-byte Folded Spill
; CHECK-NEXT:    stp x30, x9, [sp, #80] // 16-byte Folded Spill
; CHECK-NEXT:    str q0, [sp] // 16-byte Folded Spill
; CHECK-NEXT:    smstart sm
; CHECK-NEXT:    bl streaming_callee
; CHECK-NEXT:    smstop sm
; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
; CHECK-NEXT:    ldp d9, d8, [sp, #64] // 16-byte Folded Reload
; CHECK-NEXT:    ldr x30, [sp, #80] // 8-byte Folded Reload
; CHECK-NEXT:    ldp d11, d10, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT:    ldp d13, d12, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT:    ldp d15, d14, [sp, #16] // 16-byte Folded Reload
; CHECK-NEXT:    add sp, sp, #96
; CHECK-NEXT:    ret
  call void @streaming_callee()
  ret <4 x i32> %x;
}

; Ensure SVE registers are preserved correctly.
;
; Scalable-vector counterpart of the NEON test: the <vscale x 4 x i32>
; argument %x is returned unchanged after a call to @streaming_callee from a
; non-streaming caller. The expected assembly saves p4-p15 and z8-z23 in the
; prologue, spills z0 just before `smstart sm`, reloads it right after
; `smstop sm`, and restores the saved registers in the epilogue.
define <vscale x 4 x i32> @smstart_clobber_sve(<vscale x 4 x i32> %x) nounwind {
; CHECK-LABEL: smstart_clobber_sve:
; CHECK:       // %bb.0:
; CHECK-NEXT:    stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
; CHECK-NEXT:    cntd x9
; CHECK-NEXT:    str x9, [sp, #16] // 8-byte Folded Spill
; CHECK-NEXT:    addvl sp, sp, #-18
; CHECK-NEXT:    str p15, [sp, #4, mul vl] // 2-byte Folded Spill
; CHECK-NEXT:    str p14, [sp, #5, mul vl] // 2-byte Folded Spill
; CHECK-NEXT:    str p13, [sp, #6, mul vl] // 2-byte Folded Spill
; CHECK-NEXT:    str p12, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT:    str p11, [sp, #8, mul vl] // 2-byte Folded Spill
; CHECK-NEXT:    str p10, [sp, #9, mul vl] // 2-byte Folded Spill
; CHECK-NEXT:    str p9, [sp, #10, mul vl] // 2-byte Folded Spill
; CHECK-NEXT:    str p8, [sp, #11, mul vl] // 2-byte Folded Spill
; CHECK-NEXT:    str p7, [sp, #12, mul vl] // 2-byte Folded Spill
; CHECK-NEXT:    str p6, [sp, #13, mul vl] // 2-byte Folded Spill
; CHECK-NEXT:    str p5, [sp, #14, mul vl] // 2-byte Folded Spill
; CHECK-NEXT:    str p4, [sp, #15, mul vl] // 2-byte Folded Spill
; CHECK-NEXT:    str z23, [sp, #2, mul vl] // 16-byte Folded Spill
; CHECK-NEXT:    str z22, [sp, #3, mul vl] // 16-byte Folded Spill
; CHECK-NEXT:    str z21, [sp, #4, mul vl] // 16-byte Folded Spill
; CHECK-NEXT:    str z20, [sp, #5, mul vl] // 16-byte Folded Spill
; CHECK-NEXT:    str z19, [sp, #6, mul vl] // 16-byte Folded Spill
; CHECK-NEXT:    str z18, [sp, #7, mul vl] // 16-byte Folded Spill
; CHECK-NEXT:    str z17, [sp, #8, mul vl] // 16-byte Folded Spill
; CHECK-NEXT:    str z16, [sp, #9, mul vl] // 16-byte Folded Spill
; CHECK-NEXT:    str z15, [sp, #10, mul vl] // 16-byte Folded Spill
; CHECK-NEXT:    str z14, [sp, #11, mul vl] // 16-byte Folded Spill
; CHECK-NEXT:    str z13, [sp, #12, mul vl] // 16-byte Folded Spill
; CHECK-NEXT:    str z12, [sp, #13, mul vl] // 16-byte Folded Spill
; CHECK-NEXT:    str z11, [sp, #14, mul vl] // 16-byte Folded Spill
; CHECK-NEXT:    str z10, [sp, #15, mul vl] // 16-byte Folded Spill
; CHECK-NEXT:    str z9, [sp, #16, mul vl] // 16-byte Folded Spill
; CHECK-NEXT:    str z8, [sp, #17, mul vl] // 16-byte Folded Spill
; CHECK-NEXT:    addvl sp, sp, #-1
; CHECK-NEXT:    str z0, [sp] // 16-byte Folded Spill
; CHECK-NEXT:    smstart sm
; CHECK-NEXT:    bl streaming_callee
; CHECK-NEXT:    smstop sm
; CHECK-NEXT:    ldr z0, [sp] // 16-byte Folded Reload
; CHECK-NEXT:    addvl sp, sp, #1
; CHECK-NEXT:    ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload
; CHECK-NEXT:    ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload
; CHECK-NEXT:    ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload
; CHECK-NEXT:    ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload
; CHECK-NEXT:    ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload
; CHECK-NEXT:    ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload
; CHECK-NEXT:    ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload
; CHECK-NEXT:    ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload
; CHECK-NEXT:    ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload
; CHECK-NEXT:    ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload
; CHECK-NEXT:    ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload
; CHECK-NEXT:    ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload
; CHECK-NEXT:    ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload
; CHECK-NEXT:    ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload
; CHECK-NEXT:    ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload
; CHECK-NEXT:    ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload
; CHECK-NEXT:    ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
; CHECK-NEXT:    ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
; CHECK-NEXT:    ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
; CHECK-NEXT:    ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT:    ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
; CHECK-NEXT:    ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
; CHECK-NEXT:    ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
; CHECK-NEXT:    ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
; CHECK-NEXT:    ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
; CHECK-NEXT:    ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
; CHECK-NEXT:    ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
; CHECK-NEXT:    ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
; CHECK-NEXT:    addvl sp, sp, #18
; CHECK-NEXT:    ldp x29, x30, [sp], #32 // 16-byte Folded Reload
; CHECK-NEXT:    ret
  call void @streaming_callee()
  ret <vscale x 4 x i32> %x;
}

; Call streaming callee twice; there should be no spills/fills between the two
; calls since the registers should have already been clobbered.
;
; The expected assembly has a single `smstart sm` before the first `bl` and a
; single `smstop sm` after the second, with the two `bl` instructions directly
; adjacent. z0 (holding %x) is spilled once before `smstart sm` and reloaded
; once after `smstop sm`.
define <vscale x 4 x i32> @smstart_clobber_sve_duplicate(<vscale x 4 x i32> %x) nounwind {
; CHECK-LABEL: smstart_clobber_sve_duplicate:
; CHECK:       // %bb.0:
; CHECK-NEXT:    stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
; CHECK-NEXT:    cntd x9
; CHECK-NEXT:    str x9, [sp, #16] // 8-byte Folded Spill
; CHECK-NEXT:    addvl sp, sp, #-18
; CHECK-NEXT:    str p15, [sp, #4, mul vl] // 2-byte Folded Spill
; CHECK-NEXT:    str p14, [sp, #5, mul vl] // 2-byte Folded Spill
; CHECK-NEXT:    str p13, [sp, #6, mul vl] // 2-byte Folded Spill
; CHECK-NEXT:    str p12, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT:    str p11, [sp, #8, mul vl] // 2-byte Folded Spill
; CHECK-NEXT:    str p10, [sp, #9, mul vl] // 2-byte Folded Spill
; CHECK-NEXT:    str p9, [sp, #10, mul vl] // 2-byte Folded Spill
; CHECK-NEXT:    str p8, [sp, #11, mul vl] // 2-byte Folded Spill
; CHECK-NEXT:    str p7, [sp, #12, mul vl] // 2-byte Folded Spill
; CHECK-NEXT:    str p6, [sp, #13, mul vl] // 2-byte Folded Spill
; CHECK-NEXT:    str p5, [sp, #14, mul vl] // 2-byte Folded Spill
; CHECK-NEXT:    str p4, [sp, #15, mul vl] // 2-byte Folded Spill
; CHECK-NEXT:    str z23, [sp, #2, mul vl] // 16-byte Folded Spill
; CHECK-NEXT:    str z22, [sp, #3, mul vl] // 16-byte Folded Spill
; CHECK-NEXT:    str z21, [sp, #4, mul vl] // 16-byte Folded Spill
; CHECK-NEXT:    str z20, [sp, #5, mul vl] // 16-byte Folded Spill
; CHECK-NEXT:    str z19, [sp, #6, mul vl] // 16-byte Folded Spill
; CHECK-NEXT:    str z18, [sp, #7, mul vl] // 16-byte Folded Spill
; CHECK-NEXT:    str z17, [sp, #8, mul vl] // 16-byte Folded Spill
; CHECK-NEXT:    str z16, [sp, #9, mul vl] // 16-byte Folded Spill
; CHECK-NEXT:    str z15, [sp, #10, mul vl] // 16-byte Folded Spill
; CHECK-NEXT:    str z14, [sp, #11, mul vl] // 16-byte Folded Spill
; CHECK-NEXT:    str z13, [sp, #12, mul vl] // 16-byte Folded Spill
; CHECK-NEXT:    str z12, [sp, #13, mul vl] // 16-byte Folded Spill
; CHECK-NEXT:    str z11, [sp, #14, mul vl] // 16-byte Folded Spill
; CHECK-NEXT:    str z10, [sp, #15, mul vl] // 16-byte Folded Spill
; CHECK-NEXT:    str z9, [sp, #16, mul vl] // 16-byte Folded Spill
; CHECK-NEXT:    str z8, [sp, #17, mul vl] // 16-byte Folded Spill
; CHECK-NEXT:    addvl sp, sp, #-1
; CHECK-NEXT:    str z0, [sp] // 16-byte Folded Spill
; CHECK-NEXT:    smstart sm
; CHECK-NEXT:    bl streaming_callee
; CHECK-NEXT:    bl streaming_callee
; CHECK-NEXT:    smstop sm
; CHECK-NEXT:    ldr z0, [sp] // 16-byte Folded Reload
; CHECK-NEXT:    addvl sp, sp, #1
; CHECK-NEXT:    ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload
; CHECK-NEXT:    ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload
; CHECK-NEXT:    ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload
; CHECK-NEXT:    ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload
; CHECK-NEXT:    ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload
; CHECK-NEXT:    ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload
; CHECK-NEXT:    ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload
; CHECK-NEXT:    ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload
; CHECK-NEXT:    ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload
; CHECK-NEXT:    ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload
; CHECK-NEXT:    ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload
; CHECK-NEXT:    ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload
; CHECK-NEXT:    ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload
; CHECK-NEXT:    ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload
; CHECK-NEXT:    ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload
; CHECK-NEXT:    ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload
; CHECK-NEXT:    ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
; CHECK-NEXT:    ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
; CHECK-NEXT:    ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
; CHECK-NEXT:    ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT:    ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
; CHECK-NEXT:    ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
; CHECK-NEXT:    ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
; CHECK-NEXT:    ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
; CHECK-NEXT:    ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
; CHECK-NEXT:    ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
; CHECK-NEXT:    ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
; CHECK-NEXT:    ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
; CHECK-NEXT:    addvl sp, sp, #18
; CHECK-NEXT:    ldp x29, x30, [sp], #32 // 16-byte Folded Reload
; CHECK-NEXT:    ret
  call void @streaming_callee()
  call void @streaming_callee()
  ret <vscale x 4 x i32> %x;
}

; Ensure smstart is not removed, because call to llvm.cos is not part of a chain.
;
; The caller is "aarch64_pstate_sm_enabled" and the intrinsic is expected to be
; emitted as a call to `cos`, wrapped in `smstop sm` / `smstart sm`. %x is used
; both as the call argument and by the later fadd, so the expected assembly
; keeps it on the stack: d0 is stored before `smstop sm`, reloaded for the
; call, and the call result is stored before `smstart sm` so that both
; operands can be reloaded for the `fadd` afterwards.
define double @call_to_intrinsic_without_chain(double %x) nounwind "aarch64_pstate_sm_enabled" {
; CHECK-LABEL: call_to_intrinsic_without_chain:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    sub sp, sp, #96
; CHECK-NEXT:    cntd x9
; CHECK-NEXT:    stp d15, d14, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT:    stp d13, d12, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT:    stp d11, d10, [sp, #48] // 16-byte Folded Spill
; CHECK-NEXT:    stp d9, d8, [sp, #64] // 16-byte Folded Spill
; CHECK-NEXT:    stp x30, x9, [sp, #80] // 16-byte Folded Spill
; CHECK-NEXT:    stp d0, d0, [sp] // 16-byte Folded Spill
; CHECK-NEXT:    smstop sm
; CHECK-NEXT:    ldr d0, [sp] // 8-byte Folded Reload
; CHECK-NEXT:    bl cos
; CHECK-NEXT:    str d0, [sp] // 8-byte Folded Spill
; CHECK-NEXT:    smstart sm
; CHECK-NEXT:    ldp d1, d0, [sp] // 16-byte Folded Reload
; CHECK-NEXT:    fadd d0, d1, d0
; CHECK-NEXT:    ldp d9, d8, [sp, #64] // 16-byte Folded Reload
; CHECK-NEXT:    ldr x30, [sp, #80] // 8-byte Folded Reload
; CHECK-NEXT:    ldp d11, d10, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT:    ldp d13, d12, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT:    ldp d15, d14, [sp, #16] // 16-byte Folded Reload
; CHECK-NEXT:    add sp, sp, #96
; CHECK-NEXT:    ret
entry:
  %res = call fast double @llvm.cos.f64(double %x)
  %res.fadd = fadd fast double %res, %x
  ret double %res.fadd
}

; Intrinsic called by @call_to_intrinsic_without_chain.
declare double @llvm.cos.f64(double)

; Ensure that tail call optimization is disabled when the streaming mode
; doesn't match.
;
; The IR marks the call to @streaming_callee as `tail call`, but the caller has
; no streaming attribute. The expected assembly therefore uses an ordinary
; `bl` wrapped in `smstart sm` / `smstop sm`, followed by a normal epilogue and
; `ret`, instead of a tail branch.
define void @disable_tailcallopt() nounwind {
; CHECK-LABEL: disable_tailcallopt:
; CHECK:       // %bb.0:
; CHECK-NEXT:    stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
; CHECK-NEXT:    cntd x9
; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
; CHECK-NEXT:    stp x30, x9, [sp, #64] // 16-byte Folded Spill
; CHECK-NEXT:    smstart sm
; CHECK-NEXT:    bl streaming_callee
; CHECK-NEXT:    smstop sm
; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT:    ldr x30, [sp, #64] // 8-byte Folded Reload
; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
; CHECK-NEXT:    ldp d15, d14, [sp], #80 // 16-byte Folded Reload
; CHECK-NEXT:    ret
  tail call void @streaming_callee()
  ret void;
}

; A streaming caller (attribute group #0) passes the addresses of three
; scalable-vector stack objects, plus the result of llvm.aarch64.sme.cntsb, to
; @foo, which is declared without a streaming attribute. It then returns
; element 0 of %Data1.
;
; The expected assembly sets up all four argument registers (x0-x2 from the
; stack objects, x3 via `rdsvl x3, #1`) before `smstop sm`, and loads from
; %Data1 only after `smstart sm`.
define i8 @call_to_non_streaming_pass_sve_objects(ptr nocapture noundef readnone %ptr) #0 {
; CHECK-LABEL: call_to_non_streaming_pass_sve_objects:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
; CHECK-NEXT:    cntd x9
; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
; CHECK-NEXT:    stp x29, x30, [sp, #64] // 16-byte Folded Spill
; CHECK-NEXT:    str x9, [sp, #80] // 8-byte Folded Spill
; CHECK-NEXT:    addvl sp, sp, #-3
; CHECK-NEXT:    rdsvl x3, #1
; CHECK-NEXT:    addvl x0, sp, #2
; CHECK-NEXT:    addvl x1, sp, #1
; CHECK-NEXT:    mov x2, sp
; CHECK-NEXT:    smstop sm
; CHECK-NEXT:    bl foo
; CHECK-NEXT:    smstart sm
; CHECK-NEXT:    ptrue p0.b
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [sp, #2, mul vl]
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    addvl sp, sp, #3
; CHECK-NEXT:    ldp x29, x30, [sp, #64] // 16-byte Folded Reload
; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
; CHECK-NEXT:    ldp d15, d14, [sp], #96 // 16-byte Folded Reload
; CHECK-NEXT:    ret
entry:
  %Data1 = alloca <vscale x 16 x i8>, align 16
  %Data2 = alloca <vscale x 16 x i8>, align 16
  %Data3 = alloca <vscale x 16 x i8>, align 16
  %0 = tail call i64 @llvm.aarch64.sme.cntsb()
  call void @foo(ptr noundef nonnull %Data1, ptr noundef nonnull %Data2, ptr noundef nonnull %Data3, i64 noundef %0)
  %1 = load <vscale x 16 x i8>, ptr %Data1, align 16
  %vecext = extractelement <vscale x 16 x i8> %1, i64 0
  ret i8 %vecext
}

; A streaming caller (attribute group #0) forwards all of its pointer, integer,
; float and double arguments unchanged to @bar, which is declared without a
; streaming attribute.
;
; In the expected assembly the floating-point arguments (s0, s1, d2, d3) are
; spilled before `smstop sm` and reloaded after it, just ahead of `bl bar`.
; No such spill/reload appears for the pointer and integer arguments.
define void @call_to_non_streaming_pass_args(ptr nocapture noundef readnone %ptr, i64 %long1, i64 %long2, i32 %int1, i32 %int2, float %float1, float %float2, double %double1, double %double2) #0 {
; CHECK-LABEL: call_to_non_streaming_pass_args:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    sub sp, sp, #112
; CHECK-NEXT:    cntd x9
; CHECK-NEXT:    stp d15, d14, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT:    stp d13, d12, [sp, #48] // 16-byte Folded Spill
; CHECK-NEXT:    stp d11, d10, [sp, #64] // 16-byte Folded Spill
; CHECK-NEXT:    stp d9, d8, [sp, #80] // 16-byte Folded Spill
; CHECK-NEXT:    stp x30, x9, [sp, #96] // 16-byte Folded Spill
; CHECK-NEXT:    stp d2, d3, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT:    stp s0, s1, [sp, #8] // 8-byte Folded Spill
; CHECK-NEXT:    smstop sm
; CHECK-NEXT:    ldp s0, s1, [sp, #8] // 8-byte Folded Reload
; CHECK-NEXT:    ldp d2, d3, [sp, #16] // 16-byte Folded Reload
; CHECK-NEXT:    bl bar
; CHECK-NEXT:    smstart sm
; CHECK-NEXT:    ldp d9, d8, [sp, #80] // 16-byte Folded Reload
; CHECK-NEXT:    ldr x30, [sp, #96] // 8-byte Folded Reload
; CHECK-NEXT:    ldp d11, d10, [sp, #64] // 16-byte Folded Reload
; CHECK-NEXT:    ldp d13, d12, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT:    ldp d15, d14, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT:    add sp, sp, #112
; CHECK-NEXT:    ret
entry:
  call void @bar(ptr noundef nonnull %ptr, i64 %long1, i64 %long2, i32 %int1, i32 %int2, float %float1, float %float2, double %double1, double %double2)
  ret void
}

; Intrinsic used by @call_to_non_streaming_pass_sve_objects.
declare i64 @llvm.aarch64.sme.cntsb()

; External callees without a streaming attribute, called from the streaming
; callers @call_to_non_streaming_pass_sve_objects and
; @call_to_non_streaming_pass_args.
declare void @foo(ptr noundef, ptr noundef, ptr noundef, i64 noundef)
declare void @bar(ptr noundef, i64 noundef, i64 noundef, i32 noundef, i32 noundef, float noundef, float noundef, double noundef, double noundef)

; Attribute group shared by the two call_to_non_streaming_* callers above.
attributes #0 = { nounwind vscale_range(1,16) "aarch64_pstate_sm_enabled" }