; llvm/llvm/test/CodeGen/AArch64/stack-probing-64k.ll

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=aarch64 < %s -verify-machineinstrs -enable-post-misched=false | FileCheck %s
; RUN: llc -mtriple=aarch64 < %s -verify-machineinstrs -enable-post-misched=false -global-isel | FileCheck %s

; Tests for prolog sequences for stack probing, when using a 64KiB stack guard.

; 64k bytes is the largest frame we can probe in one go.
; Expected prologue: a single 65536-byte SP decrement followed by one
; "str xzr, [sp]" probe at the new stack top.
define void @static_65536(ptr %out) #0 {
; CHECK-LABEL: static_65536:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    .cfi_offset w29, -16
; CHECK-NEXT:    sub sp, sp, #16, lsl #12 // =65536
; CHECK-NEXT:    .cfi_def_cfa_offset 65552
; CHECK-NEXT:    str xzr, [sp]
; CHECK-NEXT:    mov x8, sp
; CHECK-NEXT:    str x8, [x0]
; CHECK-NEXT:    add sp, sp, #16, lsl #12 // =65536
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    .cfi_def_cfa_offset 0
; CHECK-NEXT:    .cfi_restore w29
; CHECK-NEXT:    ret
entry:
  ; Allocate exactly 65536 bytes of stack and write the buffer's address
  ; through %out.
  %v = alloca i8, i64 65536, align 1
  store ptr %v, ptr %out, align 8
  ret void
}

; 64k+16 bytes, still needs just one probe.
; In the expected output the trailing 16-byte decrement is folded into the
; probe itself via the post-indexed store "str xzr, [sp], #-16".
define void @static_65552(ptr %out) #0 {
; CHECK-LABEL: static_65552:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    .cfi_offset w29, -16
; CHECK-NEXT:    sub sp, sp, #16, lsl #12 // =65536
; CHECK-NEXT:    .cfi_def_cfa_offset 65552
; CHECK-NEXT:    str xzr, [sp], #-16
; CHECK-NEXT:    .cfi_def_cfa_offset 65568
; CHECK-NEXT:    mov x8, sp
; CHECK-NEXT:    str x8, [x0]
; CHECK-NEXT:    add sp, sp, #16, lsl #12 // =65536
; CHECK-NEXT:    .cfi_def_cfa_offset 32
; CHECK-NEXT:    add sp, sp, #16
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    .cfi_def_cfa_offset 0
; CHECK-NEXT:    .cfi_restore w29
; CHECK-NEXT:    ret
entry:
  ; Allocate 65552 bytes (65536 + 16) of stack and write the buffer's
  ; address through %out.
  %v = alloca i8, i64 65552, align 1
  store ptr %v, ptr %out, align 8
  ret void
}

; 64k+1024 bytes, the largest frame which needs just one probe.
; After the probed 65536-byte step, the remaining 1024 bytes are allocated
; with a plain SUB and no further store to [sp].
define void @static_66560(ptr %out) #0 {
; CHECK-LABEL: static_66560:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    .cfi_offset w29, -16
; CHECK-NEXT:    sub sp, sp, #16, lsl #12 // =65536
; CHECK-NEXT:    .cfi_def_cfa_offset 65552
; CHECK-NEXT:    str xzr, [sp]
; CHECK-NEXT:    sub sp, sp, #1024
; CHECK-NEXT:    .cfi_def_cfa_offset 66576
; CHECK-NEXT:    mov x8, sp
; CHECK-NEXT:    str x8, [x0]
; CHECK-NEXT:    add sp, sp, #16, lsl #12 // =65536
; CHECK-NEXT:    .cfi_def_cfa_offset 1040
; CHECK-NEXT:    add sp, sp, #1024
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    .cfi_def_cfa_offset 0
; CHECK-NEXT:    .cfi_restore w29
; CHECK-NEXT:    ret
entry:
  ; Allocate 66560 bytes (65536 + 1024) of stack and write the buffer's
  ; address through %out.
  %v = alloca i8, i64 66560, align 1
  store ptr %v, ptr %out, align 8
  ret void
}

; 64k+1024+16 bytes, the smallest frame which needs two probes.
; Unlike the 1024-byte remainder case, the 1040-byte remainder here is
; followed by its own "str xzr, [sp]" probe.
define void @static_66576(ptr %out) #0 {
; CHECK-LABEL: static_66576:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    .cfi_offset w29, -16
; CHECK-NEXT:    sub sp, sp, #16, lsl #12 // =65536
; CHECK-NEXT:    .cfi_def_cfa_offset 65552
; CHECK-NEXT:    str xzr, [sp]
; CHECK-NEXT:    sub sp, sp, #1040
; CHECK-NEXT:    .cfi_def_cfa_offset 66592
; CHECK-NEXT:    str xzr, [sp]
; CHECK-NEXT:    mov x8, sp
; CHECK-NEXT:    str x8, [x0]
; CHECK-NEXT:    add sp, sp, #16, lsl #12 // =65536
; CHECK-NEXT:    .cfi_def_cfa_offset 1056
; CHECK-NEXT:    add sp, sp, #1040
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    .cfi_def_cfa_offset 0
; CHECK-NEXT:    .cfi_restore w29
; CHECK-NEXT:    ret
entry:
  ; Allocate 66576 bytes (65536 + 1024 + 16) of stack and write the buffer's
  ; address through %out.
  %v = alloca i8, i64 66576, align 1
  store ptr %v, ptr %out, align 8
  ret void
}

; 2*64k+1024, the largest frame needing two probes.
; Expected prologue: two probed 65536-byte steps emitted inline, then an
; unprobed 1024-byte remainder.
define void @static_132096(ptr %out) #0 {
; CHECK-LABEL: static_132096:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    .cfi_offset w29, -16
; CHECK-NEXT:    sub sp, sp, #16, lsl #12 // =65536
; CHECK-NEXT:    .cfi_def_cfa_offset 65552
; CHECK-NEXT:    str xzr, [sp]
; CHECK-NEXT:    sub sp, sp, #16, lsl #12 // =65536
; CHECK-NEXT:    .cfi_def_cfa_offset 131088
; CHECK-NEXT:    str xzr, [sp]
; CHECK-NEXT:    sub sp, sp, #1024
; CHECK-NEXT:    .cfi_def_cfa_offset 132112
; CHECK-NEXT:    mov x8, sp
; CHECK-NEXT:    str x8, [x0]
; CHECK-NEXT:    add sp, sp, #32, lsl #12 // =131072
; CHECK-NEXT:    .cfi_def_cfa_offset 1040
; CHECK-NEXT:    add sp, sp, #1024
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    .cfi_def_cfa_offset 0
; CHECK-NEXT:    .cfi_restore w29
; CHECK-NEXT:    ret
entry:
  ; Allocate 132096 bytes (2 * 65536 + 1024) of stack and write the buffer's
  ; address through %out.
  %v = alloca i8, i64 132096, align 1
  store ptr %v, ptr %out, align 8
  ret void
}

; 5*64k-16, the largest frame probed without a loop.
; Expected prologue: four probed 65536-byte steps emitted inline, then the
; final 65520 bytes allocated as 61440 + 4080 and probed once.
define void @static_327664(ptr %out) #0 {
; CHECK-LABEL: static_327664:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    .cfi_offset w29, -16
; CHECK-NEXT:    sub sp, sp, #16, lsl #12 // =65536
; CHECK-NEXT:    .cfi_def_cfa_offset 65552
; CHECK-NEXT:    str xzr, [sp]
; CHECK-NEXT:    sub sp, sp, #16, lsl #12 // =65536
; CHECK-NEXT:    .cfi_def_cfa_offset 131088
; CHECK-NEXT:    str xzr, [sp]
; CHECK-NEXT:    sub sp, sp, #16, lsl #12 // =65536
; CHECK-NEXT:    .cfi_def_cfa_offset 196624
; CHECK-NEXT:    str xzr, [sp]
; CHECK-NEXT:    sub sp, sp, #16, lsl #12 // =65536
; CHECK-NEXT:    .cfi_def_cfa_offset 262160
; CHECK-NEXT:    str xzr, [sp]
; CHECK-NEXT:    sub sp, sp, #15, lsl #12 // =61440
; CHECK-NEXT:    .cfi_def_cfa_offset 323600
; CHECK-NEXT:    sub sp, sp, #4080
; CHECK-NEXT:    .cfi_def_cfa_offset 327680
; CHECK-NEXT:    str xzr, [sp]
; CHECK-NEXT:    mov x8, sp
; CHECK-NEXT:    str x8, [x0]
; CHECK-NEXT:    add sp, sp, #79, lsl #12 // =323584
; CHECK-NEXT:    .cfi_def_cfa_offset 4096
; CHECK-NEXT:    add sp, sp, #4080
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    .cfi_def_cfa_offset 0
; CHECK-NEXT:    .cfi_restore w29
; CHECK-NEXT:    ret
entry:
  ; Allocate 327664 bytes (5 * 65536 - 16) of stack and write the buffer's
  ; address through %out.
  %v = alloca i8, i64 327664, align 1
  store ptr %v, ptr %out, align 8
  ret void
}

; 5*64k, smallest frame probed with a loop.
; In the expected output x9 holds the final SP value (sp - 327680). The loop
; lowers SP by 65536 and probes it on each iteration until SP equals x9.
; The CFA is described relative to x9 while the loop runs and switches back
; to SP once it has finished.
define void @static_327680(ptr %out) #0 {
; CHECK-LABEL: static_327680:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    .cfi_offset w29, -16
; CHECK-NEXT:    sub x9, sp, #80, lsl #12 // =327680
; CHECK-NEXT:    .cfi_def_cfa w9, 327696
; CHECK-NEXT:  .LBB6_1: // %entry
; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    sub sp, sp, #16, lsl #12 // =65536
; CHECK-NEXT:    str xzr, [sp]
; CHECK-NEXT:    cmp sp, x9
; CHECK-NEXT:    b.ne .LBB6_1
; CHECK-NEXT:  // %bb.2: // %entry
; CHECK-NEXT:    .cfi_def_cfa_register wsp
; CHECK-NEXT:    mov x8, sp
; CHECK-NEXT:    str x8, [x0]
; CHECK-NEXT:    add sp, sp, #80, lsl #12 // =327680
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    .cfi_def_cfa_offset 0
; CHECK-NEXT:    .cfi_restore w29
; CHECK-NEXT:    ret
entry:
  ; Allocate 327680 bytes (5 * 65536) of stack and write the buffer's
  ; address through %out.
  %v = alloca i8, i64 327680, align 1
  store ptr %v, ptr %out, align 8
  ret void
}

; 5*64k+1024, large enough to use a loop, but not a multiple of 64KiB
; so has a remainder, but no extra probe.
; The loop covers the 327680-byte multiple of 64KiB; the 1024-byte remainder
; is then allocated with a plain SUB and no further store to [sp].
define void @static_328704(ptr %out) #0 {
; CHECK-LABEL: static_328704:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    .cfi_offset w29, -16
; CHECK-NEXT:    sub x9, sp, #80, lsl #12 // =327680
; CHECK-NEXT:    .cfi_def_cfa w9, 327696
; CHECK-NEXT:  .LBB7_1: // %entry
; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    sub sp, sp, #16, lsl #12 // =65536
; CHECK-NEXT:    str xzr, [sp]
; CHECK-NEXT:    cmp sp, x9
; CHECK-NEXT:    b.ne .LBB7_1
; CHECK-NEXT:  // %bb.2: // %entry
; CHECK-NEXT:    .cfi_def_cfa_register wsp
; CHECK-NEXT:    sub sp, sp, #1024
; CHECK-NEXT:    .cfi_def_cfa_offset 328720
; CHECK-NEXT:    mov x8, sp
; CHECK-NEXT:    str x8, [x0]
; CHECK-NEXT:    add sp, sp, #80, lsl #12 // =327680
; CHECK-NEXT:    .cfi_def_cfa_offset 1040
; CHECK-NEXT:    add sp, sp, #1024
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    .cfi_def_cfa_offset 0
; CHECK-NEXT:    .cfi_restore w29
; CHECK-NEXT:    ret
entry:
  ; Allocate 328704 bytes (5 * 65536 + 1024) of stack and write the buffer's
  ; address through %out.
  %v = alloca i8, i64 328704, align 1
  store ptr %v, ptr %out, align 8
  ret void
}

; 5*64k+1040, large enough to use a loop, has a remainder and
; an extra probe.
; The loop covers the 327680-byte multiple of 64KiB; the 1040-byte remainder
; is then allocated and followed by its own "str xzr, [sp]" probe.
define void @static_328720(ptr %out) #0 {
; CHECK-LABEL: static_328720:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    .cfi_offset w29, -16
; CHECK-NEXT:    sub x9, sp, #80, lsl #12 // =327680
; CHECK-NEXT:    .cfi_def_cfa w9, 327696
; CHECK-NEXT:  .LBB8_1: // %entry
; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    sub sp, sp, #16, lsl #12 // =65536
; CHECK-NEXT:    str xzr, [sp]
; CHECK-NEXT:    cmp sp, x9
; CHECK-NEXT:    b.ne .LBB8_1
; CHECK-NEXT:  // %bb.2: // %entry
; CHECK-NEXT:    .cfi_def_cfa_register wsp
; CHECK-NEXT:    sub sp, sp, #1040
; CHECK-NEXT:    .cfi_def_cfa_offset 328736
; CHECK-NEXT:    str xzr, [sp]
; CHECK-NEXT:    mov x8, sp
; CHECK-NEXT:    str x8, [x0]
; CHECK-NEXT:    add sp, sp, #80, lsl #12 // =327680
; CHECK-NEXT:    .cfi_def_cfa_offset 1056
; CHECK-NEXT:    add sp, sp, #1040
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    .cfi_def_cfa_offset 0
; CHECK-NEXT:    .cfi_restore w29
; CHECK-NEXT:    ret
entry:
  ; Allocate 328720 bytes (5 * 65536 + 1040) of stack and write the buffer's
  ; address through %out.
  %v = alloca i8, i64 328720, align 1
  store ptr %v, ptr %out, align 8
  ret void
}

; A small allocation, but with a very large alignment requirement. We do this
; by moving SP far enough that a sufficiently-aligned block will exist
; somewhere in the stack frame, so must probe the whole of that larger SP move.
; In the expected output x9 is the target SP: (sp - 131056) rounded down to a
; 131072-byte boundary. The loop lowers SP by 65536 per iteration and stores
; to [sp] only while SP is still above x9; on exit SP is set to x9 and probed
; with a load ("ldr xzr, [sp]"). x29 is set up as a frame pointer and the
; epilogue restores SP from it.
define void @static_16_align_131072(ptr %out) #0 {
; CHECK-LABEL: static_16_align_131072:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    mov x29, sp
; CHECK-NEXT:    .cfi_def_cfa w29, 16
; CHECK-NEXT:    .cfi_offset w30, -8
; CHECK-NEXT:    .cfi_offset w29, -16
; CHECK-NEXT:    sub x9, sp, #31, lsl #12 // =126976
; CHECK-NEXT:    sub x9, x9, #4080
; CHECK-NEXT:    and x9, x9, #0xfffffffffffe0000
; CHECK-NEXT:  .LBB9_1: // %entry
; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    sub sp, sp, #16, lsl #12 // =65536
; CHECK-NEXT:    cmp sp, x9
; CHECK-NEXT:    b.le .LBB9_3
; CHECK-NEXT:  // %bb.2: // %entry
; CHECK-NEXT:    // in Loop: Header=BB9_1 Depth=1
; CHECK-NEXT:    str xzr, [sp]
; CHECK-NEXT:    b .LBB9_1
; CHECK-NEXT:  .LBB9_3: // %entry
; CHECK-NEXT:    mov sp, x9
; CHECK-NEXT:    ldr xzr, [sp]
; CHECK-NEXT:    mov x8, sp
; CHECK-NEXT:    str x8, [x0]
; CHECK-NEXT:    mov sp, x29
; CHECK-NEXT:    .cfi_def_cfa wsp, 16
; CHECK-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
; CHECK-NEXT:    .cfi_def_cfa_offset 0
; CHECK-NEXT:    .cfi_restore w30
; CHECK-NEXT:    .cfi_restore w29
; CHECK-NEXT:    ret
entry:
  ; Allocate only 16 bytes, but aligned to 131072 bytes, and write the
  ; buffer's address through %out.
  %v = alloca i8, i64 16, align 131072
  store ptr %v, ptr %out, align 8
  ret void
}

; A small allocation, but with a very large alignment requirement which
; is nevertheless small enough as to not need a loop.
; In the expected output the new SP is computed in one step as (sp - 8176)
; rounded down to an 8192-byte boundary, then probed once with
; "str xzr, [sp]".
define void @static_16_align_8192(ptr %out) #0 {
; CHECK-LABEL: static_16_align_8192:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    mov x29, sp
; CHECK-NEXT:    .cfi_def_cfa w29, 16
; CHECK-NEXT:    .cfi_offset w30, -8
; CHECK-NEXT:    .cfi_offset w29, -16
; CHECK-NEXT:    sub x9, sp, #1, lsl #12 // =4096
; CHECK-NEXT:    sub x9, x9, #4080
; CHECK-NEXT:    and sp, x9, #0xffffffffffffe000
; CHECK-NEXT:    str xzr, [sp]
; CHECK-NEXT:    mov x8, sp
; CHECK-NEXT:    str x8, [x0]
; CHECK-NEXT:    mov sp, x29
; CHECK-NEXT:    .cfi_def_cfa wsp, 16
; CHECK-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
; CHECK-NEXT:    .cfi_def_cfa_offset 0
; CHECK-NEXT:    .cfi_restore w30
; CHECK-NEXT:    .cfi_restore w29
; CHECK-NEXT:    ret
entry:
  ; Allocate only 16 bytes, but aligned to 8192 bytes, and write the
  ; buffer's address through %out.
  %v = alloca i8, i64 16, align 8192
  store ptr %v, ptr %out, align 8
  ret void
}

; A large allocation with a very large alignment requirement which
; is nevertheless small enough as to not need a loop.
; In the expected output the new SP is computed in one step as (sp - 32752)
; rounded down to a 32768-byte boundary, then probed once with
; "str xzr, [sp]".
define void @static_32752_align_32k(ptr %out) #0 {
; CHECK-LABEL: static_32752_align_32k:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    mov x29, sp
; CHECK-NEXT:    .cfi_def_cfa w29, 16
; CHECK-NEXT:    .cfi_offset w30, -8
; CHECK-NEXT:    .cfi_offset w29, -16
; CHECK-NEXT:    sub x9, sp, #7, lsl #12 // =28672
; CHECK-NEXT:    sub x9, x9, #4080
; CHECK-NEXT:    and sp, x9, #0xffffffffffff8000
; CHECK-NEXT:    str xzr, [sp]
; CHECK-NEXT:    mov x8, sp
; CHECK-NEXT:    str x8, [x0]
; CHECK-NEXT:    mov sp, x29
; CHECK-NEXT:    .cfi_def_cfa wsp, 16
; CHECK-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
; CHECK-NEXT:    .cfi_def_cfa_offset 0
; CHECK-NEXT:    .cfi_restore w30
; CHECK-NEXT:    .cfi_restore w29
; CHECK-NEXT:    ret
entry:
  ; Allocate 32752 bytes aligned to 32768 bytes and write the buffer's
  ; address through %out.
  %v = alloca i8, i64 32752, align 32768
  store ptr %v, ptr %out, align 8
  ret void
}

; Attribute group referenced as #0 by the test functions in this file:
;   uwtable(async)             - unwind-table attribute; NOTE(review):
;                                presumably why the expected output carries a
;                                .cfi directive after each SP change - confirm.
;   "probe-stack"="inline-asm" - selects the "inline-asm" stack-probe mode.
;   "stack-probe-size"="65536" - 64KiB probe size, matching the 65536-byte
;                                steps seen in the expected prologues.
;   "frame-pointer"="none"     - frame-pointer attribute set to "none".
attributes #0 = { uwtable(async) "probe-stack"="inline-asm" "stack-probe-size"="65536" "frame-pointer"="none" }