; llvm/llvm/test/CodeGen/X86/sse-regcall.ll

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i386-pc-win32 -mattr=+sse | FileCheck --check-prefix=WIN32 %s
; RUN: llc < %s -mtriple=x86_64-win32 -mattr=+sse | FileCheck --check-prefix=WIN64 %s
; RUN: llc < %s -mtriple=x86_64-linux-gnu -mattr=+sse | FileCheck --check-prefix=LINUXOSX %s

; Test regcall when receiving and returning an i1.
;
; The function takes a single i1 argument, adds 1 to it and returns the sum.
; For all three targets the expected code is a single `incb %al` followed by
; the return, i.e. the argument is read from, and the result left in, the low
; byte of EAX.
define x86_regcallcc i1 @test_argReti1(i1 %a)  {
; WIN32-LABEL: test_argReti1:
; WIN32:       # %bb.0:
; WIN32-NEXT:    incb %al
; WIN32-NEXT:    # kill: def $al killed $al killed $eax
; WIN32-NEXT:    retl
;
; WIN64-LABEL: test_argReti1:
; WIN64:       # %bb.0:
; WIN64-NEXT:    incb %al
; WIN64-NEXT:    # kill: def $al killed $al killed $eax
; WIN64-NEXT:    retq
;
; LINUXOSX-LABEL: test_argReti1:
; LINUXOSX:       # %bb.0:
; LINUXOSX-NEXT:    incb %al
; LINUXOSX-NEXT:    # kill: def $al killed $al killed $eax
; LINUXOSX-NEXT:    retq
  %add = add i1 %a, 1
  ret i1 %add
}

; Test regcall when passing an i1 to, and retrieving an i1 from, a callee.
;
; The function adds 1 to its i1 argument, passes the sum to test_argReti1
; using the regcall convention, adds 1 to the returned value and returns that.
; The expected code zero-extends the outgoing value (`movzbl %al, %eax`) before
; the call on every target. Both 64-bit targets additionally wrap the call in a
; push/pop pair (presumably to keep the stack aligned at the call site), and
; the 64-bit Windows target emits the matching SEH unwind directives.
define x86_regcallcc i1 @test_CallargReti1(i1 %a)  {
; WIN32-LABEL: test_CallargReti1:
; WIN32:       # %bb.0:
; WIN32-NEXT:    incb %al
; WIN32-NEXT:    movzbl %al, %eax
; WIN32-NEXT:    calll _test_argReti1
; WIN32-NEXT:    incb %al
; WIN32-NEXT:    retl
;
; WIN64-LABEL: test_CallargReti1:
; WIN64:       # %bb.0:
; WIN64-NEXT:    pushq %rax
; WIN64-NEXT:    .seh_stackalloc 8
; WIN64-NEXT:    .seh_endprologue
; WIN64-NEXT:    incb %al
; WIN64-NEXT:    movzbl %al, %eax
; WIN64-NEXT:    callq test_argReti1
; WIN64-NEXT:    incb %al
; WIN64-NEXT:    popq %rcx
; WIN64-NEXT:    retq
; WIN64-NEXT:    .seh_endproc
;
; LINUXOSX-LABEL: test_CallargReti1:
; LINUXOSX:       # %bb.0:
; LINUXOSX-NEXT:    pushq %rax
; LINUXOSX-NEXT:    .cfi_def_cfa_offset 16
; LINUXOSX-NEXT:    incb %al
; LINUXOSX-NEXT:    movzbl %al, %eax
; LINUXOSX-NEXT:    callq *test_argReti1@GOTPCREL(%rip)
; LINUXOSX-NEXT:    incb %al
; LINUXOSX-NEXT:    popq %rcx
; LINUXOSX-NEXT:    .cfi_def_cfa_offset 8
; LINUXOSX-NEXT:    retq
  %b = add i1 %a, 1
  %c = call x86_regcallcc i1 @test_argReti1(i1 %b)
  %d = add i1 %c, 1
  ret i1 %d
}

; Test calling conventions - input parameters, callee-saved XMM registers.
;
; Computes ((%a + %b) - (%a * %b)) + %c element-wise on <16 x float> values.
; The expected code handles each operand as four 128-bit XMM-sized pieces and
; leaves the result in xmm0-xmm3.
;
; What the expected assembly pins down per target:
;  * 32-bit Windows: %a and %b are read from xmm0-xmm7, while the four pieces
;    of %c are loaded from the caller's frame at 8/24/40/56(%ebp). The frame is
;    realigned to 16 bytes and one incoming register (xmm7) is spilled and
;    reloaded to free a register.
;  * Both 64-bit targets: %a, %b and %c are read from xmm0-xmm11. xmm12-xmm15
;    serve as temporaries and are saved on entry and restored before return.
;    The 64-bit Windows checks include an explicit `subq $72, %rsp`; the Linux
;    checks show no stack-pointer adjustment.
define x86_regcallcc <16 x float> @testf32_inp(<16 x float> %a, <16 x float> %b, <16 x float> %c) nounwind {
; WIN32-LABEL: testf32_inp:
; WIN32:       # %bb.0:
; WIN32-NEXT:    pushl %ebp
; WIN32-NEXT:    movl %esp, %ebp
; WIN32-NEXT:    andl $-16, %esp
; WIN32-NEXT:    subl $32, %esp
; WIN32-NEXT:    movaps %xmm7, (%esp) # 16-byte Spill
; WIN32-NEXT:    movaps %xmm6, %xmm7
; WIN32-NEXT:    movaps %xmm5, %xmm6
; WIN32-NEXT:    movaps %xmm4, %xmm5
; WIN32-NEXT:    movaps %xmm1, %xmm4
; WIN32-NEXT:    movaps %xmm0, %xmm1
; WIN32-NEXT:    addps %xmm5, %xmm0
; WIN32-NEXT:    mulps %xmm5, %xmm1
; WIN32-NEXT:    subps %xmm1, %xmm0
; WIN32-NEXT:    movups 8(%ebp), %xmm1
; WIN32-NEXT:    addps %xmm1, %xmm0
; WIN32-NEXT:    movaps %xmm4, %xmm1
; WIN32-NEXT:    addps %xmm6, %xmm1
; WIN32-NEXT:    mulps %xmm6, %xmm4
; WIN32-NEXT:    subps %xmm4, %xmm1
; WIN32-NEXT:    movups 24(%ebp), %xmm4
; WIN32-NEXT:    addps %xmm4, %xmm1
; WIN32-NEXT:    movaps %xmm2, %xmm4
; WIN32-NEXT:    addps %xmm7, %xmm4
; WIN32-NEXT:    mulps %xmm7, %xmm2
; WIN32-NEXT:    subps %xmm2, %xmm4
; WIN32-NEXT:    movups 40(%ebp), %xmm2
; WIN32-NEXT:    addps %xmm2, %xmm4
; WIN32-NEXT:    movaps %xmm3, %xmm5
; WIN32-NEXT:    movaps (%esp), %xmm2 # 16-byte Reload
; WIN32-NEXT:    addps %xmm2, %xmm5
; WIN32-NEXT:    mulps %xmm2, %xmm3
; WIN32-NEXT:    subps %xmm3, %xmm5
; WIN32-NEXT:    movups 56(%ebp), %xmm2
; WIN32-NEXT:    addps %xmm2, %xmm5
; WIN32-NEXT:    movaps %xmm4, %xmm2
; WIN32-NEXT:    movaps %xmm5, %xmm3
; WIN32-NEXT:    movl %ebp, %esp
; WIN32-NEXT:    popl %ebp
; WIN32-NEXT:    retl
;
; WIN64-LABEL: testf32_inp:
; WIN64:       # %bb.0:
; WIN64-NEXT:    subq $72, %rsp
; WIN64-NEXT:    movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; WIN64-NEXT:    movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; WIN64-NEXT:    movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; WIN64-NEXT:    movaps %xmm12, (%rsp) # 16-byte Spill
; WIN64-NEXT:    movaps %xmm0, %xmm12
; WIN64-NEXT:    addps %xmm4, %xmm12
; WIN64-NEXT:    movaps %xmm1, %xmm13
; WIN64-NEXT:    addps %xmm5, %xmm13
; WIN64-NEXT:    movaps %xmm2, %xmm14
; WIN64-NEXT:    addps %xmm6, %xmm14
; WIN64-NEXT:    movaps %xmm3, %xmm15
; WIN64-NEXT:    addps %xmm7, %xmm15
; WIN64-NEXT:    mulps %xmm4, %xmm0
; WIN64-NEXT:    subps %xmm0, %xmm12
; WIN64-NEXT:    mulps %xmm5, %xmm1
; WIN64-NEXT:    subps %xmm1, %xmm13
; WIN64-NEXT:    mulps %xmm6, %xmm2
; WIN64-NEXT:    subps %xmm2, %xmm14
; WIN64-NEXT:    mulps %xmm7, %xmm3
; WIN64-NEXT:    subps %xmm3, %xmm15
; WIN64-NEXT:    addps %xmm8, %xmm12
; WIN64-NEXT:    addps %xmm9, %xmm13
; WIN64-NEXT:    addps %xmm10, %xmm14
; WIN64-NEXT:    addps %xmm11, %xmm15
; WIN64-NEXT:    movaps %xmm12, %xmm0
; WIN64-NEXT:    movaps %xmm13, %xmm1
; WIN64-NEXT:    movaps %xmm14, %xmm2
; WIN64-NEXT:    movaps %xmm15, %xmm3
; WIN64-NEXT:    movaps (%rsp), %xmm12 # 16-byte Reload
; WIN64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
; WIN64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
; WIN64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
; WIN64-NEXT:    addq $72, %rsp
; WIN64-NEXT:    retq
;
; LINUXOSX-LABEL: testf32_inp:
; LINUXOSX:       # %bb.0:
; LINUXOSX-NEXT:    movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; LINUXOSX-NEXT:    movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; LINUXOSX-NEXT:    movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; LINUXOSX-NEXT:    movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; LINUXOSX-NEXT:    movaps %xmm0, %xmm12
; LINUXOSX-NEXT:    addps %xmm4, %xmm12
; LINUXOSX-NEXT:    movaps %xmm1, %xmm13
; LINUXOSX-NEXT:    addps %xmm5, %xmm13
; LINUXOSX-NEXT:    movaps %xmm2, %xmm14
; LINUXOSX-NEXT:    addps %xmm6, %xmm14
; LINUXOSX-NEXT:    movaps %xmm3, %xmm15
; LINUXOSX-NEXT:    addps %xmm7, %xmm15
; LINUXOSX-NEXT:    mulps %xmm4, %xmm0
; LINUXOSX-NEXT:    subps %xmm0, %xmm12
; LINUXOSX-NEXT:    mulps %xmm5, %xmm1
; LINUXOSX-NEXT:    subps %xmm1, %xmm13
; LINUXOSX-NEXT:    mulps %xmm6, %xmm2
; LINUXOSX-NEXT:    subps %xmm2, %xmm14
; LINUXOSX-NEXT:    mulps %xmm7, %xmm3
; LINUXOSX-NEXT:    subps %xmm3, %xmm15
; LINUXOSX-NEXT:    addps %xmm8, %xmm12
; LINUXOSX-NEXT:    addps %xmm9, %xmm13
; LINUXOSX-NEXT:    addps %xmm10, %xmm14
; LINUXOSX-NEXT:    addps %xmm11, %xmm15
; LINUXOSX-NEXT:    movaps %xmm12, %xmm0
; LINUXOSX-NEXT:    movaps %xmm13, %xmm1
; LINUXOSX-NEXT:    movaps %xmm14, %xmm2
; LINUXOSX-NEXT:    movaps %xmm15, %xmm3
; LINUXOSX-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
; LINUXOSX-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
; LINUXOSX-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
; LINUXOSX-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
; LINUXOSX-NEXT:    retq
  %x1 = fadd <16 x float> %a, %b
  %x2 = fmul <16 x float> %a, %b
  %x3 = fsub <16 x float> %x1, %x2
  %x4 = fadd <16 x float> %x3, %c
  ret <16 x float> %x4
}

; Test calling conventions - input parameters, callee-saved GPRs.
;
; Takes twelve i32 arguments (%a1..%a6, %b1..%b6) and returns
;   (a1-a2)*(b1-b2) + (a3-a4)*(b3-b4) + (a5-a6)*(b5-b6)
; + (a1+a2)*(b1+b2) + (a3+a4)*(b3+b4) + (a5+a6)*(b5+b6)
; which keeps all twelve inputs live at once and so exercises both the
; register-passed and the stack-passed integer arguments.
;
; What the expected assembly pins down per target:
;  * 32-bit Windows: inputs are read from eax, ecx, edx, edi and esi, the
;    remaining ones are loaded from the stack; ebp and ebx are pushed on entry
;    and popped before return, and 12 bytes of spill space are used.
;  * 64-bit Windows: all twelve inputs are read from registers (eax, ecx, edx,
;    edi, esi, r8d-r12d, r14d, r15d); rbp and rbx are pushed and popped.
;  * 64-bit Linux: eleven inputs are read from registers (eax, ecx, edx, edi,
;    esi, r8d, r9d, r12d-r15d) and one is loaded from the stack; no registers
;    are pushed.
;
; NOTE(review): the signature is split across two lines, so the generated
; check lines sit between the first half of the parameter list and its
; continuation (the `i32 %b1, ...` line just before the body). This is still
; valid IR because the check lines are comments.
define x86_regcallcc i32 @testi32_inp(i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6,
; WIN32-LABEL: testi32_inp:
; WIN32:       # %bb.0:
; WIN32-NEXT:    pushl %ebp
; WIN32-NEXT:    pushl %ebx
; WIN32-NEXT:    subl $12, %esp
; WIN32-NEXT:    movl %esi, (%esp) # 4-byte Spill
; WIN32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; WIN32-NEXT:    movl %eax, %ebp
; WIN32-NEXT:    leal (%edx,%edi), %eax
; WIN32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; WIN32-NEXT:    movl %edx, %eax
; WIN32-NEXT:    subl %edi, %eax
; WIN32-NEXT:    movl %ebp, %edx
; WIN32-NEXT:    subl %ecx, %edx
; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ebx
; WIN32-NEXT:    subl {{[0-9]+}}(%esp), %ebx
; WIN32-NEXT:    imull %edx, %ebx
; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %esi
; WIN32-NEXT:    movl %esi, %edx
; WIN32-NEXT:    subl {{[0-9]+}}(%esp), %edx
; WIN32-NEXT:    imull %eax, %edx
; WIN32-NEXT:    addl %ebx, %edx
; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ebx
; WIN32-NEXT:    movl (%esp), %edi # 4-byte Reload
; WIN32-NEXT:    subl %ebx, %edi
; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; WIN32-NEXT:    movl %ecx, %eax
; WIN32-NEXT:    subl {{[0-9]+}}(%esp), %eax
; WIN32-NEXT:    imull %edi, %eax
; WIN32-NEXT:    addl %edx, %eax
; WIN32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
; WIN32-NEXT:    addl (%esp), %ebx # 4-byte Folded Reload
; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %edx
; WIN32-NEXT:    addl {{[0-9]+}}(%esp), %edx
; WIN32-NEXT:    imull %edx, %ebp
; WIN32-NEXT:    addl {{[0-9]+}}(%esp), %esi
; WIN32-NEXT:    imull {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
; WIN32-NEXT:    addl %esi, %ebp
; WIN32-NEXT:    addl {{[0-9]+}}(%esp), %ecx
; WIN32-NEXT:    imull %ebx, %ecx
; WIN32-NEXT:    addl %ecx, %ebp
; WIN32-NEXT:    addl %eax, %ebp
; WIN32-NEXT:    movl %ebp, %eax
; WIN32-NEXT:    addl $12, %esp
; WIN32-NEXT:    popl %ebx
; WIN32-NEXT:    popl %ebp
; WIN32-NEXT:    retl
;
; WIN64-LABEL: testi32_inp:
; WIN64:       # %bb.0:
; WIN64-NEXT:    pushq %rbp
; WIN64-NEXT:    pushq %rbx
; WIN64-NEXT:    # kill: def $edx killed $edx def $rdx
; WIN64-NEXT:    # kill: def $esi killed $esi def $rsi
; WIN64-NEXT:    # kill: def $r15d killed $r15d def $r15
; WIN64-NEXT:    # kill: def $r14d killed $r14d def $r14
; WIN64-NEXT:    # kill: def $r12d killed $r12d def $r12
; WIN64-NEXT:    # kill: def $r11d killed $r11d def $r11
; WIN64-NEXT:    # kill: def $r10d killed $r10d def $r10
; WIN64-NEXT:    # kill: def $r9d killed $r9d def $r9
; WIN64-NEXT:    # kill: def $r8d killed $r8d def $r8
; WIN64-NEXT:    # kill: def $edi killed $edi def $rdi
; WIN64-NEXT:    leal (%rdx,%rdi), %ebx
; WIN64-NEXT:    movl %edx, %ebp
; WIN64-NEXT:    subl %edi, %ebp
; WIN64-NEXT:    leal (%rsi,%r8), %edx
; WIN64-NEXT:    # kill: def $esi killed $esi killed $rsi
; WIN64-NEXT:    subl %r8d, %esi
; WIN64-NEXT:    leal (%r9,%r10), %edi
; WIN64-NEXT:    movl %r9d, %r8d
; WIN64-NEXT:    subl %r10d, %r8d
; WIN64-NEXT:    movl %eax, %r9d
; WIN64-NEXT:    subl %ecx, %r9d
; WIN64-NEXT:    imull %r9d, %r8d
; WIN64-NEXT:    leal (%r11,%r12), %r9d
; WIN64-NEXT:    movl %r11d, %r10d
; WIN64-NEXT:    subl %r12d, %r10d
; WIN64-NEXT:    imull %ebp, %r10d
; WIN64-NEXT:    addl %r8d, %r10d
; WIN64-NEXT:    leal (%r14,%r15), %r8d
; WIN64-NEXT:    movl %r14d, %r11d
; WIN64-NEXT:    subl %r15d, %r11d
; WIN64-NEXT:    imull %esi, %r11d
; WIN64-NEXT:    addl %r10d, %r11d
; WIN64-NEXT:    addl %ecx, %eax
; WIN64-NEXT:    imull %edi, %eax
; WIN64-NEXT:    imull %ebx, %r9d
; WIN64-NEXT:    addl %r9d, %eax
; WIN64-NEXT:    imull %edx, %r8d
; WIN64-NEXT:    addl %r8d, %eax
; WIN64-NEXT:    addl %r11d, %eax
; WIN64-NEXT:    popq %rbx
; WIN64-NEXT:    popq %rbp
; WIN64-NEXT:    retq
;
; LINUXOSX-LABEL: testi32_inp:
; LINUXOSX:       # %bb.0:
; LINUXOSX-NEXT:    # kill: def $edx killed $edx def $rdx
; LINUXOSX-NEXT:    # kill: def $esi killed $esi def $rsi
; LINUXOSX-NEXT:    # kill: def $r14d killed $r14d def $r14
; LINUXOSX-NEXT:    # kill: def $r13d killed $r13d def $r13
; LINUXOSX-NEXT:    # kill: def $r12d killed $r12d def $r12
; LINUXOSX-NEXT:    # kill: def $r9d killed $r9d def $r9
; LINUXOSX-NEXT:    # kill: def $r8d killed $r8d def $r8
; LINUXOSX-NEXT:    # kill: def $edi killed $edi def $rdi
; LINUXOSX-NEXT:    leal (%rdx,%rdi), %r10d
; LINUXOSX-NEXT:    movl %edx, %r11d
; LINUXOSX-NEXT:    subl %edi, %r11d
; LINUXOSX-NEXT:    leal (%rsi,%r8), %edx
; LINUXOSX-NEXT:    # kill: def $esi killed $esi killed $rsi
; LINUXOSX-NEXT:    subl %r8d, %esi
; LINUXOSX-NEXT:    leal (%r9,%r12), %edi
; LINUXOSX-NEXT:    movl %r9d, %r8d
; LINUXOSX-NEXT:    subl %r12d, %r8d
; LINUXOSX-NEXT:    movl %eax, %r9d
; LINUXOSX-NEXT:    subl %ecx, %r9d
; LINUXOSX-NEXT:    imull %r9d, %r8d
; LINUXOSX-NEXT:    leal (%r13,%r14), %r9d
; LINUXOSX-NEXT:    movl %r13d, %r12d
; LINUXOSX-NEXT:    subl %r14d, %r12d
; LINUXOSX-NEXT:    imull %r11d, %r12d
; LINUXOSX-NEXT:    movl {{[0-9]+}}(%rsp), %r11d
; LINUXOSX-NEXT:    addl %r8d, %r12d
; LINUXOSX-NEXT:    movl %r15d, %r8d
; LINUXOSX-NEXT:    subl %r11d, %r8d
; LINUXOSX-NEXT:    imull %esi, %r8d
; LINUXOSX-NEXT:    addl %r12d, %r8d
; LINUXOSX-NEXT:    addl %ecx, %eax
; LINUXOSX-NEXT:    imull %edi, %eax
; LINUXOSX-NEXT:    imull %r10d, %r9d
; LINUXOSX-NEXT:    addl %r9d, %eax
; LINUXOSX-NEXT:    addl %r15d, %r11d
; LINUXOSX-NEXT:    imull %edx, %r11d
; LINUXOSX-NEXT:    addl %r11d, %eax
; LINUXOSX-NEXT:    addl %r8d, %eax
; LINUXOSX-NEXT:    retq
                                      i32 %b1, i32 %b2, i32 %b3, i32 %b4, i32 %b5, i32 %b6) nounwind {
  %x1 = sub i32 %a1, %a2
  %x2 = sub i32 %a3, %a4
  %x3 = sub i32 %a5, %a6
  %y1 = sub i32 %b1, %b2
  %y2 = sub i32 %b3, %b4
  %y3 = sub i32 %b5, %b6
  %v1 = add i32 %a1, %a2
  %v2 = add i32 %a3, %a4
  %v3 = add i32 %a5, %a6
  %w1 = add i32 %b1, %b2
  %w2 = add i32 %b3, %b4
  %w3 = add i32 %b5, %b6
  %s1 = mul i32 %x1, %y1
  %s2 = mul i32 %x2, %y2
  %s3 = mul i32 %x3, %y3
  %t1 = mul i32 %v1, %w1
  %t2 = mul i32 %v2, %w2
  %t3 = mul i32 %v3, %w3
  %m1 = add i32 %s1, %s2
  %m2 = add i32 %m1, %s3
  %n1 = add i32 %t1, %t2
  %n2 = add i32 %n1, %t3
  %r1 = add i32 %m2, %n2
  ret i32 %r1
}

; Test that parameters which overflow the available registers are passed
; through the stack.
;
; Computes (%a + %b) + %c element-wise on <32 x float> values. The expected
; code handles each operand as eight 128-bit XMM-sized pieces (24 pieces of
; input in total) and leaves the result in xmm0-xmm7.
;
; What the expected assembly pins down per target:
;  * 32-bit Windows: eight pieces are read from xmm0-xmm7 and the other sixteen
;    are loaded from the caller's frame at 8..248(%ebp). The frame is realigned
;    to 16 bytes, and xmm6/xmm7 are spilled and later reloaded with their
;    pieces of the result.
;  * Both 64-bit targets: sixteen pieces are read from xmm0-xmm15 and the last
;    eight are folded into `addps` directly from the stack. The 64-bit Windows
;    checks additionally expect a push/pop of rax around the body.
define x86_regcallcc <32 x float> @testf32_stack(<32 x float> %a, <32 x float> %b, <32 x float> %c) nounwind {
; WIN32-LABEL: testf32_stack:
; WIN32:       # %bb.0:
; WIN32-NEXT:    pushl %ebp
; WIN32-NEXT:    movl %esp, %ebp
; WIN32-NEXT:    andl $-16, %esp
; WIN32-NEXT:    subl $48, %esp
; WIN32-NEXT:    movaps %xmm7, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; WIN32-NEXT:    movaps %xmm6, (%esp) # 16-byte Spill
; WIN32-NEXT:    movaps %xmm5, %xmm6
; WIN32-NEXT:    movaps %xmm4, %xmm5
; WIN32-NEXT:    movaps %xmm3, %xmm4
; WIN32-NEXT:    movaps %xmm2, %xmm3
; WIN32-NEXT:    movaps %xmm1, %xmm2
; WIN32-NEXT:    movaps %xmm0, %xmm1
; WIN32-NEXT:    movups 120(%ebp), %xmm7
; WIN32-NEXT:    movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; WIN32-NEXT:    addps %xmm7, %xmm0
; WIN32-NEXT:    movups 248(%ebp), %xmm7
; WIN32-NEXT:    addps %xmm7, %xmm0
; WIN32-NEXT:    movaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; WIN32-NEXT:    movups 104(%ebp), %xmm7
; WIN32-NEXT:    movaps (%esp), %xmm0 # 16-byte Reload
; WIN32-NEXT:    addps %xmm7, %xmm0
; WIN32-NEXT:    movups 232(%ebp), %xmm7
; WIN32-NEXT:    addps %xmm7, %xmm0
; WIN32-NEXT:    movaps %xmm0, (%esp) # 16-byte Spill
; WIN32-NEXT:    movups 88(%ebp), %xmm7
; WIN32-NEXT:    addps %xmm7, %xmm6
; WIN32-NEXT:    movups 216(%ebp), %xmm7
; WIN32-NEXT:    addps %xmm7, %xmm6
; WIN32-NEXT:    movups 72(%ebp), %xmm7
; WIN32-NEXT:    addps %xmm7, %xmm5
; WIN32-NEXT:    movups 200(%ebp), %xmm7
; WIN32-NEXT:    addps %xmm7, %xmm5
; WIN32-NEXT:    movups 56(%ebp), %xmm7
; WIN32-NEXT:    addps %xmm7, %xmm4
; WIN32-NEXT:    movups 184(%ebp), %xmm7
; WIN32-NEXT:    addps %xmm7, %xmm4
; WIN32-NEXT:    movups 40(%ebp), %xmm7
; WIN32-NEXT:    addps %xmm7, %xmm3
; WIN32-NEXT:    movups 168(%ebp), %xmm7
; WIN32-NEXT:    addps %xmm7, %xmm3
; WIN32-NEXT:    movups 24(%ebp), %xmm7
; WIN32-NEXT:    addps %xmm7, %xmm2
; WIN32-NEXT:    movups 152(%ebp), %xmm7
; WIN32-NEXT:    addps %xmm7, %xmm2
; WIN32-NEXT:    movups 8(%ebp), %xmm7
; WIN32-NEXT:    addps %xmm7, %xmm1
; WIN32-NEXT:    movups 136(%ebp), %xmm7
; WIN32-NEXT:    addps %xmm7, %xmm1
; WIN32-NEXT:    movaps %xmm1, %xmm0
; WIN32-NEXT:    movaps %xmm2, %xmm1
; WIN32-NEXT:    movaps %xmm3, %xmm2
; WIN32-NEXT:    movaps %xmm4, %xmm3
; WIN32-NEXT:    movaps %xmm5, %xmm4
; WIN32-NEXT:    movaps %xmm6, %xmm5
; WIN32-NEXT:    movaps (%esp), %xmm6 # 16-byte Reload
; WIN32-NEXT:    movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm7 # 16-byte Reload
; WIN32-NEXT:    movl %ebp, %esp
; WIN32-NEXT:    popl %ebp
; WIN32-NEXT:    retl
;
; WIN64-LABEL: testf32_stack:
; WIN64:       # %bb.0:
; WIN64-NEXT:    pushq %rax
; WIN64-NEXT:    addps %xmm15, %xmm7
; WIN64-NEXT:    addps %xmm14, %xmm6
; WIN64-NEXT:    addps %xmm13, %xmm5
; WIN64-NEXT:    addps %xmm12, %xmm4
; WIN64-NEXT:    addps %xmm11, %xmm3
; WIN64-NEXT:    addps %xmm10, %xmm2
; WIN64-NEXT:    addps %xmm9, %xmm1
; WIN64-NEXT:    addps %xmm8, %xmm0
; WIN64-NEXT:    addps {{[0-9]+}}(%rsp), %xmm0
; WIN64-NEXT:    addps {{[0-9]+}}(%rsp), %xmm1
; WIN64-NEXT:    addps {{[0-9]+}}(%rsp), %xmm2
; WIN64-NEXT:    addps {{[0-9]+}}(%rsp), %xmm3
; WIN64-NEXT:    addps {{[0-9]+}}(%rsp), %xmm4
; WIN64-NEXT:    addps {{[0-9]+}}(%rsp), %xmm5
; WIN64-NEXT:    addps {{[0-9]+}}(%rsp), %xmm6
; WIN64-NEXT:    addps {{[0-9]+}}(%rsp), %xmm7
; WIN64-NEXT:    popq %rax
; WIN64-NEXT:    retq
;
; LINUXOSX-LABEL: testf32_stack:
; LINUXOSX:       # %bb.0:
; LINUXOSX-NEXT:    addps %xmm15, %xmm7
; LINUXOSX-NEXT:    addps %xmm14, %xmm6
; LINUXOSX-NEXT:    addps %xmm13, %xmm5
; LINUXOSX-NEXT:    addps %xmm12, %xmm4
; LINUXOSX-NEXT:    addps %xmm11, %xmm3
; LINUXOSX-NEXT:    addps %xmm10, %xmm2
; LINUXOSX-NEXT:    addps %xmm9, %xmm1
; LINUXOSX-NEXT:    addps %xmm8, %xmm0
; LINUXOSX-NEXT:    addps {{[0-9]+}}(%rsp), %xmm0
; LINUXOSX-NEXT:    addps {{[0-9]+}}(%rsp), %xmm1
; LINUXOSX-NEXT:    addps {{[0-9]+}}(%rsp), %xmm2
; LINUXOSX-NEXT:    addps {{[0-9]+}}(%rsp), %xmm3
; LINUXOSX-NEXT:    addps {{[0-9]+}}(%rsp), %xmm4
; LINUXOSX-NEXT:    addps {{[0-9]+}}(%rsp), %xmm5
; LINUXOSX-NEXT:    addps {{[0-9]+}}(%rsp), %xmm6
; LINUXOSX-NEXT:    addps {{[0-9]+}}(%rsp), %xmm7
; LINUXOSX-NEXT:    retq
  %x1 = fadd <32 x float> %a, %b
  %x2 = fadd <32 x float> %x1, %c
  ret <32 x float> %x2
}