# llvm/polly/docs/experiments/matmul/matmul.polly.interchanged+tiled+vector+openmp.s

	.text
	.file	"matmul.c"
	.globl	init_array              # -- Begin function init_array
	.p2align	4, 0x90
	.type	init_array,@function
# init_array: no register arguments are read and no return value is set.
# Hands init_array_polly_subfn to GOMP_parallel_loop_runtime_start with
# (fn, data, 0, 0, 0x600, 1) in rdi/rsi/edx/ecx/r8d/r9d, then runs the same
# sub-function on the calling thread and finishes with GOMP_parallel_end.
# rbx keeps the address of the 8-byte scratch slot at -16(%rbp) alive across
# the first call, because it is passed again as the sub-function argument.
init_array:                             # @init_array
	.cfi_startproc
# %bb.0:                                # %entry
	pushq	%rbp
	.cfi_def_cfa_offset 16
	.cfi_offset %rbp, -16
	movq	%rsp, %rbp
	.cfi_def_cfa_register %rbp
	pushq	%rbx
	pushq	%rax                    # scratch slot at -16(%rbp); rsp stays 16-byte aligned
	.cfi_offset %rbx, -24
	leaq	-16(%rbp), %rbx         # rbx = &scratch slot (survives the calls)
	movq	%rbx, %rsi              # arg 2: data pointer
	leaq	init_array_polly_subfn(%rip), %rdi # arg 1: worker function
	movl	$1, %r9d                # arg 6
	movl	$0x600, %r8d            # arg 5 (1536)
	xorl	%ecx, %ecx              # arg 4 = 0
	xorl	%edx, %edx              # arg 3 = 0
	callq	GOMP_parallel_loop_runtime_start@PLT
	movq	%rbx, %rdi              # same data pointer for the direct call
	callq	init_array_polly_subfn
	callq	GOMP_parallel_end@PLT
	addq	$8, %rsp                # drop the scratch slot
	popq	%rbx
	popq	%rbp
	.cfi_def_cfa %rsp, 8
	retq
.Lfunc_end0:
	.size	init_array, .Lfunc_end0-init_array
	.cfi_endproc
                                        # -- End function
	.globl	print_array             # -- Begin function print_array
	.p2align	4, 0x90
	.type	print_array,@function
# print_array: reads no register arguments; writes text to the stdout stream.
# Outer loop: 1536 rows of C, advancing the row pointer by 6144 bytes per row.
# Inner loop: 1536 columns; each 4-byte float is widened to double and passed
# to fprintf(stdout, .L.str, value). After a column with col % 80 == 79 it
# calls fputc(10, stdout); after every row it calls fputc(10, stdout) again.
# Register roles inside the loops:
#   r13        = address of the current row of C
#   rbx        = column index
#   r12        = 0xCCCCCCCD, multiplier of the divide-by-80 sequence
#   r14        = address of the format string .L.str
#   r15d       = (col / 80) * 80 + 79, compared with col after the fprintf call
#   -48(%rbp)  = spilled row counter
# NOTE(review): fprintf is the only external call here without @PLT, while
# fputc uses @PLT - confirm this is intended for the chosen link mode.
print_array:                            # @print_array
	.cfi_startproc
# %bb.0:                                # %entry
	pushq	%rbp
	.cfi_def_cfa_offset 16
	.cfi_offset %rbp, -16
	movq	%rsp, %rbp
	.cfi_def_cfa_register %rbp
	pushq	%r15
	pushq	%r14
	pushq	%r13
	pushq	%r12
	pushq	%rbx
	pushq	%rax                    # reserves the 8-byte slot at -48(%rbp) used for the spill below
	.cfi_offset %rbx, -56
	.cfi_offset %r12, -48
	.cfi_offset %r13, -40
	.cfi_offset %r14, -32
	.cfi_offset %r15, -24
	leaq	C(%rip), %r13           # r13 = first row of C
	xorl	%eax, %eax              # row counter = 0
	movl	$3435973837, %r12d      # imm = 0xCCCCCCCD
	leaq	.L.str(%rip), %r14      # r14 = format string
	.p2align	4, 0x90
.LBB1_1:                                # %for.cond1.preheader
                                        # =>This Loop Header: Depth=1
                                        #     Child Loop BB1_2 Depth 2
	movq	%rax, -48(%rbp)         # 8-byte Spill
	movq	stdout(%rip), %rsi      # rsi = stdout; moved to rdi at the top of the inner loop
	xorl	%ebx, %ebx              # column index = 0
	.p2align	4, 0x90
.LBB1_2:                                # %for.body3
                                        #   Parent Loop BB1_1 Depth=1
                                        # =>  This Inner Loop Header: Depth=2
	movl	%ebx, %eax              # rax = col, zero-extended
	imulq	%r12, %rax              # col * 0xCCCCCCCD
	shrq	$38, %rax               # rax = col / 80
	leal	(%rax,%rax,4), %r15d    # r15d = 5 * (col / 80)
	shll	$4, %r15d               # r15d = 80 * (col / 80)
	addl	$79, %r15d              # r15d = 80 * (col / 80) + 79
	movss	(%r13,%rbx,4), %xmm0    # xmm0 = mem[0],zero,zero,zero
	cvtss2sd	%xmm0, %xmm0    # widen float to double for the variadic call
	movb	$1, %al                 # al = 1: one vector register (xmm0) carries an argument
	movq	%rsi, %rdi              # arg 1 = stdout
	movq	%r14, %rsi              # arg 2 = format string
	callq	fprintf
	cmpl	%ebx, %r15d             # is col the last of its group of 80?
	jne	.LBB1_4
# %bb.3:                                # %if.then
                                        #   in Loop: Header=BB1_2 Depth=2
	movq	stdout(%rip), %rsi
	movl	$10, %edi               # character code 10 (line feed)
	callq	fputc@PLT
.LBB1_4:                                # %for.inc
                                        #   in Loop: Header=BB1_2 Depth=2
	addq	$1, %rbx
	movq	stdout(%rip), %rsi      # reload stdout for the next fprintf or the end-of-row fputc
	cmpq	$1536, %rbx             # imm = 0x600
	jne	.LBB1_2
# %bb.5:                                # %for.end
                                        #   in Loop: Header=BB1_1 Depth=1
	movl	$10, %edi               # end-of-row line feed; rsi already holds stdout
	callq	fputc@PLT
	movq	-48(%rbp), %rax         # 8-byte Reload
	addq	$1, %rax
	addq	$6144, %r13             # imm = 0x1800
	cmpq	$1536, %rax             # imm = 0x600
	jne	.LBB1_1
# %bb.6:                                # %for.end12
	addq	$8, %rsp                # drop the spill slot
	popq	%rbx
	popq	%r12
	popq	%r13
	popq	%r14
	popq	%r15
	popq	%rbp
	.cfi_def_cfa %rsp, 8
	retq
.Lfunc_end1:
	.size	print_array, .Lfunc_end1-print_array
	.cfi_endproc
                                        # -- End function
	.globl	main                    # -- Begin function main
	.p2align	4, 0x90
	.type	main,@function
# main: incoming argument registers are not read; returns 0 in eax.
# Sequence:
#   - call init_array
#   - parallel loop over main_polly_subfn:   args (fn, data, 0, 0, 0x600, 1)
#   - parallel loop over main_polly_subfn_1: args (fn, data, 0, 0, 0x600, 64)
# Each parallel loop is GOMP_parallel_loop_runtime_start, a direct call of the
# same sub-function on this thread, then GOMP_parallel_end.
# rbx holds the address of the scratch slot at -16(%rbp) that is passed as the
# data pointer; it is set once and reused for the second loop.
main:                                   # @main
	.cfi_startproc
# %bb.0:                                # %entry
	pushq	%rbp
	.cfi_def_cfa_offset 16
	.cfi_offset %rbp, -16
	movq	%rsp, %rbp
	.cfi_def_cfa_register %rbp
	pushq	%rbx
	pushq	%rax                    # scratch slot at -16(%rbp); rsp stays 16-byte aligned
	.cfi_offset %rbx, -24
	callq	init_array
# First parallel loop: main_polly_subfn, last argument 1.
	leaq	-16(%rbp), %rbx         # rbx = &scratch slot
	movq	%rbx, %rsi              # arg 2: data pointer
	leaq	main_polly_subfn(%rip), %rdi # arg 1: worker function
	movl	$1, %r9d                # arg 6
	movl	$0x600, %r8d            # arg 5 (1536)
	xorl	%ecx, %ecx              # arg 4 = 0
	xorl	%edx, %edx              # arg 3 = 0
	callq	GOMP_parallel_loop_runtime_start@PLT
	movq	%rbx, %rdi
	callq	main_polly_subfn
	callq	GOMP_parallel_end@PLT
# Second parallel loop: main_polly_subfn_1, last argument 64.
	movq	%rbx, %rsi              # arg 2: same data pointer
	leaq	main_polly_subfn_1(%rip), %rdi # arg 1: worker function
	movl	$64, %r9d               # arg 6
	movl	$0x600, %r8d            # arg 5 (1536)
	xorl	%ecx, %ecx              # arg 4 = 0
	xorl	%edx, %edx              # arg 3 = 0
	callq	GOMP_parallel_loop_runtime_start@PLT
	movq	%rbx, %rdi
	callq	main_polly_subfn_1
	callq	GOMP_parallel_end@PLT
	xorl	%eax, %eax              # return value 0
	addq	$8, %rsp                # drop the scratch slot
	popq	%rbx
	popq	%rbp
	.cfi_def_cfa %rsp, 8
	retq
.Lfunc_end2:
	.size	main, .Lfunc_end2-main
	.cfi_endproc
                                        # -- End function
	.section	.rodata.cst8,"aM",@progbits,8
	.p2align	3               # -- Begin function init_array_polly_subfn
.LCPI3_0:
	.quad	4602678819172646912     # double 0.5
	.text
	.p2align	4, 0x90
	.type	init_array_polly_subfn,@function
# init_array_polly_subfn: worker body for the parallel loop started by
# init_array. The incoming rdi is overwritten before it is ever read.
# Repeatedly asks GOMP_loop_runtime_next for a chunk, passing the addresses of
# two stack slots (8(%rsp) = lower bound, (%rsp) = upper bound). For every row
# from the lower bound through upper bound - 1 it fills 1536 floats of that row
# in both A and B with
#     (float)(((t & 1023) + 1) * 0.5)
# where t starts at 0 for each row and grows by the row index per column.
# When the runtime returns 0 it calls GOMP_loop_end_nowait and returns.
# Register roles:
#   r15 = address of B, r12 = address of A
#   r14 = &lower-bound slot, r13 = &upper-bound slot (kept across calls)
#   rax = current row, r8 = last row of the chunk (upper bound - 1)
#   rdx / rsi = row pointers into A / B (row stride 6144 bytes)
#   rdi = byte offset counting up from -6144 to 0, ecx = t, ebx = scratch
#   xmm1 = 0.5, reloaded after each runtime call (xmm registers are volatile)
init_array_polly_subfn:                 # @init_array_polly_subfn
	.cfi_startproc
# %bb.0:                                # %polly.par.setup
	pushq	%r15
	.cfi_def_cfa_offset 16
	pushq	%r14
	.cfi_def_cfa_offset 24
	pushq	%r13
	.cfi_def_cfa_offset 32
	pushq	%r12
	.cfi_def_cfa_offset 40
	pushq	%rbx
	.cfi_def_cfa_offset 48
	subq	$16, %rsp               # two 8-byte slots for the chunk bounds
	.cfi_def_cfa_offset 64
	.cfi_offset %rbx, -48
	.cfi_offset %r12, -40
	.cfi_offset %r13, -32
	.cfi_offset %r14, -24
	.cfi_offset %r15, -16
	leaq	8(%rsp), %rdi           # arg 1 = &lower bound
	movq	%rsp, %rsi              # arg 2 = &upper bound
	callq	GOMP_loop_runtime_next@PLT
	testb	%al, %al
	je	.LBB3_2                 # no chunk at all: go straight to the exit
# %bb.1:
	leaq	B(%rip), %r15
	leaq	A(%rip), %r12
	movsd	.LCPI3_0(%rip), %xmm1   # xmm1 = mem[0],zero
	leaq	8(%rsp), %r14
	movq	%rsp, %r13
	.p2align	4, 0x90
.LBB3_4:                                # %polly.par.loadIVBounds
                                        # =>This Loop Header: Depth=1
                                        #     Child Loop BB3_5 Depth 2
                                        #       Child Loop BB3_6 Depth 3
	movq	8(%rsp), %rax           # rax = first row of the chunk
	movq	(%rsp), %r8
	decq	%r8                     # r8 = last row of the chunk
	movq	%rax, %rdx
	shlq	$11, %rdx
	leaq	(%rdx,%rdx,2), %rdx     # rdx = row * 6144 (2048 * 3)
	leaq	(%r15,%rdx), %rsi       # rsi = B + row * 6144
	addq	%r12, %rdx              # rdx = A + row * 6144
	.p2align	4, 0x90
.LBB3_5:                                # %polly.loop_header
                                        #   Parent Loop BB3_4 Depth=1
                                        # =>  This Loop Header: Depth=2
                                        #       Child Loop BB3_6 Depth 3
	movq	$-6144, %rdi            # imm = 0xFFFFFFFFFFFFE800 (minus one row; counts up to 0)
	xorl	%ecx, %ecx              # t = 0
	.p2align	4, 0x90
.LBB3_6:                                # %polly.loop_header2
                                        #   Parent Loop BB3_4 Depth=1
                                        #     Parent Loop BB3_5 Depth=2
                                        # =>    This Inner Loop Header: Depth=3
	movl	%ecx, %ebx
	andl	$1023, %ebx             # imm = 0x3FF
	incl	%ebx                    # ebx = (t & 1023) + 1
	xorps	%xmm0, %xmm0            # clear xmm0 so the convert has no stale input dependency
	cvtsi2sdl	%ebx, %xmm0
	mulsd	%xmm1, %xmm0            # times 0.5
	cvtsd2ss	%xmm0, %xmm0    # narrow to float
	movss	%xmm0, 6144(%rdx,%rdi)  # store into the A row (+6144 cancels the negative offset)
	movss	%xmm0, 6144(%rsi,%rdi)  # same value into the B row
	addl	%eax, %ecx              # t += row (low 32 bits)
	addq	$4, %rdi                # next float; sets ZF when the offset reaches 0
	jne	.LBB3_6
# %bb.7:                                # %polly.loop_exit4
                                        #   in Loop: Header=BB3_5 Depth=2
	addq	$6144, %rsi             # imm = 0x1800
	addq	$6144, %rdx             # imm = 0x1800
	cmpq	%r8, %rax               # compare the row just finished with the last row
	leaq	1(%rax), %rax           # row++ via lea so the flags from cmpq survive
	jl	.LBB3_5
# %bb.3:                                # %polly.par.checkNext.loopexit
                                        #   in Loop: Header=BB3_4 Depth=1
	movq	%r14, %rdi
	movq	%r13, %rsi
	callq	GOMP_loop_runtime_next@PLT
	movsd	.LCPI3_0(%rip), %xmm1   # xmm1 = mem[0],zero
	testb	%al, %al
	jne	.LBB3_4
.LBB3_2:                                # %polly.par.exit
	callq	GOMP_loop_end_nowait@PLT
	addq	$16, %rsp
	.cfi_def_cfa_offset 48
	popq	%rbx
	.cfi_def_cfa_offset 40
	popq	%r12
	.cfi_def_cfa_offset 32
	popq	%r13
	.cfi_def_cfa_offset 24
	popq	%r14
	.cfi_def_cfa_offset 16
	popq	%r15
	.cfi_def_cfa_offset 8
	retq
.Lfunc_end3:
	.size	init_array_polly_subfn, .Lfunc_end3-init_array_polly_subfn
	.cfi_endproc
                                        # -- End function
	.p2align	4, 0x90         # -- Begin function main_polly_subfn
	.type	main_polly_subfn,@function
# main_polly_subfn: worker body for the first parallel loop started by main.
# The incoming rdi is overwritten before it is read.
# For every chunk handed out by GOMP_loop_runtime_next (lower bound in
# 8(%rsp), upper bound in (%rsp)) it calls
#     memset(C + lower * 6144, 0, count * 6144)
# with count = max(upper - 1, lower) + 1 - lower, then asks for the next chunk.
# When the runtime returns 0 it calls GOMP_loop_end_nowait and returns.
# Callee-saved registers kept across the calls:
#   rbx = &upper-bound slot, r14 = &lower-bound slot, r15 = address of C
main_polly_subfn:                       # @main_polly_subfn
	.cfi_startproc
# %bb.0:                                # %polly.par.setup
	pushq	%r15
	.cfi_def_cfa_offset 16
	pushq	%r14
	.cfi_def_cfa_offset 24
	pushq	%rbx
	.cfi_def_cfa_offset 32
	subq	$16, %rsp               # two 8-byte slots for the chunk bounds
	.cfi_def_cfa_offset 48
	.cfi_offset %rbx, -32
	.cfi_offset %r14, -24
	.cfi_offset %r15, -16
	movq	%rsp, %rsi              # arg 2 = &upper bound
	leaq	8(%rsp), %rdi           # arg 1 = &lower bound
	callq	GOMP_loop_runtime_next@PLT
	testb	%al, %al
	je	.Lmain_zero_done        # nothing to do for this thread
# %bb.1:
	movq	%rsp, %rbx
	leaq	8(%rsp), %r14
	leaq	C(%rip), %r15
	.p2align	4, 0x90
.Lmain_zero_chunk:                      # %polly.par.loadIVBounds
                                        # one iteration per chunk
	movq	(%rsp), %rcx
	movq	8(%rsp), %rax           # rax = lower bound
	decq	%rcx                    # rcx = upper bound - 1
	cmpq	%rcx, %rax
	cmovgeq	%rax, %rcx              # rcx = max(upper - 1, lower)
	incq	%rcx
	subq	%rax, %rcx              # rcx = number of rows in the chunk
	shlq	$11, %rcx
	leaq	(%rcx,%rcx,2), %rdx     # arg 3 = rows * 6144 bytes
	leaq	(%rax,%rax,2), %rdi
	shlq	$11, %rdi               # lower * 6144
	addq	%r15, %rdi              # arg 1 = C + lower * 6144
	xorl	%esi, %esi              # arg 2 = fill byte 0
	callq	memset@PLT
	movq	%rbx, %rsi
	movq	%r14, %rdi
	callq	GOMP_loop_runtime_next@PLT
	testb	%al, %al
	jne	.Lmain_zero_chunk
.Lmain_zero_done:                       # %polly.par.exit
	callq	GOMP_loop_end_nowait@PLT
	addq	$16, %rsp
	.cfi_def_cfa_offset 32
	popq	%rbx
	.cfi_def_cfa_offset 24
	popq	%r14
	.cfi_def_cfa_offset 16
	popq	%r15
	.cfi_def_cfa_offset 8
	retq
.Lfunc_end4:
	.size	main_polly_subfn, .Lfunc_end4-main_polly_subfn
	.cfi_endproc
                                        # -- End function
	.p2align	4, 0x90         # -- Begin function main_polly_subfn_1
	.type	main_polly_subfn_1,@function
# main_polly_subfn_1: worker body for the second parallel loop started by main.
# The incoming rdi is overwritten before it is read.
# Each chunk from GOMP_loop_runtime_next (lower bound in 40(%rsp), upper bound
# in 32(%rsp)) is processed by this loop nest:
#   BB5_3: rdx = row-tile start, stepping by 64 while rdx <= upper bound - 1
#   BB5_4: column tile, counter 0..1535 step 64 (rcx = tile index)
#   BB5_5: k tile, counter 0..1535 step 64
#   BB5_6: row = rdx .. rdx + 63 (always 64 rows; not clamped to the chunk end)
#   BB5_7: 64 k steps; every step loads one float from the A row, broadcasts
#          it, and multiply-adds it with 64 floats of one B row into 16
#          accumulator vectors holding 64 floats of the C row.
# So per (row, column tile, k tile) it performs
#     C[row][j .. j+63] += A[row][k] * B[k][j .. j+63]   for 64 values of k.
# The 64 floats are handled as four groups of four vectors. In every group the
# shuffles gather lane i of the four vectors into one register (a 4x4
# transpose) for both the accumulators and the B data, do mulps/addps, and
# then shuffle the results back into the original lane order.
# Stack frame (offsets from rsp, 296 bytes):
#   0, 48, 64      accumulators 2..4 of group 1
#   24             A pointer at the start of the current row tile
#   32 / 40        chunk upper / lower bound written by the runtime
#   80, 112        accumulators 4 and 2 of group 2
#   96             accumulator 2 of group 0
#   136            upper bound - 1
#   144 / 152      column-tile counter / column-tile index
#   160            B pointer (r14) at the start of the column tile
#   168            row-tile start (rdx)
#   176 / 184      k counter / A pointer (r9) at the start of the k tile
#   192            broadcast A value for the current k step
#   208, 224, 240  copies of xmm12, xmm2, xmm3 taken at the top of BB5_7
#   256, 272       accumulators 3 and 4 of group 0 while in flight
main_polly_subfn_1:                     # @main_polly_subfn_1
	.cfi_startproc
# %bb.0:                                # %polly.par.setup
	pushq	%rbp
	.cfi_def_cfa_offset 16
	pushq	%r15
	.cfi_def_cfa_offset 24
	pushq	%r14
	.cfi_def_cfa_offset 32
	pushq	%r13
	.cfi_def_cfa_offset 40
	pushq	%r12
	.cfi_def_cfa_offset 48
	pushq	%rbx
	.cfi_def_cfa_offset 56
	subq	$296, %rsp              # imm = 0x128
	.cfi_def_cfa_offset 352
	.cfi_offset %rbx, -56
	.cfi_offset %r12, -48
	.cfi_offset %r13, -40
	.cfi_offset %r14, -32
	.cfi_offset %r15, -24
	.cfi_offset %rbp, -16
	jmp	.LBB5_1                 # start by asking the runtime for the first chunk
	.p2align	4, 0x90
.LBB5_2:                                # %polly.par.loadIVBounds
                                        #   in Loop: Header=BB5_1 Depth=1
	movq	40(%rsp), %rdx          # rdx = lower bound = first row tile
	movq	32(%rsp), %rax
	decq	%rax                    # upper bound - 1
	movq	%rax, 136(%rsp)         # 8-byte Spill
	leaq	(%rdx,%rdx,2), %rcx
	shlq	$11, %rcx               # rcx = lower bound * 6144
	leaq	A(%rip), %rax
	addq	%rax, %rcx              # rcx = A + lower bound * 6144
	movq	%rcx, 24(%rsp)          # 8-byte Spill
	.p2align	4, 0x90
.LBB5_3:                                # %polly.loop_header
                                        #   Parent Loop BB5_1 Depth=1
                                        # =>  This Loop Header: Depth=2
                                        #       Child Loop BB5_4 Depth 3
                                        #         Child Loop BB5_5 Depth 4
                                        #           Child Loop BB5_6 Depth 5
                                        #             Child Loop BB5_7 Depth 6
	leaq	63(%rdx), %rsi          # rsi = last row of this row tile
	leaq	B+192(%rip), %r14       # B pointer, biased by 192 bytes for the -192..+48 offsets below
	xorl	%ecx, %ecx              # column-tile index = 0
	xorl	%eax, %eax              # column-tile counter = 0
	movq	%rdx, 168(%rsp)         # 8-byte Spill
	.p2align	4, 0x90
.LBB5_4:                                # %polly.loop_header2
                                        #   Parent Loop BB5_1 Depth=1
                                        #     Parent Loop BB5_3 Depth=2
                                        # =>    This Loop Header: Depth=3
                                        #         Child Loop BB5_5 Depth 4
                                        #           Child Loop BB5_6 Depth 5
                                        #             Child Loop BB5_7 Depth 6
	movq	%rax, 144(%rsp)         # 8-byte Spill
	movq	%rcx, 152(%rsp)         # 8-byte Spill
	shlq	$6, %rcx                # rcx = first column of the tile (index * 64)
	leaq	16(%rcx), %rdi          # first column of group 1
	leaq	32(%rcx), %rbp          # first column of group 2
	leaq	48(%rcx), %r15          # first column of group 3
	movq	24(%rsp), %r9           # 8-byte Reload
	movq	%r14, 160(%rsp)         # 8-byte Spill
	xorl	%eax, %eax              # k counter = 0
	.p2align	4, 0x90
.LBB5_5:                                # %polly.loop_header8
                                        #   Parent Loop BB5_1 Depth=1
                                        #     Parent Loop BB5_3 Depth=2
                                        #       Parent Loop BB5_4 Depth=3
                                        # =>      This Loop Header: Depth=4
                                        #           Child Loop BB5_6 Depth 5
                                        #             Child Loop BB5_7 Depth 6
	movq	%rax, 176(%rsp)         # 8-byte Spill
	movq	%r9, 184(%rsp)          # 8-byte Spill
	movq	%rdx, %rax              # rax = first row of the row tile
	.p2align	4, 0x90
.LBB5_6:                                # %polly.loop_header14
                                        #   Parent Loop BB5_1 Depth=1
                                        #     Parent Loop BB5_3 Depth=2
                                        #       Parent Loop BB5_4 Depth=3
                                        #         Parent Loop BB5_5 Depth=4
                                        # =>        This Loop Header: Depth=5
                                        #             Child Loop BB5_7 Depth 6
# Load the 16 accumulator vectors (64 floats) of the current C row; rdx is
# reused here as a store pointer and restored from 168(%rsp) after the loop.
	leaq	(%rax,%rax,2), %rbx
	shlq	$11, %rbx               # row * 6144
	leaq	C(%rip), %rdx
	addq	%rdx, %rbx              # rbx = C + row * 6144
	leaq	(%rbx,%rcx,4), %r8      # r8  = group 0 address
	leaq	(%rbx,%rdi,4), %rdx     # rdx = group 1 address
	leaq	(%rbx,%rbp,4), %r13     # r13 = group 2 address
	leaq	(%rbx,%r15,4), %r10     # r10 = group 3 address
	movups	(%rbx,%rcx,4), %xmm8
	movups	16(%rbx,%rcx,4), %xmm0
	movaps	%xmm0, 96(%rsp)         # 16-byte Spill
	movups	32(%rbx,%rcx,4), %xmm6
	movups	48(%rbx,%rcx,4), %xmm1
	movups	(%rbx,%rdi,4), %xmm15
	movups	16(%rbx,%rdi,4), %xmm0
	movaps	%xmm0, (%rsp)           # 16-byte Spill
	movups	32(%rbx,%rdi,4), %xmm0
	movaps	%xmm0, 48(%rsp)         # 16-byte Spill
	movups	48(%rbx,%rdi,4), %xmm0
	movaps	%xmm0, 64(%rsp)         # 16-byte Spill
	movups	(%rbx,%rbp,4), %xmm11
	movups	16(%rbx,%rbp,4), %xmm0
	movaps	%xmm0, 112(%rsp)        # 16-byte Spill
	movups	32(%rbx,%rbp,4), %xmm12
	movups	48(%rbx,%rbp,4), %xmm0
	movaps	%xmm0, 80(%rsp)         # 16-byte Spill
	movups	(%rbx,%r15,4), %xmm9
	movups	16(%rbx,%r15,4), %xmm13
	movups	32(%rbx,%r15,4), %xmm2
	movups	48(%rbx,%r15,4), %xmm3
	movq	$-256, %r12             # byte offset into the A row, counts up to 0 (64 floats)
	movq	%r14, %r11              # r11 = B row pointer for this k tile
	.p2align	4, 0x90
.LBB5_7:                                # %vector.ph
                                        #   Parent Loop BB5_1 Depth=1
                                        #     Parent Loop BB5_3 Depth=2
                                        #       Parent Loop BB5_4 Depth=3
                                        #         Parent Loop BB5_5 Depth=4
                                        #           Parent Loop BB5_6 Depth=5
                                        # =>          This Inner Loop Header: Depth=6
# xmm12, xmm2 and xmm3 carry accumulators into the iteration but are needed as
# temporaries below, so they are parked on the stack first.
	movaps	%xmm12, 208(%rsp)       # 16-byte Spill
	movaps	%xmm2, 224(%rsp)        # 16-byte Spill
	movaps	%xmm3, 240(%rsp)        # 16-byte Spill
# Group 0, accumulators: transpose xmm8 / 96(%rsp) / xmm6 / xmm1 into
# xmm10, xmm5, xmm12, xmm7.
	movaps	%xmm8, %xmm10
	movaps	96(%rsp), %xmm7         # 16-byte Reload
	unpcklps	%xmm7, %xmm10   # xmm10 = xmm10[0],xmm7[0],xmm10[1],xmm7[1]
	movaps	%xmm1, %xmm4
	shufps	$0, %xmm6, %xmm4        # xmm4 = xmm4[0,0],xmm6[0,0]
	shufps	$36, %xmm4, %xmm10      # xmm10 = xmm10[0,1],xmm4[2,0]
	movaps	%xmm7, %xmm5
	shufps	$17, %xmm8, %xmm5       # xmm5 = xmm5[1,0],xmm8[1,0]
	movaps	%xmm6, %xmm4
	unpcklps	%xmm1, %xmm4    # xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
	shufps	$226, %xmm4, %xmm5      # xmm5 = xmm5[2,0],xmm4[2,3]
	movaps	%xmm8, %xmm12
	unpckhps	%xmm7, %xmm12   # xmm12 = xmm12[2],xmm7[2],xmm12[3],xmm7[3]
	movaps	%xmm1, %xmm4
	shufps	$34, %xmm6, %xmm4       # xmm4 = xmm4[2,0],xmm6[2,0]
	shufps	$36, %xmm4, %xmm12      # xmm12 = xmm12[0,1],xmm4[2,0]
	shufps	$51, %xmm8, %xmm7       # xmm7 = xmm7[3,0],xmm8[3,0]
	unpckhps	%xmm1, %xmm6    # xmm6 = xmm6[2],xmm1[2],xmm6[3],xmm1[3]
	shufps	$226, %xmm6, %xmm7      # xmm7 = xmm7[2,0],xmm6[2,3]
# Group 0, B data at -192..-144(%r11): transpose into xmm8, xmm6, xmm14, xmm4.
	movaps	-160(%r11), %xmm0
	movaps	-144(%r11), %xmm1
	movaps	%xmm1, %xmm6
	shufps	$0, %xmm0, %xmm6        # xmm6 = xmm6[0,0],xmm0[0,0]
	movaps	-192(%r11), %xmm3
	movaps	-176(%r11), %xmm4
	movaps	%xmm3, %xmm8
	unpcklps	%xmm4, %xmm8    # xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1]
	shufps	$36, %xmm6, %xmm8       # xmm8 = xmm8[0,1],xmm6[2,0]
	movaps	%xmm0, %xmm2
	unpcklps	%xmm1, %xmm2    # xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
	movaps	%xmm4, %xmm6
	shufps	$17, %xmm3, %xmm6       # xmm6 = xmm6[1,0],xmm3[1,0]
	shufps	$226, %xmm2, %xmm6      # xmm6 = xmm6[2,0],xmm2[2,3]
	movaps	%xmm1, %xmm2
	shufps	$34, %xmm0, %xmm2       # xmm2 = xmm2[2,0],xmm0[2,0]
	movaps	%xmm3, %xmm14
	unpckhps	%xmm4, %xmm14   # xmm14 = xmm14[2],xmm4[2],xmm14[3],xmm4[3]
	shufps	$36, %xmm2, %xmm14      # xmm14 = xmm14[0,1],xmm2[2,0]
	unpckhps	%xmm1, %xmm0    # xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
	shufps	$51, %xmm3, %xmm4       # xmm4 = xmm4[3,0],xmm3[3,0]
	shufps	$226, %xmm0, %xmm4      # xmm4 = xmm4[2,0],xmm0[2,3]
# Load one float of the A row and broadcast it; then group 0 multiply-add.
	movss	256(%r9,%r12), %xmm0    # xmm0 = mem[0],zero,zero,zero
	shufps	$0, %xmm0, %xmm0        # xmm0 = xmm0[0,0,0,0]
	mulps	%xmm0, %xmm8
	addps	%xmm10, %xmm8
	mulps	%xmm0, %xmm6
	addps	%xmm5, %xmm6
	mulps	%xmm0, %xmm14
	addps	%xmm12, %xmm14
	mulps	%xmm0, %xmm4
	movaps	%xmm0, %xmm5            # keep the broadcast value in xmm5 for group 1
	addps	%xmm7, %xmm4
# Group 0, transpose the sums back: results in xmm8, 96(%rsp), 256(%rsp),
# 272(%rsp) (accumulators 1..4).
	movaps	%xmm14, %xmm0
	unpckhps	%xmm4, %xmm0    # xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3]
	movaps	%xmm6, %xmm1
	shufps	$51, %xmm8, %xmm1       # xmm1 = xmm1[3,0],xmm8[3,0]
	shufps	$226, %xmm0, %xmm1      # xmm1 = xmm1[2,0],xmm0[2,3]
	movaps	%xmm1, 272(%rsp)        # 16-byte Spill
	movaps	%xmm4, %xmm0
	shufps	$34, %xmm14, %xmm0      # xmm0 = xmm0[2,0],xmm14[2,0]
	movaps	%xmm8, %xmm1
	unpckhps	%xmm6, %xmm1    # xmm1 = xmm1[2],xmm6[2],xmm1[3],xmm6[3]
	shufps	$36, %xmm0, %xmm1       # xmm1 = xmm1[0,1],xmm0[2,0]
	movaps	%xmm1, 256(%rsp)        # 16-byte Spill
	movaps	%xmm14, %xmm0
	unpcklps	%xmm4, %xmm0    # xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
	movaps	%xmm6, %xmm1
	shufps	$17, %xmm8, %xmm1       # xmm1 = xmm1[1,0],xmm8[1,0]
	shufps	$226, %xmm0, %xmm1      # xmm1 = xmm1[2,0],xmm0[2,3]
	movaps	%xmm1, 96(%rsp)         # 16-byte Spill
	shufps	$0, %xmm14, %xmm4       # xmm4 = xmm4[0,0],xmm14[0,0]
	unpcklps	%xmm6, %xmm8    # xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1]
	shufps	$36, %xmm4, %xmm8       # xmm8 = xmm8[0,1],xmm4[2,0]
# Group 1, accumulators: transpose xmm15 / (%rsp) / 48(%rsp) / 64(%rsp) into
# xmm14, xmm12, xmm7 and (%rsp).
	movaps	%xmm15, %xmm14
	movaps	(%rsp), %xmm4           # 16-byte Reload
	unpcklps	%xmm4, %xmm14   # xmm14 = xmm14[0],xmm4[0],xmm14[1],xmm4[1]
	movaps	64(%rsp), %xmm1         # 16-byte Reload
	movaps	%xmm1, %xmm0
	movaps	48(%rsp), %xmm3         # 16-byte Reload
	shufps	$0, %xmm3, %xmm0        # xmm0 = xmm0[0,0],xmm3[0,0]
	shufps	$36, %xmm0, %xmm14      # xmm14 = xmm14[0,1],xmm0[2,0]
	movaps	%xmm4, %xmm12
	shufps	$17, %xmm15, %xmm12     # xmm12 = xmm12[1,0],xmm15[1,0]
	movaps	%xmm3, %xmm2
	unpcklps	%xmm1, %xmm2    # xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
	shufps	$226, %xmm2, %xmm12     # xmm12 = xmm12[2,0],xmm2[2,3]
	movaps	%xmm15, %xmm7
	unpckhps	%xmm4, %xmm7    # xmm7 = xmm7[2],xmm4[2],xmm7[3],xmm4[3]
	movaps	%xmm1, %xmm2
	shufps	$34, %xmm3, %xmm2       # xmm2 = xmm2[2,0],xmm3[2,0]
	shufps	$36, %xmm2, %xmm7       # xmm7 = xmm7[0,1],xmm2[2,0]
	shufps	$51, %xmm15, %xmm4      # xmm4 = xmm4[3,0],xmm15[3,0]
	unpckhps	%xmm1, %xmm3    # xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3]
	shufps	$226, %xmm3, %xmm4      # xmm4 = xmm4[2,0],xmm3[2,3]
	movaps	%xmm4, (%rsp)           # 16-byte Spill
# Group 1, B data at -128..-80(%r11): transpose into xmm15, xmm6, xmm4, xmm10.
	movaps	-96(%r11), %xmm2
	movaps	-80(%r11), %xmm1
	movaps	%xmm1, %xmm4
	shufps	$0, %xmm2, %xmm4        # xmm4 = xmm4[0,0],xmm2[0,0]
	movaps	-112(%r11), %xmm10
	movaps	-128(%r11), %xmm0
	movaps	%xmm0, %xmm15
	unpcklps	%xmm10, %xmm15  # xmm15 = xmm15[0],xmm10[0],xmm15[1],xmm10[1]
	shufps	$36, %xmm4, %xmm15      # xmm15 = xmm15[0,1],xmm4[2,0]
	movaps	%xmm2, %xmm4
	unpcklps	%xmm1, %xmm4    # xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
	movaps	%xmm10, %xmm6
	shufps	$17, %xmm0, %xmm6       # xmm6 = xmm6[1,0],xmm0[1,0]
	shufps	$226, %xmm4, %xmm6      # xmm6 = xmm6[2,0],xmm4[2,3]
	movaps	%xmm1, %xmm3
	shufps	$34, %xmm2, %xmm3       # xmm3 = xmm3[2,0],xmm2[2,0]
	movaps	%xmm0, %xmm4
	unpckhps	%xmm10, %xmm4   # xmm4 = xmm4[2],xmm10[2],xmm4[3],xmm10[3]
	shufps	$36, %xmm3, %xmm4       # xmm4 = xmm4[0,1],xmm3[2,0]
	unpckhps	%xmm1, %xmm2    # xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
	shufps	$51, %xmm0, %xmm10      # xmm10 = xmm10[3,0],xmm0[3,0]
	shufps	$226, %xmm2, %xmm10     # xmm10 = xmm10[2,0],xmm2[2,3]
# Group 1 multiply-add; the broadcast value is also saved for groups 2 and 3.
	movaps	%xmm5, 192(%rsp)        # 16-byte Spill
	mulps	%xmm5, %xmm15
	addps	%xmm14, %xmm15
	mulps	%xmm5, %xmm6
	addps	%xmm12, %xmm6
	mulps	%xmm5, %xmm4
	addps	%xmm7, %xmm4
	mulps	%xmm5, %xmm10
	addps	(%rsp), %xmm10          # 16-byte Folded Reload
# Group 1, transpose the sums back: results in xmm15, (%rsp), 48(%rsp),
# 64(%rsp) (accumulators 1..4).
	movaps	%xmm4, %xmm0
	unpckhps	%xmm10, %xmm0   # xmm0 = xmm0[2],xmm10[2],xmm0[3],xmm10[3]
	movaps	%xmm6, %xmm1
	shufps	$51, %xmm15, %xmm1      # xmm1 = xmm1[3,0],xmm15[3,0]
	shufps	$226, %xmm0, %xmm1      # xmm1 = xmm1[2,0],xmm0[2,3]
	movaps	%xmm1, 64(%rsp)         # 16-byte Spill
	movaps	%xmm10, %xmm0
	shufps	$34, %xmm4, %xmm0       # xmm0 = xmm0[2,0],xmm4[2,0]
	movaps	%xmm15, %xmm1
	unpckhps	%xmm6, %xmm1    # xmm1 = xmm1[2],xmm6[2],xmm1[3],xmm6[3]
	shufps	$36, %xmm0, %xmm1       # xmm1 = xmm1[0,1],xmm0[2,0]
	movaps	%xmm1, 48(%rsp)         # 16-byte Spill
	movaps	%xmm4, %xmm0
	unpcklps	%xmm10, %xmm0   # xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1]
	movaps	%xmm6, %xmm1
	shufps	$17, %xmm15, %xmm1      # xmm1 = xmm1[1,0],xmm15[1,0]
	shufps	$226, %xmm0, %xmm1      # xmm1 = xmm1[2,0],xmm0[2,3]
	movaps	%xmm1, (%rsp)           # 16-byte Spill
	shufps	$0, %xmm4, %xmm10       # xmm10 = xmm10[0,0],xmm4[0,0]
	unpcklps	%xmm6, %xmm15   # xmm15 = xmm15[0],xmm6[0],xmm15[1],xmm6[1]
	shufps	$36, %xmm10, %xmm15     # xmm15 = xmm15[0,1],xmm10[2,0]
# Group 2, accumulators: transpose xmm11 / 112(%rsp) / 208(%rsp) / 80(%rsp)
# into xmm10, xmm12, xmm0, xmm14.
	movaps	%xmm11, %xmm10
	movaps	112(%rsp), %xmm14       # 16-byte Reload
	unpcklps	%xmm14, %xmm10  # xmm10 = xmm10[0],xmm14[0],xmm10[1],xmm14[1]
	movaps	80(%rsp), %xmm2         # 16-byte Reload
	movaps	%xmm2, %xmm0
	movaps	208(%rsp), %xmm3        # 16-byte Reload
	shufps	$0, %xmm3, %xmm0        # xmm0 = xmm0[0,0],xmm3[0,0]
	shufps	$36, %xmm0, %xmm10      # xmm10 = xmm10[0,1],xmm0[2,0]
	movaps	%xmm14, %xmm12
	shufps	$17, %xmm11, %xmm12     # xmm12 = xmm12[1,0],xmm11[1,0]
	movaps	%xmm3, %xmm0
	unpcklps	%xmm2, %xmm0    # xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
	shufps	$226, %xmm0, %xmm12     # xmm12 = xmm12[2,0],xmm0[2,3]
	movaps	%xmm11, %xmm0
	unpckhps	%xmm14, %xmm0   # xmm0 = xmm0[2],xmm14[2],xmm0[3],xmm14[3]
	movaps	%xmm2, %xmm1
	shufps	$34, %xmm3, %xmm1       # xmm1 = xmm1[2,0],xmm3[2,0]
	shufps	$36, %xmm1, %xmm0       # xmm0 = xmm0[0,1],xmm1[2,0]
	shufps	$51, %xmm11, %xmm14     # xmm14 = xmm14[3,0],xmm11[3,0]
	unpckhps	%xmm2, %xmm3    # xmm3 = xmm3[2],xmm2[2],xmm3[3],xmm2[3]
	shufps	$226, %xmm3, %xmm14     # xmm14 = xmm14[2,0],xmm3[2,3]
# Group 2, B data at -64..-16(%r11): transpose into xmm11, xmm7, xmm6, xmm4.
	movaps	-32(%r11), %xmm1
	movaps	-16(%r11), %xmm2
	movaps	%xmm2, %xmm3
	shufps	$0, %xmm1, %xmm3        # xmm3 = xmm3[0,0],xmm1[0,0]
	movaps	-48(%r11), %xmm4
	movaps	-64(%r11), %xmm5
	movaps	%xmm5, %xmm11
	unpcklps	%xmm4, %xmm11   # xmm11 = xmm11[0],xmm4[0],xmm11[1],xmm4[1]
	shufps	$36, %xmm3, %xmm11      # xmm11 = xmm11[0,1],xmm3[2,0]
	movaps	%xmm1, %xmm3
	unpcklps	%xmm2, %xmm3    # xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
	movaps	%xmm4, %xmm7
	shufps	$17, %xmm5, %xmm7       # xmm7 = xmm7[1,0],xmm5[1,0]
	shufps	$226, %xmm3, %xmm7      # xmm7 = xmm7[2,0],xmm3[2,3]
	movaps	%xmm2, %xmm3
	shufps	$34, %xmm1, %xmm3       # xmm3 = xmm3[2,0],xmm1[2,0]
	movaps	%xmm5, %xmm6
	unpckhps	%xmm4, %xmm6    # xmm6 = xmm6[2],xmm4[2],xmm6[3],xmm4[3]
	shufps	$36, %xmm3, %xmm6       # xmm6 = xmm6[0,1],xmm3[2,0]
	unpckhps	%xmm2, %xmm1    # xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
	shufps	$51, %xmm5, %xmm4       # xmm4 = xmm4[3,0],xmm5[3,0]
	shufps	$226, %xmm1, %xmm4      # xmm4 = xmm4[2,0],xmm1[2,3]
# Group 2 multiply-add with the saved broadcast value.
	movaps	192(%rsp), %xmm1        # 16-byte Reload
	mulps	%xmm1, %xmm11
	addps	%xmm10, %xmm11
	mulps	%xmm1, %xmm7
	addps	%xmm12, %xmm7
	mulps	%xmm1, %xmm6
	addps	%xmm0, %xmm6
	mulps	%xmm1, %xmm4
	addps	%xmm14, %xmm4
# Group 2, transpose the sums back: results in xmm11, 112(%rsp), xmm12,
# 80(%rsp) (accumulators 1..4).
	movaps	%xmm6, %xmm0
	unpckhps	%xmm4, %xmm0    # xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3]
	movaps	%xmm7, %xmm1
	shufps	$51, %xmm11, %xmm1      # xmm1 = xmm1[3,0],xmm11[3,0]
	shufps	$226, %xmm0, %xmm1      # xmm1 = xmm1[2,0],xmm0[2,3]
	movaps	%xmm1, 80(%rsp)         # 16-byte Spill
	movaps	%xmm4, %xmm0
	shufps	$34, %xmm6, %xmm0       # xmm0 = xmm0[2,0],xmm6[2,0]
	movaps	%xmm11, %xmm12
	unpckhps	%xmm7, %xmm12   # xmm12 = xmm12[2],xmm7[2],xmm12[3],xmm7[3]
	shufps	$36, %xmm0, %xmm12      # xmm12 = xmm12[0,1],xmm0[2,0]
	movaps	%xmm6, %xmm0
	unpcklps	%xmm4, %xmm0    # xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
	movaps	%xmm7, %xmm1
	shufps	$17, %xmm11, %xmm1      # xmm1 = xmm1[1,0],xmm11[1,0]
	shufps	$226, %xmm0, %xmm1      # xmm1 = xmm1[2,0],xmm0[2,3]
	movaps	%xmm1, 112(%rsp)        # 16-byte Spill
	shufps	$0, %xmm6, %xmm4        # xmm4 = xmm4[0,0],xmm6[0,0]
	unpcklps	%xmm7, %xmm11   # xmm11 = xmm11[0],xmm7[0],xmm11[1],xmm7[1]
	shufps	$36, %xmm4, %xmm11      # xmm11 = xmm11[0,1],xmm4[2,0]
# Group 3, accumulators: transpose xmm9 / xmm13 / 224(%rsp) / 240(%rsp) into
# xmm10, xmm14, xmm0, xmm13.
	movaps	%xmm9, %xmm10
	unpcklps	%xmm13, %xmm10  # xmm10 = xmm10[0],xmm13[0],xmm10[1],xmm13[1]
	movaps	240(%rsp), %xmm2        # 16-byte Reload
	movaps	%xmm2, %xmm0
	movaps	224(%rsp), %xmm3        # 16-byte Reload
	shufps	$0, %xmm3, %xmm0        # xmm0 = xmm0[0,0],xmm3[0,0]
	shufps	$36, %xmm0, %xmm10      # xmm10 = xmm10[0,1],xmm0[2,0]
	movaps	%xmm13, %xmm14
	shufps	$17, %xmm9, %xmm14      # xmm14 = xmm14[1,0],xmm9[1,0]
	movaps	%xmm3, %xmm0
	unpcklps	%xmm2, %xmm0    # xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
	shufps	$226, %xmm0, %xmm14     # xmm14 = xmm14[2,0],xmm0[2,3]
	movaps	%xmm9, %xmm0
	unpckhps	%xmm13, %xmm0   # xmm0 = xmm0[2],xmm13[2],xmm0[3],xmm13[3]
	movaps	%xmm2, %xmm1
	shufps	$34, %xmm3, %xmm1       # xmm1 = xmm1[2,0],xmm3[2,0]
	shufps	$36, %xmm1, %xmm0       # xmm0 = xmm0[0,1],xmm1[2,0]
	shufps	$51, %xmm9, %xmm13      # xmm13 = xmm13[3,0],xmm9[3,0]
	unpckhps	%xmm2, %xmm3    # xmm3 = xmm3[2],xmm2[2],xmm3[3],xmm2[3]
	shufps	$226, %xmm3, %xmm13     # xmm13 = xmm13[2,0],xmm3[2,3]
# Group 3, B data at 0..48(%r11): transpose into xmm9, xmm7, xmm6, xmm4.
	movaps	32(%r11), %xmm1
	movaps	48(%r11), %xmm2
	movaps	%xmm2, %xmm3
	shufps	$0, %xmm1, %xmm3        # xmm3 = xmm3[0,0],xmm1[0,0]
	movaps	16(%r11), %xmm4
	movaps	(%r11), %xmm5
	movaps	%xmm5, %xmm9
	unpcklps	%xmm4, %xmm9    # xmm9 = xmm9[0],xmm4[0],xmm9[1],xmm4[1]
	shufps	$36, %xmm3, %xmm9       # xmm9 = xmm9[0,1],xmm3[2,0]
	movaps	%xmm1, %xmm3
	unpcklps	%xmm2, %xmm3    # xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
	movaps	%xmm4, %xmm7
	shufps	$17, %xmm5, %xmm7       # xmm7 = xmm7[1,0],xmm5[1,0]
	shufps	$226, %xmm3, %xmm7      # xmm7 = xmm7[2,0],xmm3[2,3]
	movaps	%xmm2, %xmm3
	shufps	$34, %xmm1, %xmm3       # xmm3 = xmm3[2,0],xmm1[2,0]
	movaps	%xmm5, %xmm6
	unpckhps	%xmm4, %xmm6    # xmm6 = xmm6[2],xmm4[2],xmm6[3],xmm4[3]
	shufps	$36, %xmm3, %xmm6       # xmm6 = xmm6[0,1],xmm3[2,0]
	unpckhps	%xmm2, %xmm1    # xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
	shufps	$51, %xmm5, %xmm4       # xmm4 = xmm4[3,0],xmm5[3,0]
	shufps	$226, %xmm1, %xmm4      # xmm4 = xmm4[2,0],xmm1[2,3]
# Group 3 multiply-add with the saved broadcast value.
	movaps	192(%rsp), %xmm1        # 16-byte Reload
	mulps	%xmm1, %xmm9
	addps	%xmm10, %xmm9
	mulps	%xmm1, %xmm7
	addps	%xmm14, %xmm7
	mulps	%xmm1, %xmm6
	addps	%xmm0, %xmm6
	mulps	%xmm1, %xmm4
	addps	%xmm13, %xmm4
# Group 3, transpose the sums back: results in xmm9, xmm13, xmm2, xmm3
# (accumulators 1..4).
	movaps	%xmm6, %xmm0
	unpckhps	%xmm4, %xmm0    # xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3]
	movaps	%xmm7, %xmm3
	shufps	$51, %xmm9, %xmm3       # xmm3 = xmm3[3,0],xmm9[3,0]
	shufps	$226, %xmm0, %xmm3      # xmm3 = xmm3[2,0],xmm0[2,3]
	movaps	%xmm4, %xmm0
	shufps	$34, %xmm6, %xmm0       # xmm0 = xmm0[2,0],xmm6[2,0]
	movaps	%xmm9, %xmm2
	unpckhps	%xmm7, %xmm2    # xmm2 = xmm2[2],xmm7[2],xmm2[3],xmm7[3]
	shufps	$36, %xmm0, %xmm2       # xmm2 = xmm2[0,1],xmm0[2,0]
	movaps	%xmm6, %xmm0
	unpcklps	%xmm4, %xmm0    # xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
	movaps	%xmm7, %xmm13
	shufps	$17, %xmm9, %xmm13      # xmm13 = xmm13[1,0],xmm9[1,0]
	shufps	$226, %xmm0, %xmm13     # xmm13 = xmm13[2,0],xmm0[2,3]
	shufps	$0, %xmm6, %xmm4        # xmm4 = xmm4[0,0],xmm6[0,0]
# Bring group 0 accumulators 3 and 4 back into xmm6 / xmm1, where the top of
# the loop (and the stores after it) expect them.
	movaps	256(%rsp), %xmm6        # 16-byte Reload
	movaps	272(%rsp), %xmm1        # 16-byte Reload
	unpcklps	%xmm7, %xmm9    # xmm9 = xmm9[0],xmm7[0],xmm9[1],xmm7[1]
	shufps	$36, %xmm4, %xmm9       # xmm9 = xmm9[0,1],xmm4[2,0]
	addq	$6144, %r11             # imm = 0x1800
	addq	$4, %r12                # next float of the A row; ZF set after 64 steps
	jne	.LBB5_7
# %bb.8:                                # %polly.loop_exit22
                                        #   in Loop: Header=BB5_6 Depth=5
# Write the 16 accumulators back to the C row (r8, rdx, r13, r10 = groups 0..3).
	movups	%xmm8, (%r8)
	movaps	96(%rsp), %xmm0         # 16-byte Reload
	movups	%xmm0, 16(%r8)
	movups	%xmm6, 32(%r8)
	movups	%xmm1, 48(%r8)
	movaps	64(%rsp), %xmm0         # 16-byte Reload
	movups	%xmm0, 48(%rdx)
	movaps	48(%rsp), %xmm0         # 16-byte Reload
	movups	%xmm0, 32(%rdx)
	movaps	(%rsp), %xmm0           # 16-byte Reload
	movups	%xmm0, 16(%rdx)
	movups	%xmm15, (%rdx)
	movaps	80(%rsp), %xmm0         # 16-byte Reload
	movups	%xmm0, 48(%r13)
	movaps	112(%rsp), %xmm0        # 16-byte Reload
	movups	%xmm0, 16(%r13)
	movups	%xmm11, (%r13)
	movups	%xmm12, 32(%r13)
	movups	%xmm3, 48(%r10)
	movups	%xmm13, 16(%r10)
	movups	%xmm9, (%r10)
	movups	%xmm2, 32(%r10)
	addq	$6144, %r9              # imm = 0x1800
	cmpq	%rsi, %rax              # compare the row just finished with the last row of the tile
	leaq	1(%rax), %rax           # row++ via lea so the flags from cmpq survive
	jl	.LBB5_6
# %bb.9:                                # %polly.loop_exit16
                                        #   in Loop: Header=BB5_5 Depth=4
	movq	176(%rsp), %rax         # 8-byte Reload
	addq	$64, %rax               # next k tile
	addq	$393216, %r14           # imm = 0x60000
	movq	184(%rsp), %r9          # 8-byte Reload
	addq	$256, %r9               # imm = 0x100
	cmpq	$1536, %rax             # imm = 0x600
	movq	168(%rsp), %rdx         # 8-byte Reload
	jb	.LBB5_5
# %bb.10:                               # %polly.loop_exit10
                                        #   in Loop: Header=BB5_4 Depth=3
	movq	144(%rsp), %rax         # 8-byte Reload
	addq	$64, %rax               # next column tile
	movq	152(%rsp), %rcx         # 8-byte Reload
	incq	%rcx
	movq	160(%rsp), %r14         # 8-byte Reload
	addq	$256, %r14              # imm = 0x100
	cmpq	$1536, %rax             # imm = 0x600
	jb	.LBB5_4
# %bb.11:                               # %polly.loop_exit4
                                        #   in Loop: Header=BB5_3 Depth=2
	addq	$64, %rdx               # next row tile
	addq	$393216, 24(%rsp)       # 8-byte Folded Spill
                                        # imm = 0x60000
	cmpq	136(%rsp), %rdx         # 8-byte Folded Reload
	jle	.LBB5_3
.LBB5_1:                                # %polly.par.setup
                                        # =>This Loop Header: Depth=1
                                        #     Child Loop BB5_3 Depth 2
                                        #       Child Loop BB5_4 Depth 3
                                        #         Child Loop BB5_5 Depth 4
                                        #           Child Loop BB5_6 Depth 5
                                        #             Child Loop BB5_7 Depth 6
	leaq	40(%rsp), %rdi          # arg 1 = &lower bound
	leaq	32(%rsp), %rsi          # arg 2 = &upper bound
	callq	GOMP_loop_runtime_next@PLT
	testb	%al, %al
	jne	.LBB5_2
# %bb.12:                               # %polly.par.exit
	callq	GOMP_loop_end_nowait@PLT
	addq	$296, %rsp              # imm = 0x128
	.cfi_def_cfa_offset 56
	popq	%rbx
	.cfi_def_cfa_offset 48
	popq	%r12
	.cfi_def_cfa_offset 40
	popq	%r13
	.cfi_def_cfa_offset 32
	popq	%r14
	.cfi_def_cfa_offset 24
	popq	%r15
	.cfi_def_cfa_offset 16
	popq	%rbp
	.cfi_def_cfa_offset 8
	retq
.Lfunc_end5:
	.size	main_polly_subfn_1, .Lfunc_end5-main_polly_subfn_1
	.cfi_endproc
                                        # -- End function
	.type	A,@object               # @A
# A, B and C are common symbols of 9437184 bytes each with 16-byte alignment.
# The code in this file addresses them with a row stride of 6144 bytes and
# 4-byte float elements; the vector loops use aligned loads (movaps) on B.
	.comm	A,9437184,16
	.type	B,@object               # @B
	.comm	B,9437184,16
	.type	.L.str,@object          # @.str
	.section	.rodata.str1.1,"aMS",@progbits,1
# Format string passed to fprintf by print_array (4 characters plus the NUL).
.L.str:
	.asciz	"%lf "
	.size	.L.str, 5

	.type	C,@object               # @C
	.comm	C,9437184,16

# Producer identification and the empty .note.GNU-stack section.
	.ident	"clang version 8.0.0 (trunk 342834) (llvm/trunk 342856)"
	.section	".note.GNU-stack","",@progbits