llvm/test/CodeGen/X86/matrix-multiply.ll

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefixes=SSE
; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,AVX512,AVX512F
; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX,AVX512,AVX512VL

;
; Basic matrix multiply tests based on the pattern:
;
; using matrix_ty = float __attribute__((matrix_type(2,2)));
; matrix_ty test_mul2x2(matrix_ty a0, matrix_ty a1) nounwind {
;     return a0 * a1;
; }
;
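; Each test's IR (see the entry blocks below) expands the product column by
; column over the flattened, column-major operands, presumably mirroring what
; the frontend's matrix lowering emits for the source pattern above. A rough
; sketch of the 2x2 float case, matching the shufflevector/fmul/fadd sequence
; in test_mul2x2_f32:
;
;   %col0 = shufflevector <4 x float> %a0, <4 x float> poison, <2 x i32> <i32 0, i32 1>
;   %col1 = shufflevector <4 x float> %a0, <4 x float> poison, <2 x i32> <i32 2, i32 3>
;   ; result column j (j = 0, 1):  %col0 * splat(a1[2*j]) + %col1 * splat(a1[2*j+1])
;
; i.e. res[i][j] = sum_k a0[i][k] * a1[k][j], with each column stored
; contiguously in the flat vector.
;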

define <4 x float> @test_mul2x2_f32(<4 x float> %a0, <4 x float> %a1) nounwind {
; SSE-LABEL: test_mul2x2_f32:
; SSE:       # %bb.0: # %entry
; SSE-NEXT:    movaps %xmm1, %xmm2
; SSE-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; SSE-NEXT:    mulps %xmm0, %xmm2
; SSE-NEXT:    movaps %xmm1, %xmm3
; SSE-NEXT:    unpckhps {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3]
; SSE-NEXT:    mulps %xmm0, %xmm3
; SSE-NEXT:    movaps %xmm0, %xmm4
; SSE-NEXT:    unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1]
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[1,1]
; SSE-NEXT:    mulps %xmm4, %xmm0
; SSE-NEXT:    addps %xmm2, %xmm0
; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; SSE-NEXT:    mulps %xmm4, %xmm1
; SSE-NEXT:    addps %xmm3, %xmm1
; SSE-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT:    retq
;
; AVX1-LABEL: test_mul2x2_f32:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    vshufpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX1-NEXT:    vmovsldup {{.*#+}} xmm3 = xmm1[0,0,2,2]
; AVX1-NEXT:    vmulps %xmm3, %xmm0, %xmm3
; AVX1-NEXT:    vmovshdup {{.*#+}} xmm4 = xmm1[1,1,3,3]
; AVX1-NEXT:    vmulps %xmm4, %xmm2, %xmm4
; AVX1-NEXT:    vaddps %xmm4, %xmm3, %xmm3
; AVX1-NEXT:    vshufps {{.*#+}} xmm4 = xmm1[2,2,2,2]
; AVX1-NEXT:    vmulps %xmm4, %xmm0, %xmm0
; AVX1-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; AVX1-NEXT:    vmulps %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm3[0],xmm0[0]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_mul2x2_f32:
; AVX2:       # %bb.0: # %entry
; AVX2-NEXT:    vshufpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX2-NEXT:    vbroadcastss %xmm1, %xmm3
; AVX2-NEXT:    vmulps %xmm3, %xmm0, %xmm3
; AVX2-NEXT:    vmovshdup {{.*#+}} xmm4 = xmm1[1,1,3,3]
; AVX2-NEXT:    vmulps %xmm4, %xmm2, %xmm4
; AVX2-NEXT:    vaddps %xmm4, %xmm3, %xmm3
; AVX2-NEXT:    vshufps {{.*#+}} xmm4 = xmm1[2,2,2,2]
; AVX2-NEXT:    vmulps %xmm4, %xmm0, %xmm0
; AVX2-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; AVX2-NEXT:    vmulps %xmm1, %xmm2, %xmm1
; AVX2-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm3[0],xmm0[0]
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_mul2x2_f32:
; AVX512:       # %bb.0: # %entry
; AVX512-NEXT:    vshufpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX512-NEXT:    vbroadcastss %xmm1, %xmm3
; AVX512-NEXT:    vmulps %xmm3, %xmm0, %xmm3
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm4 = xmm1[1,1,3,3]
; AVX512-NEXT:    vmulps %xmm4, %xmm2, %xmm4
; AVX512-NEXT:    vaddps %xmm4, %xmm3, %xmm3
; AVX512-NEXT:    vshufps {{.*#+}} xmm4 = xmm1[2,2,2,2]
; AVX512-NEXT:    vmulps %xmm4, %xmm0, %xmm0
; AVX512-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; AVX512-NEXT:    vmulps %xmm1, %xmm2, %xmm1
; AVX512-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm3[0],xmm0[0]
; AVX512-NEXT:    retq
entry:
  %split = shufflevector <4 x float> %a0, <4 x float> poison, <2 x i32> <i32 0, i32 1>
  %split1 = shufflevector <4 x float> %a0, <4 x float> poison, <2 x i32> <i32 2, i32 3>
  %splat.splat = shufflevector <4 x float> %a1, <4 x float> undef, <2 x i32> zeroinitializer
  %0 = fmul <2 x float> %split, %splat.splat
  %splat.splat6 = shufflevector <4 x float> %a1, <4 x float> undef, <2 x i32> <i32 1, i32 1>
  %1 = fmul <2 x float> %split1, %splat.splat6
  %2 = fadd <2 x float> %0, %1
  %splat.splat9 = shufflevector <4 x float> %a1, <4 x float> undef, <2 x i32> <i32 2, i32 2>
  %3 = fmul <2 x float> %split, %splat.splat9
  %splat.splat12 = shufflevector <4 x float> %a1, <4 x float> undef, <2 x i32> <i32 3, i32 3>
  %4 = fmul <2 x float> %split1, %splat.splat12
  %5 = fadd <2 x float> %3, %4
  %6 = shufflevector <2 x float> %2, <2 x float> %5, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x float> %6
}

define <4 x double> @test_mul2x2_f64(<4 x double> %a0, <4 x double> %a1) nounwind {
; SSE-LABEL: test_mul2x2_f64:
; SSE:       # %bb.0: # %entry
; SSE-NEXT:    movapd %xmm2, %xmm4
; SSE-NEXT:    unpcklpd {{.*#+}} xmm4 = xmm4[0],xmm2[0]
; SSE-NEXT:    mulpd %xmm0, %xmm4
; SSE-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1,1]
; SSE-NEXT:    mulpd %xmm1, %xmm2
; SSE-NEXT:    addpd %xmm2, %xmm4
; SSE-NEXT:    movapd %xmm3, %xmm2
; SSE-NEXT:    unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; SSE-NEXT:    mulpd %xmm0, %xmm2
; SSE-NEXT:    unpckhpd {{.*#+}} xmm3 = xmm3[1,1]
; SSE-NEXT:    mulpd %xmm3, %xmm1
; SSE-NEXT:    addpd %xmm2, %xmm1
; SSE-NEXT:    movapd %xmm4, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_mul2x2_f64:
; AVX:       # %bb.0: # %entry
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX-NEXT:    vmovddup {{.*#+}} xmm3 = xmm1[0,0]
; AVX-NEXT:    vmulpd %xmm3, %xmm0, %xmm3
; AVX-NEXT:    vshufpd {{.*#+}} xmm4 = xmm1[1,1]
; AVX-NEXT:    vmulpd %xmm4, %xmm2, %xmm4
; AVX-NEXT:    vaddpd %xmm4, %xmm3, %xmm3
; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX-NEXT:    vmovddup {{.*#+}} xmm4 = xmm1[0,0]
; AVX-NEXT:    vmulpd %xmm4, %xmm0, %xmm0
; AVX-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,1]
; AVX-NEXT:    vmulpd %xmm1, %xmm2, %xmm1
; AVX-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm3, %ymm0
; AVX-NEXT:    retq
entry:
  %split = shufflevector <4 x double> %a0, <4 x double> poison, <2 x i32> <i32 0, i32 1>
  %split1 = shufflevector <4 x double> %a0, <4 x double> poison, <2 x i32> <i32 2, i32 3>
  %splat.splat = shufflevector <4 x double> %a1, <4 x double> undef, <2 x i32> zeroinitializer
  %0 = fmul <2 x double> %split, %splat.splat
  %splat.splat6 = shufflevector <4 x double> %a1, <4 x double> undef, <2 x i32> <i32 1, i32 1>
  %1 = fmul <2 x double> %split1, %splat.splat6
  %2 = fadd <2 x double> %0, %1
  %splat.splat9 = shufflevector <4 x double> %a1, <4 x double> undef, <2 x i32> <i32 2, i32 2>
  %3 = fmul <2 x double> %split, %splat.splat9
  %splat.splat12 = shufflevector <4 x double> %a1, <4 x double> undef, <2 x i32> <i32 3, i32 3>
  %4 = fmul <2 x double> %split1, %splat.splat12
  %5 = fadd <2 x double> %3, %4
  %6 = shufflevector <2 x double> %2, <2 x double> %5, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x double> %6
}

define <9 x float> @test_mul3x3_f32(<9 x float> %a0, <9 x float> %a1) nounwind {
; SSE-LABEL: test_mul3x3_f32:
; SSE:       # %bb.0: # %entry
; SSE-NEXT:    movq %rdi, %rax
; SSE-NEXT:    movss {{.*#+}} xmm8 = mem[0],zero,zero,zero
; SSE-NEXT:    movss {{.*#+}} xmm10 = mem[0],zero,zero,zero
; SSE-NEXT:    movss {{.*#+}} xmm9 = mem[0],zero,zero,zero
; SSE-NEXT:    movss {{.*#+}} xmm11 = mem[0],zero,zero,zero
; SSE-NEXT:    movss {{.*#+}} xmm12 = mem[0],zero,zero,zero
; SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT:    movaps %xmm2, %xmm13
; SSE-NEXT:    mulss %xmm12, %xmm13
; SSE-NEXT:    unpcklps {{.*#+}} xmm12 = xmm12[0,0,1,1]
; SSE-NEXT:    mulps %xmm0, %xmm12
; SSE-NEXT:    unpcklps {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
; SSE-NEXT:    movaps %xmm5, %xmm1
; SSE-NEXT:    mulss %xmm11, %xmm1
; SSE-NEXT:    unpcklps {{.*#+}} xmm11 = xmm11[0,0,1,1]
; SSE-NEXT:    mulps %xmm3, %xmm11
; SSE-NEXT:    addps %xmm12, %xmm11
; SSE-NEXT:    movss {{.*#+}} xmm4 = mem[0],zero,zero,zero
; SSE-NEXT:    unpcklps {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
; SSE-NEXT:    movaps %xmm9, %xmm7
; SSE-NEXT:    mulss %xmm4, %xmm7
; SSE-NEXT:    unpcklps {{.*#+}} xmm4 = xmm4[0,0,1,1]
; SSE-NEXT:    mulps %xmm6, %xmm4
; SSE-NEXT:    addps %xmm11, %xmm4
; SSE-NEXT:    movss {{.*#+}} xmm11 = mem[0],zero,zero,zero
; SSE-NEXT:    addss %xmm13, %xmm1
; SSE-NEXT:    addss %xmm7, %xmm1
; SSE-NEXT:    movaps %xmm2, %xmm7
; SSE-NEXT:    mulss %xmm11, %xmm7
; SSE-NEXT:    unpcklps {{.*#+}} xmm11 = xmm11[0,0,1,1]
; SSE-NEXT:    mulps %xmm0, %xmm11
; SSE-NEXT:    movaps %xmm5, %xmm12
; SSE-NEXT:    mulss %xmm10, %xmm12
; SSE-NEXT:    unpcklps {{.*#+}} xmm10 = xmm10[0,0,1,1]
; SSE-NEXT:    mulps %xmm3, %xmm10
; SSE-NEXT:    addps %xmm11, %xmm10
; SSE-NEXT:    movaps %xmm9, %xmm11
; SSE-NEXT:    mulss %xmm8, %xmm11
; SSE-NEXT:    unpcklps {{.*#+}} xmm8 = xmm8[0,0,1,1]
; SSE-NEXT:    mulps %xmm6, %xmm8
; SSE-NEXT:    addps %xmm10, %xmm8
; SSE-NEXT:    addss %xmm7, %xmm12
; SSE-NEXT:    addss %xmm11, %xmm12
; SSE-NEXT:    movaps %xmm8, %xmm7
; SSE-NEXT:    movlhps {{.*#+}} xmm7 = xmm7[0],xmm12[0]
; SSE-NEXT:    movss {{.*#+}} xmm10 = mem[0],zero,zero,zero
; SSE-NEXT:    mulss %xmm10, %xmm2
; SSE-NEXT:    unpcklps {{.*#+}} xmm10 = xmm10[0,0,1,1]
; SSE-NEXT:    mulps %xmm0, %xmm10
; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE-NEXT:    mulss %xmm0, %xmm5
; SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0,0,1,1]
; SSE-NEXT:    mulps %xmm3, %xmm0
; SSE-NEXT:    addps %xmm10, %xmm0
; SSE-NEXT:    movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
; SSE-NEXT:    mulss %xmm3, %xmm9
; SSE-NEXT:    unpcklps {{.*#+}} xmm3 = xmm3[0,0,1,1]
; SSE-NEXT:    mulps %xmm6, %xmm3
; SSE-NEXT:    addps %xmm0, %xmm3
; SSE-NEXT:    shufps {{.*#+}} xmm7 = xmm7[1,2],xmm3[0,1]
; SSE-NEXT:    addss %xmm2, %xmm5
; SSE-NEXT:    addss %xmm9, %xmm5
; SSE-NEXT:    movlhps {{.*#+}} xmm8 = xmm8[0],xmm1[0]
; SSE-NEXT:    shufps {{.*#+}} xmm4 = xmm4[0,1],xmm8[2,0]
; SSE-NEXT:    movss %xmm5, 32(%rdi)
; SSE-NEXT:    movaps %xmm7, 16(%rdi)
; SSE-NEXT:    movaps %xmm4, (%rdi)
; SSE-NEXT:    retq
;
; AVX1-LABEL: test_mul3x3_f32:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    movq %rdi, %rax
; AVX1-NEXT:    vmovss {{.*#+}} xmm8 = mem[0],zero,zero,zero
; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
; AVX1-NEXT:    vbroadcastss {{[0-9]+}}(%rsp), %xmm1
; AVX1-NEXT:    vmulps %xmm1, %xmm0, %xmm9
; AVX1-NEXT:    vinsertps {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[2,3]
; AVX1-NEXT:    vbroadcastss {{[0-9]+}}(%rsp), %xmm4
; AVX1-NEXT:    vmulps %xmm4, %xmm3, %xmm10
; AVX1-NEXT:    vaddps %xmm10, %xmm9, %xmm9
; AVX1-NEXT:    vinsertps {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[2,3]
; AVX1-NEXT:    vbroadcastss {{[0-9]+}}(%rsp), %xmm7
; AVX1-NEXT:    vmulps %xmm7, %xmm6, %xmm10
; AVX1-NEXT:    vaddps %xmm10, %xmm9, %xmm9
; AVX1-NEXT:    vmulss %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vmulss %xmm4, %xmm5, %xmm4
; AVX1-NEXT:    vaddss %xmm4, %xmm1, %xmm1
; AVX1-NEXT:    vmulss %xmm7, %xmm8, %xmm4
; AVX1-NEXT:    vaddss %xmm4, %xmm1, %xmm1
; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm9[0,1],xmm1[0],xmm9[3]
; AVX1-NEXT:    vbroadcastss {{[0-9]+}}(%rsp), %xmm4
; AVX1-NEXT:    vmulps %xmm4, %xmm0, %xmm7
; AVX1-NEXT:    vbroadcastss {{[0-9]+}}(%rsp), %xmm9
; AVX1-NEXT:    vmulps %xmm3, %xmm9, %xmm10
; AVX1-NEXT:    vaddps %xmm7, %xmm10, %xmm7
; AVX1-NEXT:    vbroadcastss {{[0-9]+}}(%rsp), %xmm10
; AVX1-NEXT:    vmulps %xmm6, %xmm10, %xmm11
; AVX1-NEXT:    vaddps %xmm7, %xmm11, %xmm7
; AVX1-NEXT:    vmulss %xmm4, %xmm2, %xmm4
; AVX1-NEXT:    vmulss %xmm5, %xmm9, %xmm9
; AVX1-NEXT:    vaddss %xmm4, %xmm9, %xmm4
; AVX1-NEXT:    vmulss %xmm10, %xmm8, %xmm9
; AVX1-NEXT:    vaddss %xmm4, %xmm9, %xmm4
; AVX1-NEXT:    vinsertps {{.*#+}} xmm4 = xmm7[0,1],xmm4[0],xmm7[3]
; AVX1-NEXT:    vbroadcastss {{[0-9]+}}(%rsp), %xmm9
; AVX1-NEXT:    vmulps %xmm0, %xmm9, %xmm0
; AVX1-NEXT:    vbroadcastss {{[0-9]+}}(%rsp), %xmm10
; AVX1-NEXT:    vmulps %xmm3, %xmm10, %xmm3
; AVX1-NEXT:    vaddps %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vbroadcastss {{[0-9]+}}(%rsp), %xmm3
; AVX1-NEXT:    vmulps %xmm3, %xmm6, %xmm6
; AVX1-NEXT:    vaddps %xmm6, %xmm0, %xmm0
; AVX1-NEXT:    vmulss %xmm2, %xmm9, %xmm2
; AVX1-NEXT:    vmulss %xmm5, %xmm10, %xmm5
; AVX1-NEXT:    vaddss %xmm5, %xmm2, %xmm2
; AVX1-NEXT:    vmulss %xmm3, %xmm8, %xmm3
; AVX1-NEXT:    vaddss %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm7[0]
; AVX1-NEXT:    vshufps {{.*#+}} xmm3 = xmm4[1,2,2,3]
; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
; AVX1-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
; AVX1-NEXT:    vmovss %xmm2, 32(%rdi)
; AVX1-NEXT:    vmovaps %ymm0, (%rdi)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_mul3x3_f32:
; AVX2:       # %bb.0: # %entry
; AVX2-NEXT:    movq %rdi, %rax
; AVX2-NEXT:    vmovss {{.*#+}} xmm8 = mem[0],zero,zero,zero
; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
; AVX2-NEXT:    vbroadcastss {{[0-9]+}}(%rsp), %xmm1
; AVX2-NEXT:    vmulps %xmm1, %xmm0, %xmm9
; AVX2-NEXT:    vinsertps {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[2,3]
; AVX2-NEXT:    vbroadcastss {{[0-9]+}}(%rsp), %xmm4
; AVX2-NEXT:    vmulps %xmm4, %xmm3, %xmm10
; AVX2-NEXT:    vaddps %xmm10, %xmm9, %xmm9
; AVX2-NEXT:    vinsertps {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[2,3]
; AVX2-NEXT:    vbroadcastss {{[0-9]+}}(%rsp), %xmm7
; AVX2-NEXT:    vmulps %xmm7, %xmm6, %xmm10
; AVX2-NEXT:    vaddps %xmm10, %xmm9, %xmm9
; AVX2-NEXT:    vmulss %xmm1, %xmm2, %xmm1
; AVX2-NEXT:    vmulss %xmm4, %xmm5, %xmm4
; AVX2-NEXT:    vaddss %xmm4, %xmm1, %xmm1
; AVX2-NEXT:    vmulss %xmm7, %xmm8, %xmm4
; AVX2-NEXT:    vaddss %xmm4, %xmm1, %xmm1
; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm9[0,1],xmm1[0],xmm9[3]
; AVX2-NEXT:    vbroadcastss {{[0-9]+}}(%rsp), %xmm4
; AVX2-NEXT:    vmulps %xmm4, %xmm0, %xmm7
; AVX2-NEXT:    vbroadcastss {{[0-9]+}}(%rsp), %xmm9
; AVX2-NEXT:    vmulps %xmm3, %xmm9, %xmm10
; AVX2-NEXT:    vaddps %xmm7, %xmm10, %xmm7
; AVX2-NEXT:    vbroadcastss {{[0-9]+}}(%rsp), %xmm10
; AVX2-NEXT:    vmulps %xmm6, %xmm10, %xmm11
; AVX2-NEXT:    vaddps %xmm7, %xmm11, %xmm7
; AVX2-NEXT:    vmulss %xmm4, %xmm2, %xmm4
; AVX2-NEXT:    vmulss %xmm5, %xmm9, %xmm9
; AVX2-NEXT:    vaddss %xmm4, %xmm9, %xmm4
; AVX2-NEXT:    vmulss %xmm10, %xmm8, %xmm9
; AVX2-NEXT:    vaddss %xmm4, %xmm9, %xmm4
; AVX2-NEXT:    vinsertps {{.*#+}} xmm4 = xmm7[0,1],xmm4[0],xmm7[3]
; AVX2-NEXT:    vbroadcastss {{[0-9]+}}(%rsp), %xmm7
; AVX2-NEXT:    vmulps %xmm7, %xmm0, %xmm0
; AVX2-NEXT:    vbroadcastss {{[0-9]+}}(%rsp), %xmm9
; AVX2-NEXT:    vmulps %xmm3, %xmm9, %xmm3
; AVX2-NEXT:    vaddps %xmm3, %xmm0, %xmm0
; AVX2-NEXT:    vbroadcastss {{[0-9]+}}(%rsp), %xmm3
; AVX2-NEXT:    vmulps %xmm3, %xmm6, %xmm6
; AVX2-NEXT:    vaddps %xmm6, %xmm0, %xmm0
; AVX2-NEXT:    vmulss %xmm7, %xmm2, %xmm2
; AVX2-NEXT:    vmulss %xmm5, %xmm9, %xmm5
; AVX2-NEXT:    vaddss %xmm5, %xmm2, %xmm2
; AVX2-NEXT:    vmulss %xmm3, %xmm8, %xmm3
; AVX2-NEXT:    vaddss %xmm3, %xmm2, %xmm2
; AVX2-NEXT:    vinsertf128 $1, %xmm4, %ymm1, %ymm1
; AVX2-NEXT:    vmovaps {{.*#+}} ymm3 = [0,1,2,4,5,6,u,u]
; AVX2-NEXT:    vpermps %ymm1, %ymm3, %ymm1
; AVX2-NEXT:    vbroadcastsd %xmm0, %ymm0
; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
; AVX2-NEXT:    vmovss %xmm2, 32(%rdi)
; AVX2-NEXT:    vmovaps %ymm0, (%rdi)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: test_mul3x3_f32:
; AVX512F:       # %bb.0: # %entry
; AVX512F-NEXT:    valignd {{.*#+}} zmm2 = zmm0[3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2]
; AVX512F-NEXT:    vbroadcastss %xmm1, %xmm3
; AVX512F-NEXT:    vmulps %xmm3, %xmm0, %xmm3
; AVX512F-NEXT:    vextractf128 $1, %ymm0, %xmm5
; AVX512F-NEXT:    vmovshdup {{.*#+}} xmm6 = xmm1[1,1,3,3]
; AVX512F-NEXT:    vmulps %xmm6, %xmm2, %xmm4
; AVX512F-NEXT:    vaddps %xmm4, %xmm3, %xmm4
; AVX512F-NEXT:    vshufpd {{.*#+}} xmm3 = xmm5[1,0]
; AVX512F-NEXT:    vshufps {{.*#+}} xmm7 = xmm1[3,3,3,3]
; AVX512F-NEXT:    vshufpd {{.*#+}} xmm8 = xmm1[1,0]
; AVX512F-NEXT:    vshufps {{.*#+}} xmm9 = xmm1[2,2,2,2]
; AVX512F-NEXT:    vmulps %xmm3, %xmm9, %xmm9
; AVX512F-NEXT:    vaddps %xmm4, %xmm9, %xmm9
; AVX512F-NEXT:    vshufpd {{.*#+}} xmm4 = xmm0[1,0]
; AVX512F-NEXT:    vmulss %xmm1, %xmm4, %xmm10
; AVX512F-NEXT:    vmovshdup {{.*#+}} xmm5 = xmm5[1,1,3,3]
; AVX512F-NEXT:    vmulss %xmm6, %xmm5, %xmm6
; AVX512F-NEXT:    vaddss %xmm6, %xmm10, %xmm6
; AVX512F-NEXT:    vextractf32x4 $2, %zmm0, %xmm10
; AVX512F-NEXT:    vmulss %xmm8, %xmm10, %xmm8
; AVX512F-NEXT:    vaddss %xmm6, %xmm8, %xmm6
; AVX512F-NEXT:    vinsertps {{.*#+}} xmm6 = xmm9[0,1],xmm6[0],xmm9[3]
; AVX512F-NEXT:    vmulps %xmm7, %xmm0, %xmm8
; AVX512F-NEXT:    vextractf128 $1, %ymm1, %xmm9
; AVX512F-NEXT:    vmovsldup {{.*#+}} xmm11 = xmm9[0,0,2,2]
; AVX512F-NEXT:    vmulps %xmm2, %xmm11, %xmm11
; AVX512F-NEXT:    vaddps %xmm11, %xmm8, %xmm8
; AVX512F-NEXT:    vmovshdup {{.*#+}} xmm11 = xmm9[1,1,3,3]
; AVX512F-NEXT:    vmulps %xmm3, %xmm11, %xmm12
; AVX512F-NEXT:    vaddps %xmm12, %xmm8, %xmm8
; AVX512F-NEXT:    vmulss %xmm7, %xmm4, %xmm7
; AVX512F-NEXT:    vmulss %xmm5, %xmm9, %xmm12
; AVX512F-NEXT:    vaddss %xmm7, %xmm12, %xmm7
; AVX512F-NEXT:    vmulss %xmm11, %xmm10, %xmm11
; AVX512F-NEXT:    vaddss %xmm7, %xmm11, %xmm7
; AVX512F-NEXT:    vinsertps {{.*#+}} xmm7 = xmm8[0,1],xmm7[0],xmm8[3]
; AVX512F-NEXT:    vshufps {{.*#+}} xmm8 = xmm9[3,3,3,3]
; AVX512F-NEXT:    vshufpd {{.*#+}} xmm11 = xmm9[1,0]
; AVX512F-NEXT:    vshufps {{.*#+}} xmm9 = xmm9[2,2,2,2]
; AVX512F-NEXT:    vmulps %xmm0, %xmm9, %xmm0
; AVX512F-NEXT:    vmulps %xmm2, %xmm8, %xmm2
; AVX512F-NEXT:    vaddps %xmm2, %xmm0, %xmm0
; AVX512F-NEXT:    vextractf32x4 $2, %zmm1, %xmm1
; AVX512F-NEXT:    vbroadcastss %xmm1, %xmm2
; AVX512F-NEXT:    vmulps %xmm2, %xmm3, %xmm2
; AVX512F-NEXT:    vaddps %xmm2, %xmm0, %xmm0
; AVX512F-NEXT:    vmulss %xmm4, %xmm11, %xmm2
; AVX512F-NEXT:    vmulss %xmm5, %xmm8, %xmm3
; AVX512F-NEXT:    vaddss %xmm3, %xmm2, %xmm2
; AVX512F-NEXT:    vmulss %xmm1, %xmm10, %xmm1
; AVX512F-NEXT:    vaddss %xmm1, %xmm2, %xmm1
; AVX512F-NEXT:    vinsertps {{.*#+}} xmm1 = xmm0[0,1],xmm1[0],xmm0[3]
; AVX512F-NEXT:    vinsertf32x4 $1, %xmm7, %zmm6, %zmm2
; AVX512F-NEXT:    vmovaps {{.*#+}} zmm0 = [0,1,2,4,5,6,16,17,18,u,u,u,u,u,u,u]
; AVX512F-NEXT:    vpermi2ps %zmm1, %zmm2, %zmm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: test_mul3x3_f32:
; AVX512VL:       # %bb.0: # %entry
; AVX512VL-NEXT:    valignd {{.*#+}} zmm2 = zmm0[3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2]
; AVX512VL-NEXT:    vbroadcastss %xmm1, %xmm3
; AVX512VL-NEXT:    vmulps %xmm3, %xmm0, %xmm3
; AVX512VL-NEXT:    vextractf128 $1, %ymm0, %xmm4
; AVX512VL-NEXT:    vmovshdup {{.*#+}} xmm5 = xmm1[1,1,3,3]
; AVX512VL-NEXT:    vmulps %xmm5, %xmm2, %xmm6
; AVX512VL-NEXT:    vaddps %xmm6, %xmm3, %xmm3
; AVX512VL-NEXT:    vshufpd {{.*#+}} xmm6 = xmm4[1,0]
; AVX512VL-NEXT:    vshufps {{.*#+}} xmm7 = xmm1[3,3,3,3]
; AVX512VL-NEXT:    vshufpd {{.*#+}} xmm8 = xmm1[1,0]
; AVX512VL-NEXT:    vshufps {{.*#+}} xmm9 = xmm1[2,2,2,2]
; AVX512VL-NEXT:    vmulps %xmm6, %xmm9, %xmm9
; AVX512VL-NEXT:    vaddps %xmm3, %xmm9, %xmm3
; AVX512VL-NEXT:    vshufpd {{.*#+}} xmm9 = xmm0[1,0]
; AVX512VL-NEXT:    vmulss %xmm1, %xmm9, %xmm10
; AVX512VL-NEXT:    vmovshdup {{.*#+}} xmm4 = xmm4[1,1,3,3]
; AVX512VL-NEXT:    vmulss %xmm5, %xmm4, %xmm5
; AVX512VL-NEXT:    vaddss %xmm5, %xmm10, %xmm5
; AVX512VL-NEXT:    vextractf32x4 $2, %zmm0, %xmm10
; AVX512VL-NEXT:    vmulss %xmm8, %xmm10, %xmm8
; AVX512VL-NEXT:    vaddss %xmm5, %xmm8, %xmm5
; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm5[0],xmm3[3]
; AVX512VL-NEXT:    vmulps %xmm7, %xmm0, %xmm5
; AVX512VL-NEXT:    vextractf128 $1, %ymm1, %xmm8
; AVX512VL-NEXT:    vmovsldup {{.*#+}} xmm11 = xmm8[0,0,2,2]
; AVX512VL-NEXT:    vmulps %xmm2, %xmm11, %xmm11
; AVX512VL-NEXT:    vaddps %xmm5, %xmm11, %xmm5
; AVX512VL-NEXT:    vmovshdup {{.*#+}} xmm11 = xmm8[1,1,3,3]
; AVX512VL-NEXT:    vmulps %xmm6, %xmm11, %xmm12
; AVX512VL-NEXT:    vaddps %xmm5, %xmm12, %xmm5
; AVX512VL-NEXT:    vmulss %xmm7, %xmm9, %xmm7
; AVX512VL-NEXT:    vmulss %xmm4, %xmm8, %xmm12
; AVX512VL-NEXT:    vaddss %xmm7, %xmm12, %xmm7
; AVX512VL-NEXT:    vmulss %xmm11, %xmm10, %xmm11
; AVX512VL-NEXT:    vaddss %xmm7, %xmm11, %xmm7
; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm5 = xmm5[0,1],xmm7[0],xmm5[3]
; AVX512VL-NEXT:    vshufps {{.*#+}} xmm7 = xmm8[3,3,3,3]
; AVX512VL-NEXT:    vshufpd {{.*#+}} xmm11 = xmm8[1,0]
; AVX512VL-NEXT:    vshufps {{.*#+}} xmm8 = xmm8[2,2,2,2]
; AVX512VL-NEXT:    vmulps %xmm0, %xmm8, %xmm0
; AVX512VL-NEXT:    vmulps %xmm7, %xmm2, %xmm2
; AVX512VL-NEXT:    vaddps %xmm2, %xmm0, %xmm0
; AVX512VL-NEXT:    vextractf32x4 $2, %zmm1, %xmm1
; AVX512VL-NEXT:    vbroadcastss %xmm1, %xmm2
; AVX512VL-NEXT:    vmulps %xmm2, %xmm6, %xmm2
; AVX512VL-NEXT:    vaddps %xmm2, %xmm0, %xmm0
; AVX512VL-NEXT:    vmulss %xmm11, %xmm9, %xmm2
; AVX512VL-NEXT:    vmulss %xmm7, %xmm4, %xmm4
; AVX512VL-NEXT:    vaddss %xmm4, %xmm2, %xmm2
; AVX512VL-NEXT:    vmulss %xmm1, %xmm10, %xmm1
; AVX512VL-NEXT:    vaddss %xmm1, %xmm2, %xmm1
; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm0[0,1],xmm1[0],xmm0[3]
; AVX512VL-NEXT:    vinsertf32x4 $1, %xmm5, %zmm3, %zmm2
; AVX512VL-NEXT:    vmovaps {{.*#+}} zmm0 = [0,1,2,4,5,6,16,17,18,u,u,u,u,u,u,u]
; AVX512VL-NEXT:    vpermi2ps %zmm1, %zmm2, %zmm0
; AVX512VL-NEXT:    retq
entry:
  %block = shufflevector <9 x float> %a0, <9 x float> poison, <2 x i32> <i32 0, i32 1>
  %splat.splat = shufflevector <9 x float> %a1, <9 x float> undef, <2 x i32> zeroinitializer
  %0 = fmul <2 x float> %block, %splat.splat
  %block6 = shufflevector <9 x float> %a0, <9 x float> poison, <2 x i32> <i32 3, i32 4>
  %splat.splat8 = shufflevector <9 x float> %a1, <9 x float> undef, <2 x i32> <i32 1, i32 1>
  %1 = fmul <2 x float> %block6, %splat.splat8
  %2 = fadd <2 x float> %0, %1
  %block9 = shufflevector <9 x float> %a0, <9 x float> poison, <2 x i32> <i32 6, i32 7>
  %splat.splat11 = shufflevector <9 x float> %a1, <9 x float> undef, <2 x i32> <i32 2, i32 2>
  %3 = fmul <2 x float> %block9, %splat.splat11
  %4 = fadd <2 x float> %2, %3
  %5 = shufflevector <2 x float> %4, <2 x float> poison, <3 x i32> <i32 0, i32 1, i32 undef>
  %block12 = shufflevector <9 x float> %a0, <9 x float> poison, <1 x i32> <i32 2>
  %splat.splatinsert13 = shufflevector <9 x float> %a1, <9 x float> undef, <1 x i32> zeroinitializer
  %6 = fmul <1 x float> %block12, %splat.splatinsert13
  %block15 = shufflevector <9 x float> %a0, <9 x float> poison, <1 x i32> <i32 5>
  %splat.splatinsert16 = shufflevector <9 x float> %a1, <9 x float> undef, <1 x i32> <i32 1>
  %7 = fmul <1 x float> %block15, %splat.splatinsert16
  %8 = fadd <1 x float> %6, %7
  %block18 = shufflevector <9 x float> %a0, <9 x float> poison, <1 x i32> <i32 8>
  %splat.splatinsert19 = shufflevector <9 x float> %a1, <9 x float> undef, <1 x i32> <i32 2>
  %9 = fmul <1 x float> %block18, %splat.splatinsert19
  %10 = fadd <1 x float> %8, %9
  %11 = shufflevector <1 x float> %10, <1 x float> poison, <3 x i32> <i32 0, i32 undef, i32 undef>
  %12 = shufflevector <3 x float> %5, <3 x float> %11, <3 x i32> <i32 0, i32 1, i32 3>
  %splat.splat23 = shufflevector <9 x float> %a1, <9 x float> undef, <2 x i32> <i32 3, i32 3>
  %13 = fmul <2 x float> %block, %splat.splat23
  %splat.splat26 = shufflevector <9 x float> %a1, <9 x float> undef, <2 x i32> <i32 4, i32 4>
  %14 = fmul <2 x float> %block6, %splat.splat26
  %15 = fadd <2 x float> %13, %14
  %splat.splat29 = shufflevector <9 x float> %a1, <9 x float> undef, <2 x i32> <i32 5, i32 5>
  %16 = fmul <2 x float> %block9, %splat.splat29
  %17 = fadd <2 x float> %15, %16
  %18 = shufflevector <2 x float> %17, <2 x float> poison, <3 x i32> <i32 0, i32 1, i32 undef>
  %splat.splatinsert31 = shufflevector <9 x float> %a1, <9 x float> undef, <1 x i32> <i32 3>
  %19 = fmul <1 x float> %block12, %splat.splatinsert31
  %splat.splatinsert34 = shufflevector <9 x float> %a1, <9 x float> undef, <1 x i32> <i32 4>
  %20 = fmul <1 x float> %block15, %splat.splatinsert34
  %21 = fadd <1 x float> %19, %20
  %splat.splatinsert37 = shufflevector <9 x float> %a1, <9 x float> undef, <1 x i32> <i32 5>
  %22 = fmul <1 x float> %block18, %splat.splatinsert37
  %23 = fadd <1 x float> %21, %22
  %24 = shufflevector <1 x float> %23, <1 x float> poison, <3 x i32> <i32 0, i32 undef, i32 undef>
  %25 = shufflevector <3 x float> %18, <3 x float> %24, <3 x i32> <i32 0, i32 1, i32 3>
  %splat.splat41 = shufflevector <9 x float> %a1, <9 x float> undef, <2 x i32> <i32 6, i32 6>
  %26 = fmul <2 x float> %block, %splat.splat41
  %splat.splat44 = shufflevector <9 x float> %a1, <9 x float> undef, <2 x i32> <i32 7, i32 7>
  %27 = fmul <2 x float> %block6, %splat.splat44
  %28 = fadd <2 x float> %26, %27
  %splat.splat47 = shufflevector <9 x float> %a1, <9 x float> undef, <2 x i32> <i32 8, i32 8>
  %29 = fmul <2 x float> %block9, %splat.splat47
  %30 = fadd <2 x float> %28, %29
  %31 = shufflevector <2 x float> %30, <2 x float> poison, <3 x i32> <i32 0, i32 1, i32 undef>
  %splat.splatinsert49 = shufflevector <9 x float> %a1, <9 x float> undef, <1 x i32> <i32 6>
  %32 = fmul <1 x float> %block12, %splat.splatinsert49
  %splat.splatinsert52 = shufflevector <9 x float> %a1, <9 x float> undef, <1 x i32> <i32 7>
  %33 = fmul <1 x float> %block15, %splat.splatinsert52
  %34 = fadd <1 x float> %32, %33
  %35 = fmul <9 x float> %a0, %a1
  %36 = shufflevector <9 x float> %35, <9 x float> poison, <1 x i32> <i32 8>
  %37 = fadd <1 x float> %34, %36
  %38 = shufflevector <1 x float> %37, <1 x float> poison, <3 x i32> <i32 0, i32 undef, i32 undef>
  %39 = shufflevector <3 x float> %31, <3 x float> %38, <3 x i32> <i32 0, i32 1, i32 3>
  %40 = shufflevector <3 x float> %12, <3 x float> %25, <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
  %41 = shufflevector <3 x float> %39, <3 x float> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 undef, i32 undef, i32 undef>
  %42 = shufflevector <6 x float> %40, <6 x float> %41, <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
  ret <9 x float> %42
}

define <9 x double> @test_mul3x3_f64(<9 x double> %a0, <9 x double> %a1) nounwind {
; SSE-LABEL: test_mul3x3_f64:
; SSE:       # %bb.0: # %entry
; SSE-NEXT:    movq %rdi, %rax
; SSE-NEXT:    movsd {{.*#+}} xmm8 = mem[0],zero
; SSE-NEXT:    movsd {{.*#+}} xmm10 = mem[0],zero
; SSE-NEXT:    movsd {{.*#+}} xmm9 = mem[0],zero
; SSE-NEXT:    movsd {{.*#+}} xmm11 = mem[0],zero
; SSE-NEXT:    movsd {{.*#+}} xmm12 = mem[0],zero
; SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT:    movapd %xmm2, %xmm13
; SSE-NEXT:    mulsd %xmm12, %xmm13
; SSE-NEXT:    unpcklpd {{.*#+}} xmm12 = xmm12[0,0]
; SSE-NEXT:    mulpd %xmm0, %xmm12
; SSE-NEXT:    unpcklpd {{.*#+}} xmm3 = xmm3[0],xmm4[0]
; SSE-NEXT:    movapd %xmm5, %xmm1
; SSE-NEXT:    mulsd %xmm11, %xmm1
; SSE-NEXT:    unpcklpd {{.*#+}} xmm11 = xmm11[0,0]
; SSE-NEXT:    mulpd %xmm3, %xmm11
; SSE-NEXT:    addpd %xmm12, %xmm11
; SSE-NEXT:    movsd {{.*#+}} xmm4 = mem[0],zero
; SSE-NEXT:    unpcklpd {{.*#+}} xmm6 = xmm6[0],xmm7[0]
; SSE-NEXT:    movapd %xmm9, %xmm7
; SSE-NEXT:    mulsd %xmm4, %xmm7
; SSE-NEXT:    unpcklpd {{.*#+}} xmm4 = xmm4[0,0]
; SSE-NEXT:    mulpd %xmm6, %xmm4
; SSE-NEXT:    addpd %xmm11, %xmm4
; SSE-NEXT:    movsd {{.*#+}} xmm11 = mem[0],zero
; SSE-NEXT:    addsd %xmm13, %xmm1
; SSE-NEXT:    addsd %xmm7, %xmm1
; SSE-NEXT:    movapd %xmm2, %xmm12
; SSE-NEXT:    mulsd %xmm11, %xmm12
; SSE-NEXT:    unpcklpd {{.*#+}} xmm11 = xmm11[0,0]
; SSE-NEXT:    mulpd %xmm0, %xmm11
; SSE-NEXT:    movapd %xmm5, %xmm7
; SSE-NEXT:    mulsd %xmm10, %xmm7
; SSE-NEXT:    unpcklpd {{.*#+}} xmm10 = xmm10[0,0]
; SSE-NEXT:    mulpd %xmm3, %xmm10
; SSE-NEXT:    addpd %xmm11, %xmm10
; SSE-NEXT:    movapd %xmm9, %xmm11
; SSE-NEXT:    mulsd %xmm8, %xmm11
; SSE-NEXT:    unpcklpd {{.*#+}} xmm8 = xmm8[0,0]
; SSE-NEXT:    mulpd %xmm6, %xmm8
; SSE-NEXT:    addpd %xmm10, %xmm8
; SSE-NEXT:    addsd %xmm12, %xmm7
; SSE-NEXT:    addsd %xmm11, %xmm7
; SSE-NEXT:    movsd {{.*#+}} xmm10 = mem[0],zero
; SSE-NEXT:    mulsd %xmm10, %xmm2
; SSE-NEXT:    unpcklpd {{.*#+}} xmm10 = xmm10[0,0]
; SSE-NEXT:    mulpd %xmm0, %xmm10
; SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; SSE-NEXT:    mulsd %xmm0, %xmm5
; SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0,0]
; SSE-NEXT:    mulpd %xmm3, %xmm0
; SSE-NEXT:    addpd %xmm10, %xmm0
; SSE-NEXT:    movsd {{.*#+}} xmm3 = mem[0],zero
; SSE-NEXT:    mulsd %xmm3, %xmm9
; SSE-NEXT:    unpcklpd {{.*#+}} xmm3 = xmm3[0,0]
; SSE-NEXT:    mulpd %xmm6, %xmm3
; SSE-NEXT:    addpd %xmm0, %xmm3
; SSE-NEXT:    addsd %xmm2, %xmm5
; SSE-NEXT:    addsd %xmm9, %xmm5
; SSE-NEXT:    unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm8[0]
; SSE-NEXT:    shufpd {{.*#+}} xmm8 = xmm8[1],xmm7[0]
; SSE-NEXT:    movsd %xmm5, 64(%rdi)
; SSE-NEXT:    movapd %xmm3, 48(%rdi)
; SSE-NEXT:    movapd %xmm4, (%rdi)
; SSE-NEXT:    movapd %xmm8, 32(%rdi)
; SSE-NEXT:    movapd %xmm1, 16(%rdi)
; SSE-NEXT:    retq
;
; AVX1-LABEL: test_mul3x3_f64:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    movq %rdi, %rax
; AVX1-NEXT:    vmovsd {{.*#+}} xmm8 = mem[0],zero
; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm0[0],xmm1[0]
; AVX1-NEXT:    vmovddup {{.*#+}} xmm9 = mem[0,0]
; AVX1-NEXT:    vmulpd %xmm1, %xmm9, %xmm0
; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm3 = xmm3[0],xmm4[0]
; AVX1-NEXT:    vmovddup {{.*#+}} xmm4 = mem[0,0]
; AVX1-NEXT:    vmulpd %xmm4, %xmm3, %xmm10
; AVX1-NEXT:    vaddpd %xmm0, %xmm10, %xmm0
; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm6 = xmm6[0],xmm7[0]
; AVX1-NEXT:    vmovddup {{.*#+}} xmm7 = mem[0,0]
; AVX1-NEXT:    vmulpd %xmm7, %xmm6, %xmm10
; AVX1-NEXT:    vaddpd %xmm0, %xmm10, %xmm0
; AVX1-NEXT:    vmulsd %xmm2, %xmm9, %xmm9
; AVX1-NEXT:    vmulsd %xmm4, %xmm5, %xmm4
; AVX1-NEXT:    vaddsd %xmm4, %xmm9, %xmm4
; AVX1-NEXT:    vmulsd %xmm7, %xmm8, %xmm7
; AVX1-NEXT:    vaddsd %xmm7, %xmm4, %xmm4
; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm4
; AVX1-NEXT:    vmovddup {{.*#+}} xmm7 = mem[0,0]
; AVX1-NEXT:    vmulpd %xmm7, %xmm1, %xmm9
; AVX1-NEXT:    vmovddup {{.*#+}} xmm10 = mem[0,0]
; AVX1-NEXT:    vmulpd %xmm3, %xmm10, %xmm11
; AVX1-NEXT:    vaddpd %xmm11, %xmm9, %xmm9
; AVX1-NEXT:    vmovddup {{.*#+}} xmm11 = mem[0,0]
; AVX1-NEXT:    vmulpd %xmm6, %xmm11, %xmm12
; AVX1-NEXT:    vaddpd %xmm12, %xmm9, %xmm9
; AVX1-NEXT:    vmulsd %xmm7, %xmm2, %xmm7
; AVX1-NEXT:    vmulsd %xmm5, %xmm10, %xmm10
; AVX1-NEXT:    vaddsd %xmm7, %xmm10, %xmm7
; AVX1-NEXT:    vmulsd %xmm11, %xmm8, %xmm10
; AVX1-NEXT:    vaddsd %xmm7, %xmm10, %xmm7
; AVX1-NEXT:    vmovddup {{.*#+}} xmm10 = mem[0,0]
; AVX1-NEXT:    vmulpd %xmm1, %xmm10, %xmm1
; AVX1-NEXT:    vmovddup {{.*#+}} xmm11 = mem[0,0]
; AVX1-NEXT:    vmulpd %xmm3, %xmm11, %xmm3
; AVX1-NEXT:    vaddpd %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vmovddup {{.*#+}} xmm3 = mem[0,0]
; AVX1-NEXT:    vmulpd %xmm3, %xmm6, %xmm6
; AVX1-NEXT:    vaddpd %xmm6, %xmm1, %xmm1
; AVX1-NEXT:    vmulsd %xmm2, %xmm10, %xmm2
; AVX1-NEXT:    vmulsd %xmm5, %xmm11, %xmm5
; AVX1-NEXT:    vaddsd %xmm5, %xmm2, %xmm2
; AVX1-NEXT:    vmulsd %xmm3, %xmm8, %xmm3
; AVX1-NEXT:    vaddsd %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vinsertf128 $1, %xmm9, %ymm0, %ymm0
; AVX1-NEXT:    vshufpd {{.*#+}} ymm0 = ymm4[0],ymm0[1],ymm4[2],ymm0[2]
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm7, %ymm3
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm9, %ymm1
; AVX1-NEXT:    vshufpd {{.*#+}} ymm1 = ymm1[1],ymm3[0],ymm1[2],ymm3[3]
; AVX1-NEXT:    vmovsd %xmm2, 64(%rdi)
; AVX1-NEXT:    vmovapd %ymm1, 32(%rdi)
; AVX1-NEXT:    vmovapd %ymm0, (%rdi)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_mul3x3_f64:
; AVX2:       # %bb.0: # %entry
; AVX2-NEXT:    movq %rdi, %rax
; AVX2-NEXT:    vmovsd {{.*#+}} xmm8 = mem[0],zero
; AVX2-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm0[0],xmm1[0]
; AVX2-NEXT:    vmovddup {{.*#+}} xmm9 = mem[0,0]
; AVX2-NEXT:    vmulpd %xmm1, %xmm9, %xmm0
; AVX2-NEXT:    vunpcklpd {{.*#+}} xmm3 = xmm3[0],xmm4[0]
; AVX2-NEXT:    vmovddup {{.*#+}} xmm4 = mem[0,0]
; AVX2-NEXT:    vmulpd %xmm4, %xmm3, %xmm10
; AVX2-NEXT:    vaddpd %xmm0, %xmm10, %xmm0
; AVX2-NEXT:    vunpcklpd {{.*#+}} xmm6 = xmm6[0],xmm7[0]
; AVX2-NEXT:    vmovddup {{.*#+}} xmm7 = mem[0,0]
; AVX2-NEXT:    vmulpd %xmm7, %xmm6, %xmm10
; AVX2-NEXT:    vaddpd %xmm0, %xmm10, %xmm0
; AVX2-NEXT:    vmulsd %xmm2, %xmm9, %xmm9
; AVX2-NEXT:    vmulsd %xmm4, %xmm5, %xmm4
; AVX2-NEXT:    vaddsd %xmm4, %xmm9, %xmm4
; AVX2-NEXT:    vmulsd %xmm7, %xmm8, %xmm7
; AVX2-NEXT:    vaddsd %xmm7, %xmm4, %xmm4
; AVX2-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm4
; AVX2-NEXT:    vmovddup {{.*#+}} xmm7 = mem[0,0]
; AVX2-NEXT:    vmulpd %xmm7, %xmm1, %xmm9
; AVX2-NEXT:    vmovddup {{.*#+}} xmm10 = mem[0,0]
; AVX2-NEXT:    vmulpd %xmm3, %xmm10, %xmm11
; AVX2-NEXT:    vaddpd %xmm11, %xmm9, %xmm9
; AVX2-NEXT:    vmovddup {{.*#+}} xmm11 = mem[0,0]
; AVX2-NEXT:    vmulpd %xmm6, %xmm11, %xmm12
; AVX2-NEXT:    vaddpd %xmm12, %xmm9, %xmm9
; AVX2-NEXT:    vmulsd %xmm7, %xmm2, %xmm7
; AVX2-NEXT:    vmulsd %xmm5, %xmm10, %xmm10
; AVX2-NEXT:    vaddsd %xmm7, %xmm10, %xmm7
; AVX2-NEXT:    vmulsd %xmm11, %xmm8, %xmm10
; AVX2-NEXT:    vaddsd %xmm7, %xmm10, %xmm7
; AVX2-NEXT:    vmovddup {{.*#+}} xmm10 = mem[0,0]
; AVX2-NEXT:    vmulpd %xmm1, %xmm10, %xmm1
; AVX2-NEXT:    vmovddup {{.*#+}} xmm11 = mem[0,0]
; AVX2-NEXT:    vmulpd %xmm3, %xmm11, %xmm3
; AVX2-NEXT:    vaddpd %xmm3, %xmm1, %xmm1
; AVX2-NEXT:    vmovddup {{.*#+}} xmm3 = mem[0,0]
; AVX2-NEXT:    vmulpd %xmm3, %xmm6, %xmm6
; AVX2-NEXT:    vaddpd %xmm6, %xmm1, %xmm1
; AVX2-NEXT:    vmulsd %xmm2, %xmm10, %xmm2
; AVX2-NEXT:    vmulsd %xmm5, %xmm11, %xmm5
; AVX2-NEXT:    vaddsd %xmm5, %xmm2, %xmm2
; AVX2-NEXT:    vmulsd %xmm3, %xmm8, %xmm3
; AVX2-NEXT:    vaddsd %xmm3, %xmm2, %xmm2
; AVX2-NEXT:    vinsertf128 $1, %xmm9, %ymm0, %ymm0
; AVX2-NEXT:    vshufpd {{.*#+}} ymm0 = ymm4[0],ymm0[1],ymm4[2],ymm0[2]
; AVX2-NEXT:    vinsertf128 $1, %xmm1, %ymm7, %ymm3
; AVX2-NEXT:    vinsertf128 $1, %xmm1, %ymm9, %ymm1
; AVX2-NEXT:    vshufpd {{.*#+}} ymm1 = ymm1[1],ymm3[0],ymm1[2],ymm3[3]
; AVX2-NEXT:    vmovsd %xmm2, 64(%rdi)
; AVX2-NEXT:    vmovapd %ymm1, 32(%rdi)
; AVX2-NEXT:    vmovapd %ymm0, (%rdi)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: test_mul3x3_f64:
; AVX512F:       # %bb.0: # %entry
; AVX512F-NEXT:    movq %rdi, %rax
; AVX512F-NEXT:    vmovsd {{.*#+}} xmm8 = mem[0],zero
; AVX512F-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512F-NEXT:    vmovddup {{.*#+}} xmm9 = mem[0,0]
; AVX512F-NEXT:    vmulpd %xmm0, %xmm9, %xmm10
; AVX512F-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm3[0],xmm4[0]
; AVX512F-NEXT:    vmovddup {{.*#+}} xmm3 = mem[0,0]
; AVX512F-NEXT:    vmulpd %xmm3, %xmm1, %xmm4
; AVX512F-NEXT:    vaddpd %xmm4, %xmm10, %xmm4
; AVX512F-NEXT:    vunpcklpd {{.*#+}} xmm6 = xmm6[0],xmm7[0]
; AVX512F-NEXT:    vmovddup {{.*#+}} xmm7 = mem[0,0]
; AVX512F-NEXT:    vmulpd %xmm7, %xmm6, %xmm10
; AVX512F-NEXT:    vaddpd %xmm4, %xmm10, %xmm4
; AVX512F-NEXT:    vmulsd %xmm2, %xmm9, %xmm9
; AVX512F-NEXT:    vmulsd %xmm3, %xmm5, %xmm3
; AVX512F-NEXT:    vaddsd %xmm3, %xmm9, %xmm3
; AVX512F-NEXT:    vmulsd %xmm7, %xmm8, %xmm7
; AVX512F-NEXT:    vaddsd %xmm7, %xmm3, %xmm3
; AVX512F-NEXT:    vinsertf128 $1, %xmm3, %ymm4, %ymm3
; AVX512F-NEXT:    vmovddup {{.*#+}} xmm4 = mem[0,0]
; AVX512F-NEXT:    vmulpd %xmm4, %xmm0, %xmm7
; AVX512F-NEXT:    vmovddup {{.*#+}} xmm9 = mem[0,0]
; AVX512F-NEXT:    vmulpd %xmm1, %xmm9, %xmm10
; AVX512F-NEXT:    vaddpd %xmm7, %xmm10, %xmm7
; AVX512F-NEXT:    vmovddup {{.*#+}} xmm10 = mem[0,0]
; AVX512F-NEXT:    vmulpd %xmm6, %xmm10, %xmm11
; AVX512F-NEXT:    vaddpd %xmm7, %xmm11, %xmm7
; AVX512F-NEXT:    vmulsd %xmm4, %xmm2, %xmm4
; AVX512F-NEXT:    vmulsd %xmm5, %xmm9, %xmm9
; AVX512F-NEXT:    vaddsd %xmm4, %xmm9, %xmm4
; AVX512F-NEXT:    vmulsd %xmm10, %xmm8, %xmm9
; AVX512F-NEXT:    vaddsd %xmm4, %xmm9, %xmm4
; AVX512F-NEXT:    vinsertf128 $1, %xmm4, %ymm7, %ymm4
; AVX512F-NEXT:    vmovddup {{.*#+}} xmm7 = mem[0,0]
; AVX512F-NEXT:    vmulpd %xmm7, %xmm0, %xmm0
; AVX512F-NEXT:    vmovddup {{.*#+}} xmm9 = mem[0,0]
; AVX512F-NEXT:    vmulpd %xmm1, %xmm9, %xmm1
; AVX512F-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
; AVX512F-NEXT:    vmovddup {{.*#+}} xmm1 = mem[0,0]
; AVX512F-NEXT:    vmulpd %xmm1, %xmm6, %xmm6
; AVX512F-NEXT:    vaddpd %xmm6, %xmm0, %xmm0
; AVX512F-NEXT:    vmulsd %xmm7, %xmm2, %xmm2
; AVX512F-NEXT:    vmulsd %xmm5, %xmm9, %xmm5
; AVX512F-NEXT:    vaddsd %xmm5, %xmm2, %xmm2
; AVX512F-NEXT:    vmulsd %xmm1, %xmm8, %xmm1
; AVX512F-NEXT:    vaddsd %xmm1, %xmm2, %xmm1
; AVX512F-NEXT:    vinsertf64x4 $1, %ymm4, %zmm3, %zmm2
; AVX512F-NEXT:    vmovapd {{.*#+}} zmm3 = [0,1,2,4,5,6,8,9]
; AVX512F-NEXT:    vpermi2pd %zmm0, %zmm2, %zmm3
; AVX512F-NEXT:    vmovsd %xmm1, 64(%rdi)
; AVX512F-NEXT:    vmovapd %zmm3, (%rdi)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: test_mul3x3_f64:
; AVX512VL:       # %bb.0: # %entry
; AVX512VL-NEXT:    movq %rdi, %rax
; AVX512VL-NEXT:    vmovsd {{.*#+}} xmm8 = mem[0],zero
; AVX512VL-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512VL-NEXT:    vmovddup {{.*#+}} xmm1 = mem[0,0]
; AVX512VL-NEXT:    vmulpd %xmm1, %xmm0, %xmm9
; AVX512VL-NEXT:    vunpcklpd {{.*#+}} xmm3 = xmm3[0],xmm4[0]
; AVX512VL-NEXT:    vmovddup {{.*#+}} xmm4 = mem[0,0]
; AVX512VL-NEXT:    vmulpd %xmm4, %xmm3, %xmm10
; AVX512VL-NEXT:    vaddpd %xmm10, %xmm9, %xmm9
; AVX512VL-NEXT:    vunpcklpd {{.*#+}} xmm6 = xmm6[0],xmm7[0]
; AVX512VL-NEXT:    vmovddup {{.*#+}} xmm7 = mem[0,0]
; AVX512VL-NEXT:    vmulpd %xmm7, %xmm6, %xmm10
; AVX512VL-NEXT:    vaddpd %xmm10, %xmm9, %xmm9
; AVX512VL-NEXT:    vmulsd %xmm1, %xmm2, %xmm1
; AVX512VL-NEXT:    vmulsd %xmm4, %xmm5, %xmm4
; AVX512VL-NEXT:    vaddsd %xmm4, %xmm1, %xmm1
; AVX512VL-NEXT:    vmulsd %xmm7, %xmm8, %xmm4
; AVX512VL-NEXT:    vaddsd %xmm4, %xmm1, %xmm1
; AVX512VL-NEXT:    vinsertf128 $1, %xmm1, %ymm9, %ymm1
; AVX512VL-NEXT:    vmovddup {{.*#+}} xmm4 = mem[0,0]
; AVX512VL-NEXT:    vmulpd %xmm4, %xmm0, %xmm7
; AVX512VL-NEXT:    vmovddup {{.*#+}} xmm9 = mem[0,0]
; AVX512VL-NEXT:    vmulpd %xmm3, %xmm9, %xmm10
; AVX512VL-NEXT:    vaddpd %xmm7, %xmm10, %xmm7
; AVX512VL-NEXT:    vmovddup {{.*#+}} xmm10 = mem[0,0]
; AVX512VL-NEXT:    vmulpd %xmm6, %xmm10, %xmm11
; AVX512VL-NEXT:    vaddpd %xmm7, %xmm11, %xmm7
; AVX512VL-NEXT:    vmulsd %xmm4, %xmm2, %xmm4
; AVX512VL-NEXT:    vmulsd %xmm5, %xmm9, %xmm9
; AVX512VL-NEXT:    vaddsd %xmm4, %xmm9, %xmm4
; AVX512VL-NEXT:    vmulsd %xmm10, %xmm8, %xmm9
; AVX512VL-NEXT:    vaddsd %xmm4, %xmm9, %xmm4
; AVX512VL-NEXT:    vinsertf128 $1, %xmm4, %ymm7, %ymm4
; AVX512VL-NEXT:    vmovddup {{.*#+}} xmm7 = mem[0,0]
; AVX512VL-NEXT:    vmulpd %xmm7, %xmm0, %xmm0
; AVX512VL-NEXT:    vmovddup {{.*#+}} xmm9 = mem[0,0]
; AVX512VL-NEXT:    vmulpd %xmm3, %xmm9, %xmm3
; AVX512VL-NEXT:    vaddpd %xmm3, %xmm0, %xmm0
; AVX512VL-NEXT:    vmovddup {{.*#+}} xmm3 = mem[0,0]
; AVX512VL-NEXT:    vmulpd %xmm3, %xmm6, %xmm6
; AVX512VL-NEXT:    vaddpd %xmm6, %xmm0, %xmm0
; AVX512VL-NEXT:    vmulsd %xmm7, %xmm2, %xmm2
; AVX512VL-NEXT:    vmulsd %xmm5, %xmm9, %xmm5
; AVX512VL-NEXT:    vaddsd %xmm5, %xmm2, %xmm2
; AVX512VL-NEXT:    vmulsd %xmm3, %xmm8, %xmm3
; AVX512VL-NEXT:    vaddsd %xmm3, %xmm2, %xmm2
; AVX512VL-NEXT:    vinsertf64x4 $1, %ymm4, %zmm1, %zmm1
; AVX512VL-NEXT:    vmovapd {{.*#+}} zmm3 = [0,1,2,4,5,6,8,9]
; AVX512VL-NEXT:    vpermi2pd %zmm0, %zmm1, %zmm3
; AVX512VL-NEXT:    vmovsd %xmm2, 64(%rdi)
; AVX512VL-NEXT:    vmovapd %zmm3, (%rdi)
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
entry:
  %block = shufflevector <9 x double> %a0, <9 x double> poison, <2 x i32> <i32 0, i32 1>
  %splat.splat = shufflevector <9 x double> %a1, <9 x double> undef, <2 x i32> zeroinitializer
  %0 = fmul <2 x double> %block, %splat.splat
  %block6 = shufflevector <9 x double> %a0, <9 x double> poison, <2 x i32> <i32 3, i32 4>
  %splat.splat8 = shufflevector <9 x double> %a1, <9 x double> undef, <2 x i32> <i32 1, i32 1>
  %1 = fmul <2 x double> %block6, %splat.splat8
  %2 = fadd <2 x double> %0, %1
  %block9 = shufflevector <9 x double> %a0, <9 x double> poison, <2 x i32> <i32 6, i32 7>
  %splat.splat11 = shufflevector <9 x double> %a1, <9 x double> undef, <2 x i32> <i32 2, i32 2>
  %3 = fmul <2 x double> %block9, %splat.splat11
  %4 = fadd <2 x double> %2, %3
  %5 = shufflevector <2 x double> %4, <2 x double> poison, <3 x i32> <i32 0, i32 1, i32 undef>
  %block12 = shufflevector <9 x double> %a0, <9 x double> poison, <1 x i32> <i32 2>
  %splat.splatinsert13 = shufflevector <9 x double> %a1, <9 x double> undef, <1 x i32> zeroinitializer
  %6 = fmul <1 x double> %block12, %splat.splatinsert13
  %block15 = shufflevector <9 x double> %a0, <9 x double> poison, <1 x i32> <i32 5>
  %splat.splatinsert16 = shufflevector <9 x double> %a1, <9 x double> undef, <1 x i32> <i32 1>
  %7 = fmul <1 x double> %block15, %splat.splatinsert16
  %8 = fadd <1 x double> %6, %7
  %block18 = shufflevector <9 x double> %a0, <9 x double> poison, <1 x i32> <i32 8>
  %splat.splatinsert19 = shufflevector <9 x double> %a1, <9 x double> undef, <1 x i32> <i32 2>
  %9 = fmul <1 x double> %block18, %splat.splatinsert19
  %10 = fadd <1 x double> %8, %9
  %11 = shufflevector <1 x double> %10, <1 x double> poison, <3 x i32> <i32 0, i32 undef, i32 undef>
  %12 = shufflevector <3 x double> %5, <3 x double> %11, <3 x i32> <i32 0, i32 1, i32 3>
  %splat.splat23 = shufflevector <9 x double> %a1, <9 x double> undef, <2 x i32> <i32 3, i32 3>
  %13 = fmul <2 x double> %block, %splat.splat23
  %splat.splat26 = shufflevector <9 x double> %a1, <9 x double> undef, <2 x i32> <i32 4, i32 4>
  %14 = fmul <2 x double> %block6, %splat.splat26
  %15 = fadd <2 x double> %13, %14
  %splat.splat29 = shufflevector <9 x double> %a1, <9 x double> undef, <2 x i32> <i32 5, i32 5>
  %16 = fmul <2 x double> %block9, %splat.splat29
  %17 = fadd <2 x double> %15, %16
  %18 = shufflevector <2 x double> %17, <2 x double> poison, <3 x i32> <i32 0, i32 1, i32 undef>
  %splat.splatinsert31 = shufflevector <9 x double> %a1, <9 x double> undef, <1 x i32> <i32 3>
  %19 = fmul <1 x double> %block12, %splat.splatinsert31
  %splat.splatinsert34 = shufflevector <9 x double> %a1, <9 x double> undef, <1 x i32> <i32 4>
  %20 = fmul <1 x double> %block15, %splat.splatinsert34
  %21 = fadd <1 x double> %19, %20
  %splat.splatinsert37 = shufflevector <9 x double> %a1, <9 x double> undef, <1 x i32> <i32 5>
  %22 = fmul <1 x double> %block18, %splat.splatinsert37
  %23 = fadd <1 x double> %21, %22
  %24 = shufflevector <1 x double> %23, <1 x double> poison, <3 x i32> <i32 0, i32 undef, i32 undef>
  %25 = shufflevector <3 x double> %18, <3 x double> %24, <3 x i32> <i32 0, i32 1, i32 3>
  %splat.splat41 = shufflevector <9 x double> %a1, <9 x double> undef, <2 x i32> <i32 6, i32 6>
  %26 = fmul <2 x double> %block, %splat.splat41
  %splat.splat44 = shufflevector <9 x double> %a1, <9 x double> undef, <2 x i32> <i32 7, i32 7>
  %27 = fmul <2 x double> %block6, %splat.splat44
  %28 = fadd <2 x double> %26, %27
  %splat.splat47 = shufflevector <9 x double> %a1, <9 x double> undef, <2 x i32> <i32 8, i32 8>
  %29 = fmul <2 x double> %block9, %splat.splat47
  %30 = fadd <2 x double> %28, %29
  %31 = shufflevector <2 x double> %30, <2 x double> poison, <3 x i32> <i32 0, i32 1, i32 undef>
  %splat.splatinsert49 = shufflevector <9 x double> %a1, <9 x double> undef, <1 x i32> <i32 6>
  %32 = fmul <1 x double> %block12, %splat.splatinsert49
  %splat.splatinsert52 = shufflevector <9 x double> %a1, <9 x double> undef, <1 x i32> <i32 7>
  %33 = fmul <1 x double> %block15, %splat.splatinsert52
  %34 = fadd <1 x double> %32, %33
  %35 = fmul <9 x double> %a0, %a1
  %36 = shufflevector <9 x double> %35, <9 x double> poison, <1 x i32> <i32 8>
  %37 = fadd <1 x double> %34, %36
  %38 = shufflevector <1 x double> %37, <1 x double> poison, <3 x i32> <i32 0, i32 undef, i32 undef>
  %39 = shufflevector <3 x double> %31, <3 x double> %38, <3 x i32> <i32 0, i32 1, i32 3>
  %40 = shufflevector <3 x double> %12, <3 x double> %25, <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
  %41 = shufflevector <3 x double> %39, <3 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 undef, i32 undef, i32 undef>
  %42 = shufflevector <6 x double> %40, <6 x double> %41, <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
  ret <9 x double> %42
}

define <16 x float> @test_mul4x4_f32(<16 x float> %a0, <16 x float> %a1) nounwind {
; SSE-LABEL: test_mul4x4_f32:
; SSE:       # %bb.0: # %entry
; SSE-NEXT:    movaps %xmm0, %xmm9
; SSE-NEXT:    movaps %xmm4, %xmm0
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0],xmm4[0,0]
; SSE-NEXT:    mulps %xmm9, %xmm0
; SSE-NEXT:    movaps %xmm4, %xmm8
; SSE-NEXT:    shufps {{.*#+}} xmm8 = xmm8[1,1],xmm4[1,1]
; SSE-NEXT:    mulps %xmm1, %xmm8
; SSE-NEXT:    addps %xmm0, %xmm8
; SSE-NEXT:    movaps %xmm4, %xmm0
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,2],xmm4[2,2]
; SSE-NEXT:    mulps %xmm2, %xmm0
; SSE-NEXT:    addps %xmm8, %xmm0
; SSE-NEXT:    shufps {{.*#+}} xmm4 = xmm4[3,3,3,3]
; SSE-NEXT:    mulps %xmm3, %xmm4
; SSE-NEXT:    addps %xmm4, %xmm0
; SSE-NEXT:    movaps %xmm5, %xmm4
; SSE-NEXT:    shufps {{.*#+}} xmm4 = xmm4[0,0],xmm5[0,0]
; SSE-NEXT:    mulps %xmm9, %xmm4
; SSE-NEXT:    movaps %xmm5, %xmm10
; SSE-NEXT:    shufps {{.*#+}} xmm10 = xmm10[1,1],xmm5[1,1]
; SSE-NEXT:    mulps %xmm1, %xmm10
; SSE-NEXT:    addps %xmm4, %xmm10
; SSE-NEXT:    movaps %xmm5, %xmm8
; SSE-NEXT:    shufps {{.*#+}} xmm8 = xmm8[2,2],xmm5[2,2]
; SSE-NEXT:    mulps %xmm2, %xmm8
; SSE-NEXT:    addps %xmm10, %xmm8
; SSE-NEXT:    shufps {{.*#+}} xmm5 = xmm5[3,3,3,3]
; SSE-NEXT:    mulps %xmm3, %xmm5
; SSE-NEXT:    addps %xmm5, %xmm8
; SSE-NEXT:    movaps %xmm6, %xmm4
; SSE-NEXT:    shufps {{.*#+}} xmm4 = xmm4[0,0],xmm6[0,0]
; SSE-NEXT:    mulps %xmm9, %xmm4
; SSE-NEXT:    movaps %xmm6, %xmm10
; SSE-NEXT:    shufps {{.*#+}} xmm10 = xmm10[1,1],xmm6[1,1]
; SSE-NEXT:    mulps %xmm1, %xmm10
; SSE-NEXT:    addps %xmm4, %xmm10
; SSE-NEXT:    movaps %xmm6, %xmm5
; SSE-NEXT:    shufps {{.*#+}} xmm5 = xmm5[2,2],xmm6[2,2]
; SSE-NEXT:    mulps %xmm2, %xmm5
; SSE-NEXT:    addps %xmm10, %xmm5
; SSE-NEXT:    shufps {{.*#+}} xmm6 = xmm6[3,3,3,3]
; SSE-NEXT:    mulps %xmm3, %xmm6
; SSE-NEXT:    addps %xmm6, %xmm5
; SSE-NEXT:    movaps %xmm7, %xmm4
; SSE-NEXT:    shufps {{.*#+}} xmm4 = xmm4[0,0],xmm7[0,0]
; SSE-NEXT:    mulps %xmm9, %xmm4
; SSE-NEXT:    movaps %xmm7, %xmm6
; SSE-NEXT:    shufps {{.*#+}} xmm6 = xmm6[1,1],xmm7[1,1]
; SSE-NEXT:    mulps %xmm1, %xmm6
; SSE-NEXT:    addps %xmm4, %xmm6
; SSE-NEXT:    movaps %xmm7, %xmm1
; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,2],xmm7[2,2]
; SSE-NEXT:    mulps %xmm2, %xmm1
; SSE-NEXT:    addps %xmm6, %xmm1
; SSE-NEXT:    shufps {{.*#+}} xmm7 = xmm7[3,3,3,3]
; SSE-NEXT:    mulps %xmm7, %xmm3
; SSE-NEXT:    addps %xmm1, %xmm3
; SSE-NEXT:    movaps %xmm8, %xmm1
; SSE-NEXT:    movaps %xmm5, %xmm2
; SSE-NEXT:    retq
;
; AVX1-LABEL: test_mul4x4_f32:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm5
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
; AVX1-NEXT:    vshufps {{.*#+}} xmm6 = xmm2[0,0,0,0]
; AVX1-NEXT:    vmulps %xmm6, %xmm0, %xmm6
; AVX1-NEXT:    vshufps {{.*#+}} xmm7 = xmm2[1,1,1,1]
; AVX1-NEXT:    vmulps %xmm7, %xmm5, %xmm7
; AVX1-NEXT:    vaddps %xmm7, %xmm6, %xmm6
; AVX1-NEXT:    vshufps {{.*#+}} xmm7 = xmm2[2,2,2,2]
; AVX1-NEXT:    vmulps %xmm7, %xmm1, %xmm7
; AVX1-NEXT:    vaddps %xmm7, %xmm6, %xmm6
; AVX1-NEXT:    vshufps {{.*#+}} xmm7 = xmm2[3,3,3,3]
; AVX1-NEXT:    vmulps %xmm7, %xmm4, %xmm7
; AVX1-NEXT:    vaddps %xmm7, %xmm6, %xmm6
; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm2
; AVX1-NEXT:    vshufps {{.*#+}} xmm7 = xmm2[0,0,0,0]
; AVX1-NEXT:    vmulps %xmm7, %xmm0, %xmm7
; AVX1-NEXT:    vshufps {{.*#+}} xmm8 = xmm2[1,1,1,1]
; AVX1-NEXT:    vmulps %xmm5, %xmm8, %xmm8
; AVX1-NEXT:    vaddps %xmm7, %xmm8, %xmm7
; AVX1-NEXT:    vshufps {{.*#+}} xmm8 = xmm2[2,2,2,2]
; AVX1-NEXT:    vmulps %xmm1, %xmm8, %xmm8
; AVX1-NEXT:    vaddps %xmm7, %xmm8, %xmm7
; AVX1-NEXT:    vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
; AVX1-NEXT:    vmulps %xmm2, %xmm4, %xmm2
; AVX1-NEXT:    vaddps %xmm2, %xmm7, %xmm2
; AVX1-NEXT:    vshufps {{.*#+}} xmm7 = xmm3[0,0,0,0]
; AVX1-NEXT:    vmulps %xmm7, %xmm0, %xmm7
; AVX1-NEXT:    vshufps {{.*#+}} xmm8 = xmm3[1,1,1,1]
; AVX1-NEXT:    vmulps %xmm5, %xmm8, %xmm8
; AVX1-NEXT:    vaddps %xmm7, %xmm8, %xmm7
; AVX1-NEXT:    vshufps {{.*#+}} xmm8 = xmm3[2,2,2,2]
; AVX1-NEXT:    vmulps %xmm1, %xmm8, %xmm8
; AVX1-NEXT:    vaddps %xmm7, %xmm8, %xmm7
; AVX1-NEXT:    vshufps {{.*#+}} xmm8 = xmm3[3,3,3,3]
; AVX1-NEXT:    vmulps %xmm4, %xmm8, %xmm8
; AVX1-NEXT:    vaddps %xmm7, %xmm8, %xmm7
; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm3
; AVX1-NEXT:    vshufps {{.*#+}} xmm8 = xmm3[0,0,0,0]
; AVX1-NEXT:    vmulps %xmm0, %xmm8, %xmm0
; AVX1-NEXT:    vshufps {{.*#+}} xmm8 = xmm3[1,1,1,1]
; AVX1-NEXT:    vmulps %xmm5, %xmm8, %xmm5
; AVX1-NEXT:    vaddps %xmm5, %xmm0, %xmm0
; AVX1-NEXT:    vshufps {{.*#+}} xmm5 = xmm3[2,2,2,2]
; AVX1-NEXT:    vmulps %xmm5, %xmm1, %xmm1
; AVX1-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vshufps {{.*#+}} xmm1 = xmm3[3,3,3,3]
; AVX1-NEXT:    vmulps %xmm1, %xmm4, %xmm1
; AVX1-NEXT:    vaddps %xmm1, %xmm0, %xmm1
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm6, %ymm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm7, %ymm1
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_mul4x4_f32:
; AVX2:       # %bb.0: # %entry
; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm5
; AVX2-NEXT:    vextractf128 $1, %ymm1, %xmm4
; AVX2-NEXT:    vbroadcastss %xmm2, %xmm6
; AVX2-NEXT:    vmulps %xmm6, %xmm0, %xmm6
; AVX2-NEXT:    vshufps {{.*#+}} xmm7 = xmm2[1,1,1,1]
; AVX2-NEXT:    vmulps %xmm7, %xmm5, %xmm7
; AVX2-NEXT:    vaddps %xmm7, %xmm6, %xmm6
; AVX2-NEXT:    vshufps {{.*#+}} xmm7 = xmm2[2,2,2,2]
; AVX2-NEXT:    vmulps %xmm7, %xmm1, %xmm7
; AVX2-NEXT:    vaddps %xmm7, %xmm6, %xmm6
; AVX2-NEXT:    vshufps {{.*#+}} xmm7 = xmm2[3,3,3,3]
; AVX2-NEXT:    vmulps %xmm7, %xmm4, %xmm7
; AVX2-NEXT:    vaddps %xmm7, %xmm6, %xmm6
; AVX2-NEXT:    vextractf128 $1, %ymm2, %xmm2
; AVX2-NEXT:    vbroadcastss %xmm2, %xmm7
; AVX2-NEXT:    vmulps %xmm7, %xmm0, %xmm7
; AVX2-NEXT:    vshufps {{.*#+}} xmm8 = xmm2[1,1,1,1]
; AVX2-NEXT:    vmulps %xmm5, %xmm8, %xmm8
; AVX2-NEXT:    vaddps %xmm7, %xmm8, %xmm7
; AVX2-NEXT:    vshufps {{.*#+}} xmm8 = xmm2[2,2,2,2]
; AVX2-NEXT:    vmulps %xmm1, %xmm8, %xmm8
; AVX2-NEXT:    vaddps %xmm7, %xmm8, %xmm7
; AVX2-NEXT:    vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
; AVX2-NEXT:    vmulps %xmm2, %xmm4, %xmm2
; AVX2-NEXT:    vaddps %xmm2, %xmm7, %xmm2
; AVX2-NEXT:    vbroadcastss %xmm3, %xmm7
; AVX2-NEXT:    vmulps %xmm7, %xmm0, %xmm7
; AVX2-NEXT:    vshufps {{.*#+}} xmm8 = xmm3[1,1,1,1]
; AVX2-NEXT:    vmulps %xmm5, %xmm8, %xmm8
; AVX2-NEXT:    vaddps %xmm7, %xmm8, %xmm7
; AVX2-NEXT:    vshufps {{.*#+}} xmm8 = xmm3[2,2,2,2]
; AVX2-NEXT:    vmulps %xmm1, %xmm8, %xmm8
; AVX2-NEXT:    vaddps %xmm7, %xmm8, %xmm7
; AVX2-NEXT:    vshufps {{.*#+}} xmm8 = xmm3[3,3,3,3]
; AVX2-NEXT:    vmulps %xmm4, %xmm8, %xmm8
; AVX2-NEXT:    vaddps %xmm7, %xmm8, %xmm7
; AVX2-NEXT:    vextractf128 $1, %ymm3, %xmm3
; AVX2-NEXT:    vbroadcastss %xmm3, %xmm8
; AVX2-NEXT:    vmulps %xmm0, %xmm8, %xmm0
; AVX2-NEXT:    vshufps {{.*#+}} xmm8 = xmm3[1,1,1,1]
; AVX2-NEXT:    vmulps %xmm5, %xmm8, %xmm5
; AVX2-NEXT:    vaddps %xmm5, %xmm0, %xmm0
; AVX2-NEXT:    vshufps {{.*#+}} xmm5 = xmm3[2,2,2,2]
; AVX2-NEXT:    vmulps %xmm5, %xmm1, %xmm1
; AVX2-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vshufps {{.*#+}} xmm1 = xmm3[3,3,3,3]
; AVX2-NEXT:    vmulps %xmm1, %xmm4, %xmm1
; AVX2-NEXT:    vaddps %xmm1, %xmm0, %xmm1
; AVX2-NEXT:    vinsertf128 $1, %xmm2, %ymm6, %ymm0
; AVX2-NEXT:    vinsertf128 $1, %xmm1, %ymm7, %ymm1
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: test_mul4x4_f32:
; AVX512F:       # %bb.0: # %entry
; AVX512F-NEXT:    vextractf128 $1, %ymm0, %xmm4
; AVX512F-NEXT:    vextractf32x4 $2, %zmm0, %xmm3
; AVX512F-NEXT:    vextractf32x4 $3, %zmm0, %xmm2
; AVX512F-NEXT:    vbroadcastss %xmm1, %xmm5
; AVX512F-NEXT:    vmulps %xmm5, %xmm0, %xmm5
; AVX512F-NEXT:    vshufps {{.*#+}} xmm6 = xmm1[1,1,1,1]
; AVX512F-NEXT:    vmulps %xmm6, %xmm4, %xmm6
; AVX512F-NEXT:    vaddps %xmm6, %xmm5, %xmm5
; AVX512F-NEXT:    vshufps {{.*#+}} xmm6 = xmm1[2,2,2,2]
; AVX512F-NEXT:    vmulps %xmm6, %xmm3, %xmm6
; AVX512F-NEXT:    vaddps %xmm6, %xmm5, %xmm5
; AVX512F-NEXT:    vshufps {{.*#+}} xmm6 = xmm1[3,3,3,3]
; AVX512F-NEXT:    vmulps %xmm6, %xmm2, %xmm6
; AVX512F-NEXT:    vaddps %xmm6, %xmm5, %xmm5
; AVX512F-NEXT:    vextractf128 $1, %ymm1, %xmm6
; AVX512F-NEXT:    vbroadcastss %xmm6, %xmm7
; AVX512F-NEXT:    vmulps %xmm7, %xmm0, %xmm7
; AVX512F-NEXT:    vshufps {{.*#+}} xmm8 = xmm6[1,1,1,1]
; AVX512F-NEXT:    vmulps %xmm4, %xmm8, %xmm8
; AVX512F-NEXT:    vaddps %xmm7, %xmm8, %xmm7
; AVX512F-NEXT:    vshufps {{.*#+}} xmm8 = xmm6[2,2,2,2]
; AVX512F-NEXT:    vmulps %xmm3, %xmm8, %xmm8
; AVX512F-NEXT:    vaddps %xmm7, %xmm8, %xmm7
; AVX512F-NEXT:    vshufps {{.*#+}} xmm6 = xmm6[3,3,3,3]
; AVX512F-NEXT:    vmulps %xmm6, %xmm2, %xmm6
; AVX512F-NEXT:    vaddps %xmm6, %xmm7, %xmm6
; AVX512F-NEXT:    vextractf32x4 $2, %zmm1, %xmm7
; AVX512F-NEXT:    vbroadcastss %xmm7, %xmm8
; AVX512F-NEXT:    vmulps %xmm0, %xmm8, %xmm8
; AVX512F-NEXT:    vshufps {{.*#+}} xmm9 = xmm7[1,1,1,1]
; AVX512F-NEXT:    vmulps %xmm4, %xmm9, %xmm9
; AVX512F-NEXT:    vaddps %xmm9, %xmm8, %xmm8
; AVX512F-NEXT:    vshufps {{.*#+}} xmm9 = xmm7[2,2,2,2]
; AVX512F-NEXT:    vmulps %xmm3, %xmm9, %xmm9
; AVX512F-NEXT:    vaddps %xmm9, %xmm8, %xmm8
; AVX512F-NEXT:    vshufps {{.*#+}} xmm7 = xmm7[3,3,3,3]
; AVX512F-NEXT:    vmulps %xmm7, %xmm2, %xmm7
; AVX512F-NEXT:    vaddps %xmm7, %xmm8, %xmm7
; AVX512F-NEXT:    vextractf32x4 $3, %zmm1, %xmm1
; AVX512F-NEXT:    vbroadcastss %xmm1, %xmm8
; AVX512F-NEXT:    vmulps %xmm0, %xmm8, %xmm0
; AVX512F-NEXT:    vshufps {{.*#+}} xmm8 = xmm1[1,1,1,1]
; AVX512F-NEXT:    vmulps %xmm4, %xmm8, %xmm4
; AVX512F-NEXT:    vaddps %xmm4, %xmm0, %xmm0
; AVX512F-NEXT:    vshufps {{.*#+}} xmm4 = xmm1[2,2,2,2]
; AVX512F-NEXT:    vmulps %xmm4, %xmm3, %xmm3
; AVX512F-NEXT:    vaddps %xmm3, %xmm0, %xmm0
; AVX512F-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; AVX512F-NEXT:    vmulps %xmm1, %xmm2, %xmm1
; AVX512F-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; AVX512F-NEXT:    vinsertf128 $1, %xmm0, %ymm7, %ymm0
; AVX512F-NEXT:    vinsertf128 $1, %xmm6, %ymm5, %ymm1
; AVX512F-NEXT:    vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: test_mul4x4_f32:
; AVX512VL:       # %bb.0: # %entry
; AVX512VL-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX512VL-NEXT:    vextractf32x4 $2, %zmm0, %xmm3
; AVX512VL-NEXT:    vextractf32x4 $3, %zmm0, %xmm4
; AVX512VL-NEXT:    vbroadcastss %xmm1, %xmm5
; AVX512VL-NEXT:    vmulps %xmm5, %xmm0, %xmm5
; AVX512VL-NEXT:    vshufps {{.*#+}} xmm6 = xmm1[1,1,1,1]
; AVX512VL-NEXT:    vmulps %xmm6, %xmm2, %xmm6
; AVX512VL-NEXT:    vaddps %xmm6, %xmm5, %xmm5
; AVX512VL-NEXT:    vshufps {{.*#+}} xmm6 = xmm1[2,2,2,2]
; AVX512VL-NEXT:    vmulps %xmm6, %xmm3, %xmm6
; AVX512VL-NEXT:    vaddps %xmm6, %xmm5, %xmm5
; AVX512VL-NEXT:    vshufps {{.*#+}} xmm6 = xmm1[3,3,3,3]
; AVX512VL-NEXT:    vmulps %xmm6, %xmm4, %xmm6
; AVX512VL-NEXT:    vaddps %xmm6, %xmm5, %xmm5
; AVX512VL-NEXT:    vextractf128 $1, %ymm1, %xmm6
; AVX512VL-NEXT:    vbroadcastss %xmm6, %xmm7
; AVX512VL-NEXT:    vmulps %xmm7, %xmm0, %xmm7
; AVX512VL-NEXT:    vshufps {{.*#+}} xmm8 = xmm6[1,1,1,1]
; AVX512VL-NEXT:    vmulps %xmm2, %xmm8, %xmm8
; AVX512VL-NEXT:    vaddps %xmm7, %xmm8, %xmm7
; AVX512VL-NEXT:    vshufps {{.*#+}} xmm8 = xmm6[2,2,2,2]
; AVX512VL-NEXT:    vmulps %xmm3, %xmm8, %xmm8
; AVX512VL-NEXT:    vaddps %xmm7, %xmm8, %xmm7
; AVX512VL-NEXT:    vshufps {{.*#+}} xmm6 = xmm6[3,3,3,3]
; AVX512VL-NEXT:    vmulps %xmm6, %xmm4, %xmm6
; AVX512VL-NEXT:    vaddps %xmm6, %xmm7, %xmm6
; AVX512VL-NEXT:    vextractf32x4 $2, %zmm1, %xmm7
; AVX512VL-NEXT:    vbroadcastss %xmm7, %xmm8
; AVX512VL-NEXT:    vmulps %xmm0, %xmm8, %xmm8
; AVX512VL-NEXT:    vshufps {{.*#+}} xmm9 = xmm7[1,1,1,1]
; AVX512VL-NEXT:    vmulps %xmm2, %xmm9, %xmm9
; AVX512VL-NEXT:    vaddps %xmm9, %xmm8, %xmm8
; AVX512VL-NEXT:    vshufps {{.*#+}} xmm9 = xmm7[2,2,2,2]
; AVX512VL-NEXT:    vmulps %xmm3, %xmm9, %xmm9
; AVX512VL-NEXT:    vaddps %xmm9, %xmm8, %xmm8
; AVX512VL-NEXT:    vshufps {{.*#+}} xmm7 = xmm7[3,3,3,3]
; AVX512VL-NEXT:    vmulps %xmm7, %xmm4, %xmm7
; AVX512VL-NEXT:    vaddps %xmm7, %xmm8, %xmm7
; AVX512VL-NEXT:    vextractf32x4 $3, %zmm1, %xmm1
; AVX512VL-NEXT:    vbroadcastss %xmm1, %xmm8
; AVX512VL-NEXT:    vmulps %xmm0, %xmm8, %xmm0
; AVX512VL-NEXT:    vshufps {{.*#+}} xmm8 = xmm1[1,1,1,1]
; AVX512VL-NEXT:    vmulps %xmm2, %xmm8, %xmm2
; AVX512VL-NEXT:    vaddps %xmm2, %xmm0, %xmm0
; AVX512VL-NEXT:    vshufps {{.*#+}} xmm2 = xmm1[2,2,2,2]
; AVX512VL-NEXT:    vmulps %xmm2, %xmm3, %xmm2
; AVX512VL-NEXT:    vaddps %xmm2, %xmm0, %xmm0
; AVX512VL-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; AVX512VL-NEXT:    vmulps %xmm1, %xmm4, %xmm1
; AVX512VL-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT:    vinsertf128 $1, %xmm0, %ymm7, %ymm0
; AVX512VL-NEXT:    vinsertf128 $1, %xmm6, %ymm5, %ymm1
; AVX512VL-NEXT:    vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512VL-NEXT:    retq
entry:
  %split = shufflevector <16 x float> %a0, <16 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %split1 = shufflevector <16 x float> %a0, <16 x float> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %split2 = shufflevector <16 x float> %a0, <16 x float> poison, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
  %split3 = shufflevector <16 x float> %a0, <16 x float> poison, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
  %splat.splat = shufflevector <16 x float> %a1, <16 x float> undef, <4 x i32> zeroinitializer
  %0 = fmul <4 x float> %split, %splat.splat
  %splat.splat10 = shufflevector <16 x float> %a1, <16 x float> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %1 = fmul <4 x float> %split1, %splat.splat10
  %2 = fadd <4 x float> %0, %1
  %splat.splat13 = shufflevector <16 x float> %a1, <16 x float> undef, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
  %3 = fmul <4 x float> %split2, %splat.splat13
  %4 = fadd <4 x float> %2, %3
  %splat.splat16 = shufflevector <16 x float> %a1, <16 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
  %5 = fmul <4 x float> %split3, %splat.splat16
  %6 = fadd <4 x float> %4, %5
  %splat.splat19 = shufflevector <16 x float> %a1, <16 x float> undef, <4 x i32> <i32 4, i32 4, i32 4, i32 4>
  %7 = fmul <4 x float> %split, %splat.splat19
  %splat.splat22 = shufflevector <16 x float> %a1, <16 x float> undef, <4 x i32> <i32 5, i32 5, i32 5, i32 5>
  %8 = fmul <4 x float> %split1, %splat.splat22
  %9 = fadd <4 x float> %7, %8
  %splat.splat25 = shufflevector <16 x float> %a1, <16 x float> undef, <4 x i32> <i32 6, i32 6, i32 6, i32 6>
  %10 = fmul <4 x float> %split2, %splat.splat25
  %11 = fadd <4 x float> %9, %10
  %splat.splat28 = shufflevector <16 x float> %a1, <16 x float> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
  %12 = fmul <4 x float> %split3, %splat.splat28
  %13 = fadd <4 x float> %11, %12
  %splat.splat31 = shufflevector <16 x float> %a1, <16 x float> undef, <4 x i32> <i32 8, i32 8, i32 8, i32 8>
  %14 = fmul <4 x float> %split, %splat.splat31
  %splat.splat34 = shufflevector <16 x float> %a1, <16 x float> undef, <4 x i32> <i32 9, i32 9, i32 9, i32 9>
  %15 = fmul <4 x float> %split1, %splat.splat34
  %16 = fadd <4 x float> %14, %15
  %splat.splat37 = shufflevector <16 x float> %a1, <16 x float> undef, <4 x i32> <i32 10, i32 10, i32 10, i32 10>
  %17 = fmul <4 x float> %split2, %splat.splat37
  %18 = fadd <4 x float> %16, %17
  %splat.splat40 = shufflevector <16 x float> %a1, <16 x float> undef, <4 x i32> <i32 11, i32 11, i32 11, i32 11>
  %19 = fmul <4 x float> %split3, %splat.splat40
  %20 = fadd <4 x float> %18, %19
  %splat.splat43 = shufflevector <16 x float> %a1, <16 x float> undef, <4 x i32> <i32 12, i32 12, i32 12, i32 12>
  %21 = fmul <4 x float> %split, %splat.splat43
  %splat.splat46 = shufflevector <16 x float> %a1, <16 x float> undef, <4 x i32> <i32 13, i32 13, i32 13, i32 13>
  %22 = fmul <4 x float> %split1, %splat.splat46
  %23 = fadd <4 x float> %21, %22
  %splat.splat49 = shufflevector <16 x float> %a1, <16 x float> undef, <4 x i32> <i32 14, i32 14, i32 14, i32 14>
  %24 = fmul <4 x float> %split2, %splat.splat49
  %25 = fadd <4 x float> %23, %24
  %splat.splat52 = shufflevector <16 x float> %a1, <16 x float> undef, <4 x i32> <i32 15, i32 15, i32 15, i32 15>
  %26 = fmul <4 x float> %split3, %splat.splat52
  %27 = fadd <4 x float> %25, %26
  %28 = shufflevector <4 x float> %6, <4 x float> %13, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %29 = shufflevector <4 x float> %20, <4 x float> %27, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %30 = shufflevector <8 x float> %28, <8 x float> %29, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <16 x float> %30
}

define <16 x double> @test_mul4x4_f64(<16 x double> %a0, <16 x double> %a1) nounwind {
; SSE-LABEL: test_mul4x4_f64:
; SSE:       # %bb.0: # %entry
; SSE-NEXT:    movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movapd %xmm5, %xmm6
; SSE-NEXT:    movapd %xmm4, %xmm5
; SSE-NEXT:    movq %rdi, %rax
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm11
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm9
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm12
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm8
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm10
; SSE-NEXT:    movapd %xmm10, %xmm13
; SSE-NEXT:    unpcklpd {{.*#+}} xmm13 = xmm13[0],xmm10[0]
; SSE-NEXT:    movapd %xmm1, %xmm14
; SSE-NEXT:    mulpd %xmm13, %xmm14
; SSE-NEXT:    mulpd %xmm0, %xmm13
; SSE-NEXT:    unpckhpd {{.*#+}} xmm10 = xmm10[1,1]
; SSE-NEXT:    movapd %xmm3, %xmm15
; SSE-NEXT:    mulpd %xmm10, %xmm15
; SSE-NEXT:    addpd %xmm14, %xmm15
; SSE-NEXT:    mulpd %xmm2, %xmm10
; SSE-NEXT:    movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    addpd %xmm13, %xmm10
; SSE-NEXT:    movapd %xmm8, %xmm13
; SSE-NEXT:    unpcklpd {{.*#+}} xmm13 = xmm13[0],xmm8[0]
; SSE-NEXT:    movapd %xmm4, %xmm14
; SSE-NEXT:    mulpd %xmm13, %xmm14
; SSE-NEXT:    addpd %xmm10, %xmm14
; SSE-NEXT:    movapd %xmm6, %xmm4
; SSE-NEXT:    mulpd %xmm6, %xmm13
; SSE-NEXT:    addpd %xmm15, %xmm13
; SSE-NEXT:    unpckhpd {{.*#+}} xmm8 = xmm8[1,1]
; SSE-NEXT:    movapd %xmm7, %xmm10
; SSE-NEXT:    mulpd %xmm8, %xmm10
; SSE-NEXT:    addpd %xmm13, %xmm10
; SSE-NEXT:    movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
; SSE-NEXT:    mulpd %xmm6, %xmm8
; SSE-NEXT:    addpd %xmm14, %xmm8
; SSE-NEXT:    movapd %xmm12, %xmm13
; SSE-NEXT:    unpcklpd {{.*#+}} xmm13 = xmm13[0],xmm12[0]
; SSE-NEXT:    movapd %xmm1, %xmm14
; SSE-NEXT:    mulpd %xmm13, %xmm14
; SSE-NEXT:    mulpd %xmm0, %xmm13
; SSE-NEXT:    unpckhpd {{.*#+}} xmm12 = xmm12[1,1]
; SSE-NEXT:    movapd %xmm3, %xmm15
; SSE-NEXT:    mulpd %xmm12, %xmm15
; SSE-NEXT:    addpd %xmm14, %xmm15
; SSE-NEXT:    mulpd %xmm2, %xmm12
; SSE-NEXT:    addpd %xmm13, %xmm12
; SSE-NEXT:    movapd %xmm9, %xmm13
; SSE-NEXT:    unpcklpd {{.*#+}} xmm13 = xmm13[0],xmm9[0]
; SSE-NEXT:    movapd %xmm5, %xmm14
; SSE-NEXT:    mulpd %xmm13, %xmm14
; SSE-NEXT:    addpd %xmm12, %xmm14
; SSE-NEXT:    mulpd %xmm4, %xmm13
; SSE-NEXT:    movapd %xmm4, %xmm2
; SSE-NEXT:    addpd %xmm15, %xmm13
; SSE-NEXT:    unpckhpd {{.*#+}} xmm9 = xmm9[1,1]
; SSE-NEXT:    movapd %xmm7, %xmm12
; SSE-NEXT:    mulpd %xmm9, %xmm12
; SSE-NEXT:    addpd %xmm13, %xmm12
; SSE-NEXT:    mulpd %xmm6, %xmm9
; SSE-NEXT:    addpd %xmm14, %xmm9
; SSE-NEXT:    movapd %xmm11, %xmm14
; SSE-NEXT:    unpcklpd {{.*#+}} xmm14 = xmm14[0],xmm11[0]
; SSE-NEXT:    movapd %xmm1, %xmm13
; SSE-NEXT:    mulpd %xmm14, %xmm13
; SSE-NEXT:    unpckhpd {{.*#+}} xmm11 = xmm11[1,1]
; SSE-NEXT:    movapd %xmm3, %xmm15
; SSE-NEXT:    mulpd %xmm11, %xmm15
; SSE-NEXT:    addpd %xmm13, %xmm15
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm13
; SSE-NEXT:    mulpd %xmm0, %xmm14
; SSE-NEXT:    movapd %xmm0, %xmm6
; SSE-NEXT:    movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    mulpd %xmm0, %xmm11
; SSE-NEXT:    addpd %xmm14, %xmm11
; SSE-NEXT:    movapd %xmm13, %xmm14
; SSE-NEXT:    unpcklpd {{.*#+}} xmm14 = xmm14[0],xmm13[0]
; SSE-NEXT:    movapd %xmm5, %xmm4
; SSE-NEXT:    mulpd %xmm14, %xmm4
; SSE-NEXT:    addpd %xmm11, %xmm4
; SSE-NEXT:    mulpd %xmm2, %xmm14
; SSE-NEXT:    addpd %xmm15, %xmm14
; SSE-NEXT:    unpckhpd {{.*#+}} xmm13 = xmm13[1,1]
; SSE-NEXT:    movapd %xmm7, %xmm11
; SSE-NEXT:    mulpd %xmm13, %xmm11
; SSE-NEXT:    addpd %xmm14, %xmm11
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm14
; SSE-NEXT:    movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
; SSE-NEXT:    mulpd %xmm15, %xmm13
; SSE-NEXT:    addpd %xmm4, %xmm13
; SSE-NEXT:    movapd %xmm14, %xmm4
; SSE-NEXT:    unpcklpd {{.*#+}} xmm4 = xmm4[0],xmm14[0]
; SSE-NEXT:    mulpd %xmm4, %xmm1
; SSE-NEXT:    mulpd %xmm6, %xmm4
; SSE-NEXT:    unpckhpd {{.*#+}} xmm14 = xmm14[1,1]
; SSE-NEXT:    mulpd %xmm14, %xmm3
; SSE-NEXT:    addpd %xmm1, %xmm3
; SSE-NEXT:    mulpd %xmm0, %xmm14
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm0
; SSE-NEXT:    addpd %xmm4, %xmm14
; SSE-NEXT:    movapd %xmm0, %xmm1
; SSE-NEXT:    unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT:    mulpd %xmm1, %xmm5
; SSE-NEXT:    addpd %xmm14, %xmm5
; SSE-NEXT:    mulpd %xmm2, %xmm1
; SSE-NEXT:    addpd %xmm3, %xmm1
; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1,1]
; SSE-NEXT:    mulpd %xmm0, %xmm7
; SSE-NEXT:    addpd %xmm1, %xmm7
; SSE-NEXT:    mulpd %xmm15, %xmm0
; SSE-NEXT:    addpd %xmm5, %xmm0
; SSE-NEXT:    movapd %xmm7, 112(%rdi)
; SSE-NEXT:    movapd %xmm0, 96(%rdi)
; SSE-NEXT:    movapd %xmm11, 80(%rdi)
; SSE-NEXT:    movapd %xmm13, 64(%rdi)
; SSE-NEXT:    movapd %xmm12, 48(%rdi)
; SSE-NEXT:    movapd %xmm9, 32(%rdi)
; SSE-NEXT:    movapd %xmm10, 16(%rdi)
; SSE-NEXT:    movapd %xmm8, (%rdi)
; SSE-NEXT:    retq
;
; AVX1-LABEL: test_mul4x4_f64:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    vmovddup {{.*#+}} xmm8 = xmm4[0,0]
; AVX1-NEXT:    vinsertf128 $1, %xmm8, %ymm8, %ymm8
; AVX1-NEXT:    vmulpd %ymm0, %ymm8, %ymm8
; AVX1-NEXT:    vshufpd {{.*#+}} xmm9 = xmm4[1,1]
; AVX1-NEXT:    vinsertf128 $1, %xmm9, %ymm9, %ymm9
; AVX1-NEXT:    vmulpd %ymm1, %ymm9, %ymm9
; AVX1-NEXT:    vaddpd %ymm9, %ymm8, %ymm8
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm4[2,3,2,3]
; AVX1-NEXT:    vmovddup {{.*#+}} ymm9 = ymm4[0,0,2,2]
; AVX1-NEXT:    vmulpd %ymm2, %ymm9, %ymm9
; AVX1-NEXT:    vaddpd %ymm9, %ymm8, %ymm8
; AVX1-NEXT:    vshufpd {{.*#+}} ymm4 = ymm4[1,1,3,3]
; AVX1-NEXT:    vmulpd %ymm4, %ymm3, %ymm4
; AVX1-NEXT:    vaddpd %ymm4, %ymm8, %ymm4
; AVX1-NEXT:    vmovddup {{.*#+}} xmm8 = xmm5[0,0]
; AVX1-NEXT:    vinsertf128 $1, %xmm8, %ymm8, %ymm8
; AVX1-NEXT:    vmulpd %ymm0, %ymm8, %ymm8
; AVX1-NEXT:    vshufpd {{.*#+}} xmm9 = xmm5[1,1]
; AVX1-NEXT:    vinsertf128 $1, %xmm9, %ymm9, %ymm9
; AVX1-NEXT:    vmulpd %ymm1, %ymm9, %ymm9
; AVX1-NEXT:    vaddpd %ymm9, %ymm8, %ymm8
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm5 = ymm5[2,3,2,3]
; AVX1-NEXT:    vmovddup {{.*#+}} ymm9 = ymm5[0,0,2,2]
; AVX1-NEXT:    vmulpd %ymm2, %ymm9, %ymm9
; AVX1-NEXT:    vaddpd %ymm9, %ymm8, %ymm8
; AVX1-NEXT:    vshufpd {{.*#+}} ymm5 = ymm5[1,1,3,3]
; AVX1-NEXT:    vmulpd %ymm5, %ymm3, %ymm5
; AVX1-NEXT:    vaddpd %ymm5, %ymm8, %ymm5
; AVX1-NEXT:    vmovddup {{.*#+}} xmm8 = xmm6[0,0]
; AVX1-NEXT:    vinsertf128 $1, %xmm8, %ymm8, %ymm8
; AVX1-NEXT:    vmulpd %ymm0, %ymm8, %ymm8
; AVX1-NEXT:    vshufpd {{.*#+}} xmm9 = xmm6[1,1]
; AVX1-NEXT:    vinsertf128 $1, %xmm9, %ymm9, %ymm9
; AVX1-NEXT:    vmulpd %ymm1, %ymm9, %ymm9
; AVX1-NEXT:    vaddpd %ymm9, %ymm8, %ymm8
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm6 = ymm6[2,3,2,3]
; AVX1-NEXT:    vmovddup {{.*#+}} ymm9 = ymm6[0,0,2,2]
; AVX1-NEXT:    vmulpd %ymm2, %ymm9, %ymm9
; AVX1-NEXT:    vaddpd %ymm9, %ymm8, %ymm8
; AVX1-NEXT:    vshufpd {{.*#+}} ymm6 = ymm6[1,1,3,3]
; AVX1-NEXT:    vmulpd %ymm6, %ymm3, %ymm6
; AVX1-NEXT:    vaddpd %ymm6, %ymm8, %ymm6
; AVX1-NEXT:    vmovddup {{.*#+}} xmm8 = xmm7[0,0]
; AVX1-NEXT:    vinsertf128 $1, %xmm8, %ymm8, %ymm8
; AVX1-NEXT:    vmulpd %ymm0, %ymm8, %ymm0
; AVX1-NEXT:    vshufpd {{.*#+}} xmm8 = xmm7[1,1]
; AVX1-NEXT:    vinsertf128 $1, %xmm8, %ymm8, %ymm8
; AVX1-NEXT:    vmulpd %ymm1, %ymm8, %ymm1
; AVX1-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm7[2,3,2,3]
; AVX1-NEXT:    vmovddup {{.*#+}} ymm7 = ymm1[0,0,2,2]
; AVX1-NEXT:    vmulpd %ymm7, %ymm2, %ymm2
; AVX1-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
; AVX1-NEXT:    vshufpd {{.*#+}} ymm1 = ymm1[1,1,3,3]
; AVX1-NEXT:    vmulpd %ymm1, %ymm3, %ymm1
; AVX1-NEXT:    vaddpd %ymm1, %ymm0, %ymm3
; AVX1-NEXT:    vmovapd %ymm4, %ymm0
; AVX1-NEXT:    vmovapd %ymm5, %ymm1
; AVX1-NEXT:    vmovapd %ymm6, %ymm2
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_mul4x4_f64:
; AVX2:       # %bb.0: # %entry
; AVX2-NEXT:    vbroadcastsd %xmm4, %ymm8
; AVX2-NEXT:    vmulpd %ymm0, %ymm8, %ymm8
; AVX2-NEXT:    vpermpd {{.*#+}} ymm9 = ymm4[1,1,1,1]
; AVX2-NEXT:    vmulpd %ymm1, %ymm9, %ymm9
; AVX2-NEXT:    vaddpd %ymm9, %ymm8, %ymm8
; AVX2-NEXT:    vpermpd {{.*#+}} ymm9 = ymm4[2,2,2,2]
; AVX2-NEXT:    vmulpd %ymm2, %ymm9, %ymm9
; AVX2-NEXT:    vaddpd %ymm9, %ymm8, %ymm8
; AVX2-NEXT:    vpermpd {{.*#+}} ymm4 = ymm4[3,3,3,3]
; AVX2-NEXT:    vmulpd %ymm4, %ymm3, %ymm4
; AVX2-NEXT:    vaddpd %ymm4, %ymm8, %ymm4
; AVX2-NEXT:    vbroadcastsd %xmm5, %ymm8
; AVX2-NEXT:    vmulpd %ymm0, %ymm8, %ymm8
; AVX2-NEXT:    vpermpd {{.*#+}} ymm9 = ymm5[1,1,1,1]
; AVX2-NEXT:    vmulpd %ymm1, %ymm9, %ymm9
; AVX2-NEXT:    vaddpd %ymm9, %ymm8, %ymm8
; AVX2-NEXT:    vpermpd {{.*#+}} ymm9 = ymm5[2,2,2,2]
; AVX2-NEXT:    vmulpd %ymm2, %ymm9, %ymm9
; AVX2-NEXT:    vaddpd %ymm9, %ymm8, %ymm8
; AVX2-NEXT:    vpermpd {{.*#+}} ymm5 = ymm5[3,3,3,3]
; AVX2-NEXT:    vmulpd %ymm5, %ymm3, %ymm5
; AVX2-NEXT:    vaddpd %ymm5, %ymm8, %ymm5
; AVX2-NEXT:    vbroadcastsd %xmm6, %ymm8
; AVX2-NEXT:    vmulpd %ymm0, %ymm8, %ymm8
; AVX2-NEXT:    vpermpd {{.*#+}} ymm9 = ymm6[1,1,1,1]
; AVX2-NEXT:    vmulpd %ymm1, %ymm9, %ymm9
; AVX2-NEXT:    vaddpd %ymm9, %ymm8, %ymm8
; AVX2-NEXT:    vpermpd {{.*#+}} ymm9 = ymm6[2,2,2,2]
; AVX2-NEXT:    vmulpd %ymm2, %ymm9, %ymm9
; AVX2-NEXT:    vaddpd %ymm9, %ymm8, %ymm8
; AVX2-NEXT:    vpermpd {{.*#+}} ymm6 = ymm6[3,3,3,3]
; AVX2-NEXT:    vmulpd %ymm6, %ymm3, %ymm6
; AVX2-NEXT:    vaddpd %ymm6, %ymm8, %ymm6
; AVX2-NEXT:    vbroadcastsd %xmm7, %ymm8
; AVX2-NEXT:    vmulpd %ymm0, %ymm8, %ymm0
; AVX2-NEXT:    vpermpd {{.*#+}} ymm8 = ymm7[1,1,1,1]
; AVX2-NEXT:    vmulpd %ymm1, %ymm8, %ymm1
; AVX2-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpermpd {{.*#+}} ymm1 = ymm7[2,2,2,2]
; AVX2-NEXT:    vmulpd %ymm1, %ymm2, %ymm1
; AVX2-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpermpd {{.*#+}} ymm1 = ymm7[3,3,3,3]
; AVX2-NEXT:    vmulpd %ymm1, %ymm3, %ymm1
; AVX2-NEXT:    vaddpd %ymm1, %ymm0, %ymm3
; AVX2-NEXT:    vmovapd %ymm4, %ymm0
; AVX2-NEXT:    vmovapd %ymm5, %ymm1
; AVX2-NEXT:    vmovapd %ymm6, %ymm2
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: test_mul4x4_f64:
; AVX512F:       # %bb.0: # %entry
; AVX512F-NEXT:    vextractf64x4 $1, %zmm0, %ymm5
; AVX512F-NEXT:    vextractf64x4 $1, %zmm1, %ymm4
; AVX512F-NEXT:    vbroadcastsd %xmm2, %ymm6
; AVX512F-NEXT:    vmulpd %ymm6, %ymm0, %ymm6
; AVX512F-NEXT:    vpermpd {{.*#+}} ymm7 = ymm2[1,1,1,1]
; AVX512F-NEXT:    vmulpd %ymm7, %ymm5, %ymm7
; AVX512F-NEXT:    vaddpd %ymm7, %ymm6, %ymm6
; AVX512F-NEXT:    vpermpd {{.*#+}} ymm7 = ymm2[2,2,2,2]
; AVX512F-NEXT:    vmulpd %ymm7, %ymm1, %ymm7
; AVX512F-NEXT:    vaddpd %ymm7, %ymm6, %ymm6
; AVX512F-NEXT:    vpermpd {{.*#+}} ymm7 = ymm2[3,3,3,3]
; AVX512F-NEXT:    vmulpd %ymm7, %ymm4, %ymm7
; AVX512F-NEXT:    vaddpd %ymm7, %ymm6, %ymm6
; AVX512F-NEXT:    vextractf64x4 $1, %zmm2, %ymm2
; AVX512F-NEXT:    vbroadcastsd %xmm2, %ymm7
; AVX512F-NEXT:    vmulpd %ymm7, %ymm0, %ymm7
; AVX512F-NEXT:    vpermpd {{.*#+}} ymm8 = ymm2[1,1,1,1]
; AVX512F-NEXT:    vmulpd %ymm5, %ymm8, %ymm8
; AVX512F-NEXT:    vaddpd %ymm7, %ymm8, %ymm7
; AVX512F-NEXT:    vpermpd {{.*#+}} ymm8 = ymm2[2,2,2,2]
; AVX512F-NEXT:    vmulpd %ymm1, %ymm8, %ymm8
; AVX512F-NEXT:    vaddpd %ymm7, %ymm8, %ymm7
; AVX512F-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[3,3,3,3]
; AVX512F-NEXT:    vmulpd %ymm2, %ymm4, %ymm2
; AVX512F-NEXT:    vaddpd %ymm2, %ymm7, %ymm2
; AVX512F-NEXT:    vbroadcastsd %xmm3, %ymm7
; AVX512F-NEXT:    vmulpd %ymm7, %ymm0, %ymm7
; AVX512F-NEXT:    vpermpd {{.*#+}} ymm8 = ymm3[1,1,1,1]
; AVX512F-NEXT:    vmulpd %ymm5, %ymm8, %ymm8
; AVX512F-NEXT:    vaddpd %ymm7, %ymm8, %ymm7
; AVX512F-NEXT:    vpermpd {{.*#+}} ymm8 = ymm3[2,2,2,2]
; AVX512F-NEXT:    vmulpd %ymm1, %ymm8, %ymm8
; AVX512F-NEXT:    vaddpd %ymm7, %ymm8, %ymm7
; AVX512F-NEXT:    vpermpd {{.*#+}} ymm8 = ymm3[3,3,3,3]
; AVX512F-NEXT:    vmulpd %ymm4, %ymm8, %ymm8
; AVX512F-NEXT:    vaddpd %ymm7, %ymm8, %ymm7
; AVX512F-NEXT:    vextractf64x4 $1, %zmm3, %ymm3
; AVX512F-NEXT:    vbroadcastsd %xmm3, %ymm8
; AVX512F-NEXT:    vmulpd %ymm0, %ymm8, %ymm0
; AVX512F-NEXT:    vpermpd {{.*#+}} ymm8 = ymm3[1,1,1,1]
; AVX512F-NEXT:    vmulpd %ymm5, %ymm8, %ymm5
; AVX512F-NEXT:    vaddpd %ymm5, %ymm0, %ymm0
; AVX512F-NEXT:    vpermpd {{.*#+}} ymm5 = ymm3[2,2,2,2]
; AVX512F-NEXT:    vmulpd %ymm5, %ymm1, %ymm1
; AVX512F-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; AVX512F-NEXT:    vpermpd {{.*#+}} ymm1 = ymm3[3,3,3,3]
; AVX512F-NEXT:    vmulpd %ymm1, %ymm4, %ymm1
; AVX512F-NEXT:    vaddpd %ymm1, %ymm0, %ymm1
; AVX512F-NEXT:    vinsertf64x4 $1, %ymm2, %zmm6, %zmm0
; AVX512F-NEXT:    vinsertf64x4 $1, %ymm1, %zmm7, %zmm1
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: test_mul4x4_f64:
; AVX512VL:       # %bb.0: # %entry
; AVX512VL-NEXT:    vextractf64x4 $1, %zmm0, %ymm4
; AVX512VL-NEXT:    vextractf64x4 $1, %zmm1, %ymm5
; AVX512VL-NEXT:    vbroadcastsd %xmm2, %ymm6
; AVX512VL-NEXT:    vmulpd %ymm6, %ymm0, %ymm6
; AVX512VL-NEXT:    vpermpd {{.*#+}} ymm7 = ymm2[1,1,1,1]
; AVX512VL-NEXT:    vmulpd %ymm7, %ymm4, %ymm7
; AVX512VL-NEXT:    vaddpd %ymm7, %ymm6, %ymm6
; AVX512VL-NEXT:    vpermpd {{.*#+}} ymm7 = ymm2[2,2,2,2]
; AVX512VL-NEXT:    vmulpd %ymm7, %ymm1, %ymm7
; AVX512VL-NEXT:    vaddpd %ymm7, %ymm6, %ymm6
; AVX512VL-NEXT:    vpermpd {{.*#+}} ymm7 = ymm2[3,3,3,3]
; AVX512VL-NEXT:    vmulpd %ymm7, %ymm5, %ymm7
; AVX512VL-NEXT:    vaddpd %ymm7, %ymm6, %ymm6
; AVX512VL-NEXT:    vextractf64x4 $1, %zmm2, %ymm2
; AVX512VL-NEXT:    vbroadcastsd %xmm2, %ymm7
; AVX512VL-NEXT:    vmulpd %ymm7, %ymm0, %ymm7
; AVX512VL-NEXT:    vpermpd {{.*#+}} ymm8 = ymm2[1,1,1,1]
; AVX512VL-NEXT:    vmulpd %ymm4, %ymm8, %ymm8
; AVX512VL-NEXT:    vaddpd %ymm7, %ymm8, %ymm7
; AVX512VL-NEXT:    vpermpd {{.*#+}} ymm8 = ymm2[2,2,2,2]
; AVX512VL-NEXT:    vmulpd %ymm1, %ymm8, %ymm8
; AVX512VL-NEXT:    vaddpd %ymm7, %ymm8, %ymm7
; AVX512VL-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[3,3,3,3]
; AVX512VL-NEXT:    vmulpd %ymm2, %ymm5, %ymm2
; AVX512VL-NEXT:    vaddpd %ymm2, %ymm7, %ymm2
; AVX512VL-NEXT:    vbroadcastsd %xmm3, %ymm7
; AVX512VL-NEXT:    vmulpd %ymm7, %ymm0, %ymm7
; AVX512VL-NEXT:    vpermpd {{.*#+}} ymm8 = ymm3[1,1,1,1]
; AVX512VL-NEXT:    vmulpd %ymm4, %ymm8, %ymm8
; AVX512VL-NEXT:    vaddpd %ymm7, %ymm8, %ymm7
; AVX512VL-NEXT:    vpermpd {{.*#+}} ymm8 = ymm3[2,2,2,2]
; AVX512VL-NEXT:    vmulpd %ymm1, %ymm8, %ymm8
; AVX512VL-NEXT:    vaddpd %ymm7, %ymm8, %ymm7
; AVX512VL-NEXT:    vpermpd {{.*#+}} ymm8 = ymm3[3,3,3,3]
; AVX512VL-NEXT:    vmulpd %ymm5, %ymm8, %ymm8
; AVX512VL-NEXT:    vaddpd %ymm7, %ymm8, %ymm7
; AVX512VL-NEXT:    vextractf64x4 $1, %zmm3, %ymm3
; AVX512VL-NEXT:    vbroadcastsd %xmm3, %ymm8
; AVX512VL-NEXT:    vmulpd %ymm0, %ymm8, %ymm0
; AVX512VL-NEXT:    vpermpd {{.*#+}} ymm8 = ymm3[1,1,1,1]
; AVX512VL-NEXT:    vmulpd %ymm4, %ymm8, %ymm4
; AVX512VL-NEXT:    vaddpd %ymm4, %ymm0, %ymm0
; AVX512VL-NEXT:    vpermpd {{.*#+}} ymm4 = ymm3[2,2,2,2]
; AVX512VL-NEXT:    vmulpd %ymm4, %ymm1, %ymm1
; AVX512VL-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; AVX512VL-NEXT:    vpermpd {{.*#+}} ymm1 = ymm3[3,3,3,3]
; AVX512VL-NEXT:    vmulpd %ymm1, %ymm5, %ymm1
; AVX512VL-NEXT:    vaddpd %ymm1, %ymm0, %ymm1
; AVX512VL-NEXT:    vinsertf64x4 $1, %ymm2, %zmm6, %zmm0
; AVX512VL-NEXT:    vinsertf64x4 $1, %ymm1, %zmm7, %zmm1
; AVX512VL-NEXT:    retq
entry:
  %split = shufflevector <16 x double> %a0, <16 x double> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %split1 = shufflevector <16 x double> %a0, <16 x double> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %split2 = shufflevector <16 x double> %a0, <16 x double> poison, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
  %split3 = shufflevector <16 x double> %a0, <16 x double> poison, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
  %splat.splat = shufflevector <16 x double> %a1, <16 x double> undef, <4 x i32> zeroinitializer
  %0 = fmul <4 x double> %split, %splat.splat
  %splat.splat10 = shufflevector <16 x double> %a1, <16 x double> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %1 = fmul <4 x double> %split1, %splat.splat10
  %2 = fadd <4 x double> %0, %1
  %splat.splat13 = shufflevector <16 x double> %a1, <16 x double> undef, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
  %3 = fmul <4 x double> %split2, %splat.splat13
  %4 = fadd <4 x double> %2, %3
  %splat.splat16 = shufflevector <16 x double> %a1, <16 x double> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
  %5 = fmul <4 x double> %split3, %splat.splat16
  %6 = fadd <4 x double> %4, %5
  %splat.splat19 = shufflevector <16 x double> %a1, <16 x double> undef, <4 x i32> <i32 4, i32 4, i32 4, i32 4>
  %7 = fmul <4 x double> %split, %splat.splat19
  %splat.splat22 = shufflevector <16 x double> %a1, <16 x double> undef, <4 x i32> <i32 5, i32 5, i32 5, i32 5>
  %8 = fmul <4 x double> %split1, %splat.splat22
  %9 = fadd <4 x double> %7, %8
  %splat.splat25 = shufflevector <16 x double> %a1, <16 x double> undef, <4 x i32> <i32 6, i32 6, i32 6, i32 6>
  %10 = fmul <4 x double> %split2, %splat.splat25
  %11 = fadd <4 x double> %9, %10
  %splat.splat28 = shufflevector <16 x double> %a1, <16 x double> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
  %12 = fmul <4 x double> %split3, %splat.splat28
  %13 = fadd <4 x double> %11, %12
  %splat.splat31 = shufflevector <16 x double> %a1, <16 x double> undef, <4 x i32> <i32 8, i32 8, i32 8, i32 8>
  %14 = fmul <4 x double> %split, %splat.splat31
  %splat.splat34 = shufflevector <16 x double> %a1, <16 x double> undef, <4 x i32> <i32 9, i32 9, i32 9, i32 9>
  %15 = fmul <4 x double> %split1, %splat.splat34
  %16 = fadd <4 x double> %14, %15
  %splat.splat37 = shufflevector <16 x double> %a1, <16 x double> undef, <4 x i32> <i32 10, i32 10, i32 10, i32 10>
  %17 = fmul <4 x double> %split2, %splat.splat37
  %18 = fadd <4 x double> %16, %17
  %splat.splat40 = shufflevector <16 x double> %a1, <16 x double> undef, <4 x i32> <i32 11, i32 11, i32 11, i32 11>
  %19 = fmul <4 x double> %split3, %splat.splat40
  %20 = fadd <4 x double> %18, %19
  %splat.splat43 = shufflevector <16 x double> %a1, <16 x double> undef, <4 x i32> <i32 12, i32 12, i32 12, i32 12>
  %21 = fmul <4 x double> %split, %splat.splat43
  %splat.splat46 = shufflevector <16 x double> %a1, <16 x double> undef, <4 x i32> <i32 13, i32 13, i32 13, i32 13>
  %22 = fmul <4 x double> %split1, %splat.splat46
  %23 = fadd <4 x double> %21, %22
  %splat.splat49 = shufflevector <16 x double> %a1, <16 x double> undef, <4 x i32> <i32 14, i32 14, i32 14, i32 14>
  %24 = fmul <4 x double> %split2, %splat.splat49
  %25 = fadd <4 x double> %23, %24
  %splat.splat52 = shufflevector <16 x double> %a1, <16 x double> undef, <4 x i32> <i32 15, i32 15, i32 15, i32 15>
  %26 = fmul <4 x double> %split3, %splat.splat52
  %27 = fadd <4 x double> %25, %26
  %28 = shufflevector <4 x double> %6, <4 x double> %13, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %29 = shufflevector <4 x double> %20, <4 x double> %27, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %30 = shufflevector <8 x double> %28, <8 x double> %29, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <16 x double> %30
}

define <64 x float> @test_mul8x8_f32(<64 x float> %a0, <64 x float> %a1) nounwind {
; SSE-LABEL: test_mul8x8_f32:
; SSE:       # %bb.0: # %entry
; SSE-NEXT:    subq $120, %rsp
; SSE-NEXT:    movaps %xmm5, %xmm11
; SSE-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movaps %xmm1, %xmm9
; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movq %rdi, %rax
; SSE-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm8
; SSE-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm13
; SSE-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm14
; SSE-NEXT:    movaps %xmm14, %xmm15
; SSE-NEXT:    shufps {{.*#+}} xmm15 = xmm15[0,0],xmm14[0,0]
; SSE-NEXT:    movaps %xmm1, %xmm5
; SSE-NEXT:    mulps %xmm15, %xmm5
; SSE-NEXT:    mulps %xmm0, %xmm15
; SSE-NEXT:    movaps %xmm14, %xmm0
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm14[1,1]
; SSE-NEXT:    movaps %xmm3, %xmm10
; SSE-NEXT:    movaps %xmm3, %xmm12
; SSE-NEXT:    mulps %xmm0, %xmm10
; SSE-NEXT:    addps %xmm5, %xmm10
; SSE-NEXT:    mulps %xmm2, %xmm0
; SSE-NEXT:    addps %xmm15, %xmm0
; SSE-NEXT:    movaps %xmm14, %xmm1
; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,2],xmm14[2,2]
; SSE-NEXT:    movaps %xmm4, %xmm2
; SSE-NEXT:    movaps %xmm4, %xmm15
; SSE-NEXT:    mulps %xmm1, %xmm2
; SSE-NEXT:    addps %xmm0, %xmm2
; SSE-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm5
; SSE-NEXT:    mulps %xmm11, %xmm1
; SSE-NEXT:    addps %xmm10, %xmm1
; SSE-NEXT:    shufps {{.*#+}} xmm14 = xmm14[3,3,3,3]
; SSE-NEXT:    movaps %xmm7, %xmm3
; SSE-NEXT:    mulps %xmm14, %xmm3
; SSE-NEXT:    addps %xmm1, %xmm3
; SSE-NEXT:    mulps %xmm6, %xmm14
; SSE-NEXT:    addps %xmm2, %xmm14
; SSE-NEXT:    movaps %xmm5, %xmm1
; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm5[0,0]
; SSE-NEXT:    movaps %xmm13, %xmm2
; SSE-NEXT:    mulps %xmm1, %xmm2
; SSE-NEXT:    addps %xmm14, %xmm2
; SSE-NEXT:    mulps {{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT:    addps %xmm3, %xmm1
; SSE-NEXT:    movaps %xmm5, %xmm0
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm5[1,1]
; SSE-NEXT:    movaps %xmm8, %xmm3
; SSE-NEXT:    mulps %xmm0, %xmm3
; SSE-NEXT:    addps %xmm1, %xmm3
; SSE-NEXT:    mulps {{[0-9]+}}(%rsp), %xmm0
; SSE-NEXT:    addps %xmm2, %xmm0
; SSE-NEXT:    movaps %xmm5, %xmm1
; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,2],xmm5[2,2]
; SSE-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm2
; SSE-NEXT:    mulps %xmm1, %xmm2
; SSE-NEXT:    addps %xmm0, %xmm2
; SSE-NEXT:    mulps {{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT:    addps %xmm3, %xmm1
; SSE-NEXT:    shufps {{.*#+}} xmm5 = xmm5[3,3,3,3]
; SSE-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
; SSE-NEXT:    mulps %xmm5, %xmm0
; SSE-NEXT:    addps %xmm1, %xmm0
; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    mulps {{[0-9]+}}(%rsp), %xmm5
; SSE-NEXT:    addps %xmm2, %xmm5
; SSE-NEXT:    movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
; SSE-NEXT:    movaps %xmm0, %xmm1
; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0]
; SSE-NEXT:    movaps %xmm9, %xmm2
; SSE-NEXT:    mulps %xmm1, %xmm2
; SSE-NEXT:    movaps %xmm0, %xmm3
; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[1,1]
; SSE-NEXT:    movaps %xmm12, %xmm4
; SSE-NEXT:    mulps %xmm3, %xmm4
; SSE-NEXT:    addps %xmm2, %xmm4
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
; SSE-NEXT:    mulps %xmm5, %xmm1
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
; SSE-NEXT:    mulps %xmm13, %xmm3
; SSE-NEXT:    addps %xmm1, %xmm3
; SSE-NEXT:    movaps %xmm0, %xmm1
; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,2],xmm0[2,2]
; SSE-NEXT:    movaps %xmm15, %xmm2
; SSE-NEXT:    mulps %xmm1, %xmm2
; SSE-NEXT:    addps %xmm3, %xmm2
; SSE-NEXT:    movaps %xmm11, %xmm8
; SSE-NEXT:    mulps %xmm11, %xmm1
; SSE-NEXT:    addps %xmm4, %xmm1
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE-NEXT:    movaps %xmm7, %xmm3
; SSE-NEXT:    mulps %xmm0, %xmm3
; SSE-NEXT:    addps %xmm1, %xmm3
; SSE-NEXT:    movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    mulps %xmm6, %xmm0
; SSE-NEXT:    addps %xmm2, %xmm0
; SSE-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm4
; SSE-NEXT:    movaps %xmm4, %xmm1
; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm4[0,0]
; SSE-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm14
; SSE-NEXT:    movaps %xmm14, %xmm2
; SSE-NEXT:    mulps %xmm1, %xmm2
; SSE-NEXT:    addps %xmm0, %xmm2
; SSE-NEXT:    mulps {{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT:    addps %xmm3, %xmm1
; SSE-NEXT:    movaps %xmm4, %xmm0
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm4[1,1]
; SSE-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm11
; SSE-NEXT:    movaps %xmm11, %xmm3
; SSE-NEXT:    mulps %xmm0, %xmm3
; SSE-NEXT:    addps %xmm1, %xmm3
; SSE-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT:    mulps %xmm1, %xmm0
; SSE-NEXT:    addps %xmm2, %xmm0
; SSE-NEXT:    movaps %xmm4, %xmm1
; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,2],xmm4[2,2]
; SSE-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm2
; SSE-NEXT:    mulps %xmm1, %xmm2
; SSE-NEXT:    addps %xmm0, %xmm2
; SSE-NEXT:    mulps {{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT:    addps %xmm3, %xmm1
; SSE-NEXT:    shufps {{.*#+}} xmm4 = xmm4[3,3,3,3]
; SSE-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
; SSE-NEXT:    mulps %xmm4, %xmm0
; SSE-NEXT:    addps %xmm1, %xmm0
; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    mulps {{[0-9]+}}(%rsp), %xmm4
; SSE-NEXT:    addps %xmm2, %xmm4
; SSE-NEXT:    movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
; SSE-NEXT:    movaps %xmm0, %xmm1
; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0]
; SSE-NEXT:    movaps %xmm9, %xmm2
; SSE-NEXT:    mulps %xmm1, %xmm2
; SSE-NEXT:    movaps %xmm0, %xmm3
; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[1,1]
; SSE-NEXT:    movaps %xmm12, %xmm4
; SSE-NEXT:    mulps %xmm3, %xmm4
; SSE-NEXT:    addps %xmm2, %xmm4
; SSE-NEXT:    mulps %xmm5, %xmm1
; SSE-NEXT:    movaps %xmm5, %xmm10
; SSE-NEXT:    mulps %xmm13, %xmm3
; SSE-NEXT:    addps %xmm1, %xmm3
; SSE-NEXT:    movaps %xmm0, %xmm1
; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,2],xmm0[2,2]
; SSE-NEXT:    movaps %xmm15, %xmm2
; SSE-NEXT:    movaps %xmm15, %xmm5
; SSE-NEXT:    mulps %xmm1, %xmm2
; SSE-NEXT:    addps %xmm3, %xmm2
; SSE-NEXT:    mulps %xmm8, %xmm1
; SSE-NEXT:    addps %xmm4, %xmm1
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE-NEXT:    movaps %xmm7, %xmm3
; SSE-NEXT:    mulps %xmm0, %xmm3
; SSE-NEXT:    addps %xmm1, %xmm3
; SSE-NEXT:    mulps %xmm6, %xmm0
; SSE-NEXT:    addps %xmm2, %xmm0
; SSE-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm4
; SSE-NEXT:    movaps %xmm4, %xmm1
; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm4[0,0]
; SSE-NEXT:    movaps %xmm14, %xmm2
; SSE-NEXT:    mulps %xmm1, %xmm2
; SSE-NEXT:    addps %xmm0, %xmm2
; SSE-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm14
; SSE-NEXT:    mulps %xmm14, %xmm1
; SSE-NEXT:    addps %xmm3, %xmm1
; SSE-NEXT:    movaps %xmm4, %xmm0
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm4[1,1]
; SSE-NEXT:    movaps %xmm11, %xmm3
; SSE-NEXT:    mulps %xmm0, %xmm3
; SSE-NEXT:    addps %xmm1, %xmm3
; SSE-NEXT:    mulps {{[0-9]+}}(%rsp), %xmm0
; SSE-NEXT:    addps %xmm2, %xmm0
; SSE-NEXT:    movaps %xmm4, %xmm1
; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,2],xmm4[2,2]
; SSE-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm2
; SSE-NEXT:    mulps %xmm1, %xmm2
; SSE-NEXT:    addps %xmm0, %xmm2
; SSE-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm11
; SSE-NEXT:    mulps %xmm11, %xmm1
; SSE-NEXT:    addps %xmm3, %xmm1
; SSE-NEXT:    shufps {{.*#+}} xmm4 = xmm4[3,3,3,3]
; SSE-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
; SSE-NEXT:    mulps %xmm4, %xmm0
; SSE-NEXT:    addps %xmm1, %xmm0
; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
; SSE-NEXT:    mulps %xmm0, %xmm4
; SSE-NEXT:    addps %xmm2, %xmm4
; SSE-NEXT:    movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
; SSE-NEXT:    movaps %xmm0, %xmm1
; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0]
; SSE-NEXT:    movaps %xmm9, %xmm2
; SSE-NEXT:    mulps %xmm1, %xmm2
; SSE-NEXT:    movaps %xmm0, %xmm3
; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[1,1]
; SSE-NEXT:    movaps %xmm12, %xmm4
; SSE-NEXT:    mulps %xmm3, %xmm4
; SSE-NEXT:    addps %xmm2, %xmm4
; SSE-NEXT:    movaps %xmm10, %xmm15
; SSE-NEXT:    mulps %xmm10, %xmm1
; SSE-NEXT:    mulps %xmm13, %xmm3
; SSE-NEXT:    addps %xmm1, %xmm3
; SSE-NEXT:    movaps %xmm0, %xmm1
; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,2],xmm0[2,2]
; SSE-NEXT:    movaps %xmm5, %xmm2
; SSE-NEXT:    mulps %xmm1, %xmm2
; SSE-NEXT:    addps %xmm3, %xmm2
; SSE-NEXT:    mulps %xmm8, %xmm1
; SSE-NEXT:    addps %xmm4, %xmm1
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE-NEXT:    movaps %xmm7, %xmm4
; SSE-NEXT:    movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movaps %xmm7, %xmm3
; SSE-NEXT:    mulps %xmm0, %xmm3
; SSE-NEXT:    addps %xmm1, %xmm3
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
; SSE-NEXT:    mulps %xmm6, %xmm0
; SSE-NEXT:    addps %xmm2, %xmm0
; SSE-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm10
; SSE-NEXT:    movaps %xmm10, %xmm1
; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm10[0,0]
; SSE-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm2
; SSE-NEXT:    mulps %xmm1, %xmm2
; SSE-NEXT:    addps %xmm0, %xmm2
; SSE-NEXT:    mulps %xmm14, %xmm1
; SSE-NEXT:    addps %xmm3, %xmm1
; SSE-NEXT:    movaps %xmm10, %xmm0
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm10[1,1]
; SSE-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm3
; SSE-NEXT:    mulps %xmm0, %xmm3
; SSE-NEXT:    addps %xmm1, %xmm3
; SSE-NEXT:    mulps {{[0-9]+}}(%rsp), %xmm0
; SSE-NEXT:    addps %xmm2, %xmm0
; SSE-NEXT:    movaps %xmm10, %xmm1
; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,2],xmm10[2,2]
; SSE-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm2
; SSE-NEXT:    mulps %xmm1, %xmm2
; SSE-NEXT:    addps %xmm0, %xmm2
; SSE-NEXT:    mulps %xmm11, %xmm1
; SSE-NEXT:    addps %xmm3, %xmm1
; SSE-NEXT:    shufps {{.*#+}} xmm10 = xmm10[3,3,3,3]
; SSE-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm11
; SSE-NEXT:    movaps %xmm11, %xmm0
; SSE-NEXT:    mulps %xmm10, %xmm0
; SSE-NEXT:    addps %xmm1, %xmm0
; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    mulps {{[0-9]+}}(%rsp), %xmm10
; SSE-NEXT:    addps %xmm2, %xmm10
; SSE-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
; SSE-NEXT:    movaps %xmm0, %xmm1
; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0]
; SSE-NEXT:    movaps %xmm9, %xmm2
; SSE-NEXT:    movaps %xmm9, %xmm14
; SSE-NEXT:    mulps %xmm1, %xmm2
; SSE-NEXT:    movaps %xmm0, %xmm3
; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[1,1]
; SSE-NEXT:    movaps %xmm12, %xmm7
; SSE-NEXT:    mulps %xmm3, %xmm7
; SSE-NEXT:    addps %xmm2, %xmm7
; SSE-NEXT:    mulps %xmm15, %xmm1
; SSE-NEXT:    mulps %xmm13, %xmm3
; SSE-NEXT:    addps %xmm1, %xmm3
; SSE-NEXT:    movaps %xmm0, %xmm1
; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,2],xmm0[2,2]
; SSE-NEXT:    movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movaps %xmm5, %xmm2
; SSE-NEXT:    mulps %xmm1, %xmm2
; SSE-NEXT:    addps %xmm3, %xmm2
; SSE-NEXT:    movaps %xmm8, %xmm9
; SSE-NEXT:    mulps %xmm8, %xmm1
; SSE-NEXT:    addps %xmm7, %xmm1
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE-NEXT:    movaps %xmm4, %xmm7
; SSE-NEXT:    mulps %xmm0, %xmm7
; SSE-NEXT:    addps %xmm1, %xmm7
; SSE-NEXT:    movaps %xmm6, %xmm3
; SSE-NEXT:    mulps %xmm6, %xmm0
; SSE-NEXT:    addps %xmm2, %xmm0
; SSE-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm4
; SSE-NEXT:    movaps %xmm4, %xmm1
; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm4[0,0]
; SSE-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm2
; SSE-NEXT:    mulps %xmm1, %xmm2
; SSE-NEXT:    addps %xmm0, %xmm2
; SSE-NEXT:    mulps {{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT:    addps %xmm7, %xmm1
; SSE-NEXT:    movaps %xmm4, %xmm0
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm4[1,1]
; SSE-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm7
; SSE-NEXT:    mulps %xmm0, %xmm7
; SSE-NEXT:    addps %xmm1, %xmm7
; SSE-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT:    mulps %xmm1, %xmm0
; SSE-NEXT:    addps %xmm2, %xmm0
; SSE-NEXT:    movaps %xmm4, %xmm1
; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,2],xmm4[2,2]
; SSE-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm2
; SSE-NEXT:    mulps %xmm1, %xmm2
; SSE-NEXT:    addps %xmm0, %xmm2
; SSE-NEXT:    mulps {{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT:    addps %xmm7, %xmm1
; SSE-NEXT:    shufps {{.*#+}} xmm4 = xmm4[3,3,3,3]
; SSE-NEXT:    movaps %xmm11, %xmm0
; SSE-NEXT:    mulps %xmm4, %xmm0
; SSE-NEXT:    addps %xmm1, %xmm0
; SSE-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
; SSE-NEXT:    mulps {{[0-9]+}}(%rsp), %xmm4
; SSE-NEXT:    addps %xmm2, %xmm4
; SSE-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
; SSE-NEXT:    movaps %xmm0, %xmm1
; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0]
; SSE-NEXT:    movaps %xmm14, %xmm6
; SSE-NEXT:    movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movaps %xmm14, %xmm2
; SSE-NEXT:    mulps %xmm1, %xmm2
; SSE-NEXT:    movaps %xmm0, %xmm14
; SSE-NEXT:    shufps {{.*#+}} xmm14 = xmm14[1,1],xmm0[1,1]
; SSE-NEXT:    movaps %xmm12, %xmm15
; SSE-NEXT:    movaps %xmm12, %xmm13
; SSE-NEXT:    movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    mulps %xmm14, %xmm15
; SSE-NEXT:    addps %xmm2, %xmm15
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
; SSE-NEXT:    mulps %xmm8, %xmm1
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
; SSE-NEXT:    mulps %xmm7, %xmm14
; SSE-NEXT:    addps %xmm1, %xmm14
; SSE-NEXT:    movaps %xmm0, %xmm1
; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,2],xmm0[2,2]
; SSE-NEXT:    movaps %xmm5, %xmm2
; SSE-NEXT:    mulps %xmm1, %xmm2
; SSE-NEXT:    addps %xmm14, %xmm2
; SSE-NEXT:    mulps %xmm9, %xmm1
; SSE-NEXT:    movaps %xmm9, %xmm11
; SSE-NEXT:    addps %xmm15, %xmm1
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
; SSE-NEXT:    movaps %xmm5, %xmm14
; SSE-NEXT:    mulps %xmm0, %xmm14
; SSE-NEXT:    addps %xmm1, %xmm14
; SSE-NEXT:    mulps %xmm3, %xmm0
; SSE-NEXT:    movaps %xmm3, %xmm12
; SSE-NEXT:    addps %xmm2, %xmm0
; SSE-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm3
; SSE-NEXT:    movaps %xmm3, %xmm1
; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm3[0,0]
; SSE-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm15
; SSE-NEXT:    mulps %xmm1, %xmm15
; SSE-NEXT:    addps %xmm0, %xmm15
; SSE-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
; SSE-NEXT:    mulps %xmm0, %xmm1
; SSE-NEXT:    addps %xmm14, %xmm1
; SSE-NEXT:    movaps %xmm3, %xmm0
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm3[1,1]
; SSE-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm14
; SSE-NEXT:    mulps %xmm0, %xmm14
; SSE-NEXT:    addps %xmm1, %xmm14
; SSE-NEXT:    mulps {{[0-9]+}}(%rsp), %xmm0
; SSE-NEXT:    addps %xmm15, %xmm0
; SSE-NEXT:    movaps %xmm3, %xmm1
; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,2],xmm3[2,2]
; SSE-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm15
; SSE-NEXT:    mulps %xmm1, %xmm15
; SSE-NEXT:    addps %xmm0, %xmm15
; SSE-NEXT:    mulps {{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT:    addps %xmm14, %xmm1
; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[3,3,3,3]
; SSE-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm14
; SSE-NEXT:    mulps %xmm3, %xmm14
; SSE-NEXT:    addps %xmm1, %xmm14
; SSE-NEXT:    mulps {{[0-9]+}}(%rsp), %xmm3
; SSE-NEXT:    addps %xmm15, %xmm3
; SSE-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
; SSE-NEXT:    movaps %xmm0, %xmm1
; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0]
; SSE-NEXT:    mulps %xmm1, %xmm6
; SSE-NEXT:    movaps %xmm0, %xmm15
; SSE-NEXT:    shufps {{.*#+}} xmm15 = xmm15[1,1],xmm0[1,1]
; SSE-NEXT:    mulps %xmm15, %xmm13
; SSE-NEXT:    addps %xmm6, %xmm13
; SSE-NEXT:    mulps %xmm8, %xmm1
; SSE-NEXT:    mulps %xmm7, %xmm15
; SSE-NEXT:    addps %xmm1, %xmm15
; SSE-NEXT:    movaps %xmm0, %xmm1
; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,2],xmm0[2,2]
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
; SSE-NEXT:    movaps %xmm6, %xmm2
; SSE-NEXT:    mulps %xmm1, %xmm2
; SSE-NEXT:    addps %xmm15, %xmm2
; SSE-NEXT:    mulps %xmm9, %xmm1
; SSE-NEXT:    addps %xmm13, %xmm1
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE-NEXT:    movaps %xmm5, %xmm9
; SSE-NEXT:    mulps %xmm0, %xmm9
; SSE-NEXT:    addps %xmm1, %xmm9
; SSE-NEXT:    mulps %xmm12, %xmm0
; SSE-NEXT:    movaps %xmm12, %xmm5
; SSE-NEXT:    addps %xmm2, %xmm0
; SSE-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT:    movaps %xmm1, %xmm2
; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[0,0]
; SSE-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm15
; SSE-NEXT:    mulps %xmm2, %xmm15
; SSE-NEXT:    addps %xmm0, %xmm15
; SSE-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
; SSE-NEXT:    mulps %xmm0, %xmm2
; SSE-NEXT:    addps %xmm9, %xmm2
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[1,1]
; SSE-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm9
; SSE-NEXT:    mulps %xmm0, %xmm9
; SSE-NEXT:    addps %xmm2, %xmm9
; SSE-NEXT:    mulps {{[0-9]+}}(%rsp), %xmm0
; SSE-NEXT:    addps %xmm15, %xmm0
; SSE-NEXT:    movaps %xmm1, %xmm2
; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[2,2],xmm1[2,2]
; SSE-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm13
; SSE-NEXT:    mulps %xmm2, %xmm13
; SSE-NEXT:    addps %xmm0, %xmm13
; SSE-NEXT:    mulps {{[0-9]+}}(%rsp), %xmm2
; SSE-NEXT:    addps %xmm9, %xmm2
; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; SSE-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm15
; SSE-NEXT:    mulps %xmm1, %xmm15
; SSE-NEXT:    addps %xmm2, %xmm15
; SSE-NEXT:    mulps {{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT:    addps %xmm13, %xmm1
; SSE-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
; SSE-NEXT:    movaps %xmm0, %xmm2
; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,0],xmm0[0,0]
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
; SSE-NEXT:    mulps %xmm2, %xmm13
; SSE-NEXT:    mulps %xmm8, %xmm2
; SSE-NEXT:    movaps %xmm0, %xmm9
; SSE-NEXT:    shufps {{.*#+}} xmm9 = xmm9[1,1],xmm0[1,1]
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
; SSE-NEXT:    mulps %xmm9, %xmm8
; SSE-NEXT:    addps %xmm13, %xmm8
; SSE-NEXT:    mulps %xmm7, %xmm9
; SSE-NEXT:    addps %xmm2, %xmm9
; SSE-NEXT:    movaps %xmm0, %xmm2
; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[2,2],xmm0[2,2]
; SSE-NEXT:    mulps %xmm2, %xmm6
; SSE-NEXT:    addps %xmm9, %xmm6
; SSE-NEXT:    mulps %xmm11, %xmm2
; SSE-NEXT:    addps %xmm8, %xmm2
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
; SSE-NEXT:    mulps %xmm0, %xmm9
; SSE-NEXT:    addps %xmm2, %xmm9
; SSE-NEXT:    movaps %xmm9, %xmm12
; SSE-NEXT:    mulps %xmm5, %xmm0
; SSE-NEXT:    addps %xmm6, %xmm0
; SSE-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm9
; SSE-NEXT:    movaps %xmm9, %xmm2
; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,0],xmm9[0,0]
; SSE-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm13
; SSE-NEXT:    mulps %xmm2, %xmm13
; SSE-NEXT:    addps %xmm0, %xmm13
; SSE-NEXT:    mulps {{[0-9]+}}(%rsp), %xmm2
; SSE-NEXT:    addps %xmm12, %xmm2
; SSE-NEXT:    movaps %xmm9, %xmm0
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm9[1,1]
; SSE-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm12
; SSE-NEXT:    mulps %xmm0, %xmm12
; SSE-NEXT:    addps %xmm2, %xmm12
; SSE-NEXT:    mulps {{[0-9]+}}(%rsp), %xmm0
; SSE-NEXT:    addps %xmm13, %xmm0
; SSE-NEXT:    movaps %xmm9, %xmm2
; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[2,2],xmm9[2,2]
; SSE-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm5
; SSE-NEXT:    mulps %xmm2, %xmm5
; SSE-NEXT:    addps %xmm0, %xmm5
; SSE-NEXT:    mulps {{[0-9]+}}(%rsp), %xmm2
; SSE-NEXT:    addps %xmm12, %xmm2
; SSE-NEXT:    shufps {{.*#+}} xmm9 = xmm9[3,3,3,3]
; SSE-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
; SSE-NEXT:    mulps %xmm9, %xmm0
; SSE-NEXT:    addps %xmm2, %xmm0
; SSE-NEXT:    mulps {{[0-9]+}}(%rsp), %xmm9
; SSE-NEXT:    addps %xmm5, %xmm9
; SSE-NEXT:    movaps %xmm0, 240(%rdi)
; SSE-NEXT:    movaps %xmm9, 224(%rdi)
; SSE-NEXT:    movaps %xmm15, 208(%rdi)
; SSE-NEXT:    movaps %xmm1, 192(%rdi)
; SSE-NEXT:    movaps %xmm14, 176(%rdi)
; SSE-NEXT:    movaps %xmm3, 160(%rdi)
; SSE-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, 144(%rdi)
; SSE-NEXT:    movaps %xmm4, 128(%rdi)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, 112(%rdi)
; SSE-NEXT:    movaps %xmm10, 96(%rdi)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, 80(%rdi)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, 64(%rdi)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, 48(%rdi)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, 32(%rdi)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, 16(%rdi)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, (%rdi)
; SSE-NEXT:    addq $120, %rsp
; SSE-NEXT:    retq
;
; AVX1-LABEL: test_mul8x8_f32:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    pushq %rbp
; AVX1-NEXT:    movq %rsp, %rbp
; AVX1-NEXT:    andq $-32, %rsp
; AVX1-NEXT:    subq $32, %rsp
; AVX1-NEXT:    movq %rdi, %rax
; AVX1-NEXT:    vbroadcastss 16(%rbp), %ymm8
; AVX1-NEXT:    vmulps %ymm0, %ymm8, %ymm8
; AVX1-NEXT:    vbroadcastss 20(%rbp), %ymm9
; AVX1-NEXT:    vmulps %ymm1, %ymm9, %ymm9
; AVX1-NEXT:    vaddps %ymm9, %ymm8, %ymm8
; AVX1-NEXT:    vbroadcastss 24(%rbp), %ymm9
; AVX1-NEXT:    vmulps %ymm2, %ymm9, %ymm9
; AVX1-NEXT:    vaddps %ymm9, %ymm8, %ymm8
; AVX1-NEXT:    vbroadcastss 28(%rbp), %ymm9
; AVX1-NEXT:    vmulps %ymm3, %ymm9, %ymm9
; AVX1-NEXT:    vaddps %ymm9, %ymm8, %ymm8
; AVX1-NEXT:    vbroadcastss 32(%rbp), %ymm9
; AVX1-NEXT:    vmulps %ymm4, %ymm9, %ymm9
; AVX1-NEXT:    vaddps %ymm9, %ymm8, %ymm8
; AVX1-NEXT:    vbroadcastss 36(%rbp), %ymm9
; AVX1-NEXT:    vmulps %ymm5, %ymm9, %ymm9
; AVX1-NEXT:    vaddps %ymm9, %ymm8, %ymm8
; AVX1-NEXT:    vbroadcastss 40(%rbp), %ymm9
; AVX1-NEXT:    vmulps %ymm6, %ymm9, %ymm9
; AVX1-NEXT:    vaddps %ymm9, %ymm8, %ymm8
; AVX1-NEXT:    vbroadcastss 44(%rbp), %ymm9
; AVX1-NEXT:    vmulps %ymm7, %ymm9, %ymm9
; AVX1-NEXT:    vaddps %ymm9, %ymm8, %ymm8
; AVX1-NEXT:    vbroadcastss 48(%rbp), %ymm9
; AVX1-NEXT:    vmulps %ymm0, %ymm9, %ymm9
; AVX1-NEXT:    vbroadcastss 52(%rbp), %ymm10
; AVX1-NEXT:    vmulps %ymm1, %ymm10, %ymm10
; AVX1-NEXT:    vaddps %ymm10, %ymm9, %ymm9
; AVX1-NEXT:    vbroadcastss 56(%rbp), %ymm10
; AVX1-NEXT:    vmulps %ymm2, %ymm10, %ymm10
; AVX1-NEXT:    vaddps %ymm10, %ymm9, %ymm9
; AVX1-NEXT:    vbroadcastss 60(%rbp), %ymm10
; AVX1-NEXT:    vmulps %ymm3, %ymm10, %ymm10
; AVX1-NEXT:    vaddps %ymm10, %ymm9, %ymm9
; AVX1-NEXT:    vbroadcastss 64(%rbp), %ymm10
; AVX1-NEXT:    vmulps %ymm4, %ymm10, %ymm10
; AVX1-NEXT:    vaddps %ymm10, %ymm9, %ymm9
; AVX1-NEXT:    vbroadcastss 68(%rbp), %ymm10
; AVX1-NEXT:    vmulps %ymm5, %ymm10, %ymm10
; AVX1-NEXT:    vaddps %ymm10, %ymm9, %ymm9
; AVX1-NEXT:    vbroadcastss 72(%rbp), %ymm10
; AVX1-NEXT:    vmulps %ymm6, %ymm10, %ymm10
; AVX1-NEXT:    vaddps %ymm10, %ymm9, %ymm9
; AVX1-NEXT:    vbroadcastss 76(%rbp), %ymm10
; AVX1-NEXT:    vmulps %ymm7, %ymm10, %ymm10
; AVX1-NEXT:    vaddps %ymm10, %ymm9, %ymm9
; AVX1-NEXT:    vbroadcastss 80(%rbp), %ymm10
; AVX1-NEXT:    vmulps %ymm0, %ymm10, %ymm10
; AVX1-NEXT:    vbroadcastss 84(%rbp), %ymm11
; AVX1-NEXT:    vmulps %ymm1, %ymm11, %ymm11
; AVX1-NEXT:    vaddps %ymm11, %ymm10, %ymm10
; AVX1-NEXT:    vbroadcastss 88(%rbp), %ymm11
; AVX1-NEXT:    vmulps %ymm2, %ymm11, %ymm11
; AVX1-NEXT:    vaddps %ymm11, %ymm10, %ymm10
; AVX1-NEXT:    vbroadcastss 92(%rbp), %ymm11
; AVX1-NEXT:    vmulps %ymm3, %ymm11, %ymm11
; AVX1-NEXT:    vaddps %ymm11, %ymm10, %ymm10
; AVX1-NEXT:    vbroadcastss 96(%rbp), %ymm11
; AVX1-NEXT:    vmulps %ymm4, %ymm11, %ymm11
; AVX1-NEXT:    vaddps %ymm11, %ymm10, %ymm10
; AVX1-NEXT:    vbroadcastss 100(%rbp), %ymm11
; AVX1-NEXT:    vmulps %ymm5, %ymm11, %ymm11
; AVX1-NEXT:    vaddps %ymm11, %ymm10, %ymm10
; AVX1-NEXT:    vbroadcastss 104(%rbp), %ymm11
; AVX1-NEXT:    vmulps %ymm6, %ymm11, %ymm11
; AVX1-NEXT:    vaddps %ymm11, %ymm10, %ymm10
; AVX1-NEXT:    vbroadcastss 108(%rbp), %ymm11
; AVX1-NEXT:    vmulps %ymm7, %ymm11, %ymm11
; AVX1-NEXT:    vaddps %ymm11, %ymm10, %ymm10
; AVX1-NEXT:    vbroadcastss 112(%rbp), %ymm11
; AVX1-NEXT:    vmulps %ymm0, %ymm11, %ymm11
; AVX1-NEXT:    vbroadcastss 116(%rbp), %ymm12
; AVX1-NEXT:    vmulps %ymm1, %ymm12, %ymm12
; AVX1-NEXT:    vaddps %ymm12, %ymm11, %ymm11
; AVX1-NEXT:    vbroadcastss 120(%rbp), %ymm12
; AVX1-NEXT:    vmulps %ymm2, %ymm12, %ymm12
; AVX1-NEXT:    vaddps %ymm12, %ymm11, %ymm11
; AVX1-NEXT:    vbroadcastss 124(%rbp), %ymm12
; AVX1-NEXT:    vmulps %ymm3, %ymm12, %ymm12
; AVX1-NEXT:    vaddps %ymm12, %ymm11, %ymm11
; AVX1-NEXT:    vbroadcastss 128(%rbp), %ymm12
; AVX1-NEXT:    vmulps %ymm4, %ymm12, %ymm12
; AVX1-NEXT:    vaddps %ymm12, %ymm11, %ymm11
; AVX1-NEXT:    vbroadcastss 132(%rbp), %ymm12
; AVX1-NEXT:    vmulps %ymm5, %ymm12, %ymm12
; AVX1-NEXT:    vaddps %ymm12, %ymm11, %ymm11
; AVX1-NEXT:    vbroadcastss 136(%rbp), %ymm12
; AVX1-NEXT:    vmulps %ymm6, %ymm12, %ymm12
; AVX1-NEXT:    vaddps %ymm12, %ymm11, %ymm11
; AVX1-NEXT:    vbroadcastss 140(%rbp), %ymm12
; AVX1-NEXT:    vmulps %ymm7, %ymm12, %ymm12
; AVX1-NEXT:    vaddps %ymm12, %ymm11, %ymm11
; AVX1-NEXT:    vbroadcastss 144(%rbp), %ymm12
; AVX1-NEXT:    vmulps %ymm0, %ymm12, %ymm12
; AVX1-NEXT:    vbroadcastss 148(%rbp), %ymm13
; AVX1-NEXT:    vmulps %ymm1, %ymm13, %ymm13
; AVX1-NEXT:    vaddps %ymm13, %ymm12, %ymm12
; AVX1-NEXT:    vbroadcastss 152(%rbp), %ymm13
; AVX1-NEXT:    vmulps %ymm2, %ymm13, %ymm13
; AVX1-NEXT:    vaddps %ymm13, %ymm12, %ymm12
; AVX1-NEXT:    vbroadcastss 156(%rbp), %ymm13
; AVX1-NEXT:    vmulps %ymm3, %ymm13, %ymm13
; AVX1-NEXT:    vaddps %ymm13, %ymm12, %ymm12
; AVX1-NEXT:    vbroadcastss 160(%rbp), %ymm13
; AVX1-NEXT:    vmulps %ymm4, %ymm13, %ymm13
; AVX1-NEXT:    vaddps %ymm13, %ymm12, %ymm12
; AVX1-NEXT:    vbroadcastss 164(%rbp), %ymm13
; AVX1-NEXT:    vmulps %ymm5, %ymm13, %ymm13
; AVX1-NEXT:    vaddps %ymm13, %ymm12, %ymm12
; AVX1-NEXT:    vbroadcastss 168(%rbp), %ymm13
; AVX1-NEXT:    vmulps %ymm6, %ymm13, %ymm13
; AVX1-NEXT:    vaddps %ymm13, %ymm12, %ymm12
; AVX1-NEXT:    vbroadcastss 172(%rbp), %ymm13
; AVX1-NEXT:    vmulps %ymm7, %ymm13, %ymm13
; AVX1-NEXT:    vaddps %ymm13, %ymm12, %ymm12
; AVX1-NEXT:    vbroadcastss 176(%rbp), %ymm13
; AVX1-NEXT:    vmulps %ymm0, %ymm13, %ymm13
; AVX1-NEXT:    vbroadcastss 180(%rbp), %ymm14
; AVX1-NEXT:    vmulps %ymm1, %ymm14, %ymm14
; AVX1-NEXT:    vaddps %ymm14, %ymm13, %ymm13
; AVX1-NEXT:    vbroadcastss 184(%rbp), %ymm14
; AVX1-NEXT:    vmulps %ymm2, %ymm14, %ymm14
; AVX1-NEXT:    vaddps %ymm14, %ymm13, %ymm13
; AVX1-NEXT:    vbroadcastss 188(%rbp), %ymm14
; AVX1-NEXT:    vmulps %ymm3, %ymm14, %ymm14
; AVX1-NEXT:    vaddps %ymm14, %ymm13, %ymm13
; AVX1-NEXT:    vbroadcastss 192(%rbp), %ymm14
; AVX1-NEXT:    vmulps %ymm4, %ymm14, %ymm14
; AVX1-NEXT:    vaddps %ymm14, %ymm13, %ymm13
; AVX1-NEXT:    vbroadcastss 196(%rbp), %ymm14
; AVX1-NEXT:    vmulps %ymm5, %ymm14, %ymm14
; AVX1-NEXT:    vaddps %ymm14, %ymm13, %ymm13
; AVX1-NEXT:    vbroadcastss 200(%rbp), %ymm14
; AVX1-NEXT:    vmulps %ymm6, %ymm14, %ymm14
; AVX1-NEXT:    vaddps %ymm14, %ymm13, %ymm13
; AVX1-NEXT:    vbroadcastss 204(%rbp), %ymm14
; AVX1-NEXT:    vmulps %ymm7, %ymm14, %ymm14
; AVX1-NEXT:    vaddps %ymm14, %ymm13, %ymm13
; AVX1-NEXT:    vbroadcastss 208(%rbp), %ymm14
; AVX1-NEXT:    vmulps %ymm0, %ymm14, %ymm14
; AVX1-NEXT:    vbroadcastss 212(%rbp), %ymm15
; AVX1-NEXT:    vmulps %ymm1, %ymm15, %ymm15
; AVX1-NEXT:    vaddps %ymm15, %ymm14, %ymm14
; AVX1-NEXT:    vbroadcastss 216(%rbp), %ymm15
; AVX1-NEXT:    vmulps %ymm2, %ymm15, %ymm15
; AVX1-NEXT:    vaddps %ymm15, %ymm14, %ymm14
; AVX1-NEXT:    vbroadcastss 220(%rbp), %ymm15
; AVX1-NEXT:    vmulps %ymm3, %ymm15, %ymm15
; AVX1-NEXT:    vaddps %ymm15, %ymm14, %ymm14
; AVX1-NEXT:    vbroadcastss 224(%rbp), %ymm15
; AVX1-NEXT:    vmulps %ymm4, %ymm15, %ymm15
; AVX1-NEXT:    vaddps %ymm15, %ymm14, %ymm14
; AVX1-NEXT:    vbroadcastss 228(%rbp), %ymm15
; AVX1-NEXT:    vmulps %ymm5, %ymm15, %ymm15
; AVX1-NEXT:    vaddps %ymm15, %ymm14, %ymm14
; AVX1-NEXT:    vbroadcastss 232(%rbp), %ymm15
; AVX1-NEXT:    vmulps %ymm6, %ymm15, %ymm15
; AVX1-NEXT:    vaddps %ymm15, %ymm14, %ymm14
; AVX1-NEXT:    vbroadcastss 236(%rbp), %ymm15
; AVX1-NEXT:    vmulps %ymm7, %ymm15, %ymm15
; AVX1-NEXT:    vaddps %ymm15, %ymm14, %ymm14
; AVX1-NEXT:    vbroadcastss 240(%rbp), %ymm15
; AVX1-NEXT:    vmulps %ymm0, %ymm15, %ymm0
; AVX1-NEXT:    vbroadcastss 244(%rbp), %ymm15
; AVX1-NEXT:    vmulps %ymm1, %ymm15, %ymm1
; AVX1-NEXT:    vaddps %ymm1, %ymm0, %ymm0
; AVX1-NEXT:    vbroadcastss 248(%rbp), %ymm1
; AVX1-NEXT:    vmulps %ymm1, %ymm2, %ymm1
; AVX1-NEXT:    vaddps %ymm1, %ymm0, %ymm0
; AVX1-NEXT:    vbroadcastss 252(%rbp), %ymm1
; AVX1-NEXT:    vmulps %ymm1, %ymm3, %ymm1
; AVX1-NEXT:    vaddps %ymm1, %ymm0, %ymm0
; AVX1-NEXT:    vbroadcastss 256(%rbp), %ymm1
; AVX1-NEXT:    vmulps %ymm1, %ymm4, %ymm1
; AVX1-NEXT:    vaddps %ymm1, %ymm0, %ymm0
; AVX1-NEXT:    vbroadcastss 260(%rbp), %ymm1
; AVX1-NEXT:    vmulps %ymm1, %ymm5, %ymm1
; AVX1-NEXT:    vaddps %ymm1, %ymm0, %ymm0
; AVX1-NEXT:    vbroadcastss 264(%rbp), %ymm1
; AVX1-NEXT:    vmulps %ymm1, %ymm6, %ymm1
; AVX1-NEXT:    vaddps %ymm1, %ymm0, %ymm0
; AVX1-NEXT:    vbroadcastss 268(%rbp), %ymm1
; AVX1-NEXT:    vmulps %ymm1, %ymm7, %ymm1
; AVX1-NEXT:    vaddps %ymm1, %ymm0, %ymm0
; AVX1-NEXT:    vmovaps %ymm0, 224(%rdi)
; AVX1-NEXT:    vmovaps %ymm14, 192(%rdi)
; AVX1-NEXT:    vmovaps %ymm13, 160(%rdi)
; AVX1-NEXT:    vmovaps %ymm12, 128(%rdi)
; AVX1-NEXT:    vmovaps %ymm11, 96(%rdi)
; AVX1-NEXT:    vmovaps %ymm10, 64(%rdi)
; AVX1-NEXT:    vmovaps %ymm9, 32(%rdi)
; AVX1-NEXT:    vmovaps %ymm8, (%rdi)
; AVX1-NEXT:    movq %rbp, %rsp
; AVX1-NEXT:    popq %rbp
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_mul8x8_f32:
; AVX2:       # %bb.0: # %entry
; AVX2-NEXT:    pushq %rbp
; AVX2-NEXT:    movq %rsp, %rbp
; AVX2-NEXT:    andq $-32, %rsp
; AVX2-NEXT:    subq $32, %rsp
; AVX2-NEXT:    movq %rdi, %rax
; AVX2-NEXT:    vbroadcastss 16(%rbp), %ymm8
; AVX2-NEXT:    vmulps %ymm0, %ymm8, %ymm8
; AVX2-NEXT:    vbroadcastss 20(%rbp), %ymm9
; AVX2-NEXT:    vmulps %ymm1, %ymm9, %ymm9
; AVX2-NEXT:    vaddps %ymm9, %ymm8, %ymm8
; AVX2-NEXT:    vbroadcastss 24(%rbp), %ymm9
; AVX2-NEXT:    vmulps %ymm2, %ymm9, %ymm9
; AVX2-NEXT:    vaddps %ymm9, %ymm8, %ymm8
; AVX2-NEXT:    vbroadcastss 28(%rbp), %ymm9
; AVX2-NEXT:    vmulps %ymm3, %ymm9, %ymm9
; AVX2-NEXT:    vaddps %ymm9, %ymm8, %ymm8
; AVX2-NEXT:    vbroadcastss 32(%rbp), %ymm9
; AVX2-NEXT:    vmulps %ymm4, %ymm9, %ymm9
; AVX2-NEXT:    vaddps %ymm9, %ymm8, %ymm8
; AVX2-NEXT:    vbroadcastss 36(%rbp), %ymm9
; AVX2-NEXT:    vmulps %ymm5, %ymm9, %ymm9
; AVX2-NEXT:    vaddps %ymm9, %ymm8, %ymm8
; AVX2-NEXT:    vbroadcastss 40(%rbp), %ymm9
; AVX2-NEXT:    vmulps %ymm6, %ymm9, %ymm9
; AVX2-NEXT:    vaddps %ymm9, %ymm8, %ymm8
; AVX2-NEXT:    vbroadcastss 44(%rbp), %ymm9
; AVX2-NEXT:    vmulps %ymm7, %ymm9, %ymm9
; AVX2-NEXT:    vaddps %ymm9, %ymm8, %ymm8
; AVX2-NEXT:    vbroadcastss 48(%rbp), %ymm9
; AVX2-NEXT:    vmulps %ymm0, %ymm9, %ymm9
; AVX2-NEXT:    vbroadcastss 52(%rbp), %ymm10
; AVX2-NEXT:    vmulps %ymm1, %ymm10, %ymm10
; AVX2-NEXT:    vaddps %ymm10, %ymm9, %ymm9
; AVX2-NEXT:    vbroadcastss 56(%rbp), %ymm10
; AVX2-NEXT:    vmulps %ymm2, %ymm10, %ymm10
; AVX2-NEXT:    vaddps %ymm10, %ymm9, %ymm9
; AVX2-NEXT:    vbroadcastss 60(%rbp), %ymm10
; AVX2-NEXT:    vmulps %ymm3, %ymm10, %ymm10
; AVX2-NEXT:    vaddps %ymm10, %ymm9, %ymm9
; AVX2-NEXT:    vbroadcastss 64(%rbp), %ymm10
; AVX2-NEXT:    vmulps %ymm4, %ymm10, %ymm10
; AVX2-NEXT:    vaddps %ymm10, %ymm9, %ymm9
; AVX2-NEXT:    vbroadcastss 68(%rbp), %ymm10
; AVX2-NEXT:    vmulps %ymm5, %ymm10, %ymm10
; AVX2-NEXT:    vaddps %ymm10, %ymm9, %ymm9
; AVX2-NEXT:    vbroadcastss 72(%rbp), %ymm10
; AVX2-NEXT:    vmulps %ymm6, %ymm10, %ymm10
; AVX2-NEXT:    vaddps %ymm10, %ymm9, %ymm9
; AVX2-NEXT:    vbroadcastss 76(%rbp), %ymm10
; AVX2-NEXT:    vmulps %ymm7, %ymm10, %ymm10
; AVX2-NEXT:    vaddps %ymm10, %ymm9, %ymm9
; AVX2-NEXT:    vbroadcastss 80(%rbp), %ymm10
; AVX2-NEXT:    vmulps %ymm0, %ymm10, %ymm10
; AVX2-NEXT:    vbroadcastss 84(%rbp), %ymm11
; AVX2-NEXT:    vmulps %ymm1, %ymm11, %ymm11
; AVX2-NEXT:    vaddps %ymm11, %ymm10, %ymm10
; AVX2-NEXT:    vbroadcastss 88(%rbp), %ymm11
; AVX2-NEXT:    vmulps %ymm2, %ymm11, %ymm11
; AVX2-NEXT:    vaddps %ymm11, %ymm10, %ymm10
; AVX2-NEXT:    vbroadcastss 92(%rbp), %ymm11
; AVX2-NEXT:    vmulps %ymm3, %ymm11, %ymm11
; AVX2-NEXT:    vaddps %ymm11, %ymm10, %ymm10
; AVX2-NEXT:    vbroadcastss 96(%rbp), %ymm11
; AVX2-NEXT:    vmulps %ymm4, %ymm11, %ymm11
; AVX2-NEXT:    vaddps %ymm11, %ymm10, %ymm10
; AVX2-NEXT:    vbroadcastss 100(%rbp), %ymm11
; AVX2-NEXT:    vmulps %ymm5, %ymm11, %ymm11
; AVX2-NEXT:    vaddps %ymm11, %ymm10, %ymm10
; AVX2-NEXT:    vbroadcastss 104(%rbp), %ymm11
; AVX2-NEXT:    vmulps %ymm6, %ymm11, %ymm11
; AVX2-NEXT:    vaddps %ymm11, %ymm10, %ymm10
; AVX2-NEXT:    vbroadcastss 108(%rbp), %ymm11
; AVX2-NEXT:    vmulps %ymm7, %ymm11, %ymm11
; AVX2-NEXT:    vaddps %ymm11, %ymm10, %ymm10
; AVX2-NEXT:    vbroadcastss 112(%rbp), %ymm11
; AVX2-NEXT:    vmulps %ymm0, %ymm11, %ymm11
; AVX2-NEXT:    vbroadcastss 116(%rbp), %ymm12
; AVX2-NEXT:    vmulps %ymm1, %ymm12, %ymm12
; AVX2-NEXT:    vaddps %ymm12, %ymm11, %ymm11
; AVX2-NEXT:    vbroadcastss 120(%rbp), %ymm12
; AVX2-NEXT:    vmulps %ymm2, %ymm12, %ymm12
; AVX2-NEXT:    vaddps %ymm12, %ymm11, %ymm11
; AVX2-NEXT:    vbroadcastss 124(%rbp), %ymm12
; AVX2-NEXT:    vmulps %ymm3, %ymm12, %ymm12
; AVX2-NEXT:    vaddps %ymm12, %ymm11, %ymm11
; AVX2-NEXT:    vbroadcastss 128(%rbp), %ymm12
; AVX2-NEXT:    vmulps %ymm4, %ymm12, %ymm12
; AVX2-NEXT:    vaddps %ymm12, %ymm11, %ymm11
; AVX2-NEXT:    vbroadcastss 132(%rbp), %ymm12
; AVX2-NEXT:    vmulps %ymm5, %ymm12, %ymm12
; AVX2-NEXT:    vaddps %ymm12, %ymm11, %ymm11
; AVX2-NEXT:    vbroadcastss 136(%rbp), %ymm12
; AVX2-NEXT:    vmulps %ymm6, %ymm12, %ymm12
; AVX2-NEXT:    vaddps %ymm12, %ymm11, %ymm11
; AVX2-NEXT:    vbroadcastss 140(%rbp), %ymm12
; AVX2-NEXT:    vmulps %ymm7, %ymm12, %ymm12
; AVX2-NEXT:    vaddps %ymm12, %ymm11, %ymm11
; AVX2-NEXT:    vbroadcastss 144(%rbp), %ymm12
; AVX2-NEXT:    vmulps %ymm0, %ymm12, %ymm12
; AVX2-NEXT:    vbroadcastss 148(%rbp), %ymm13
; AVX2-NEXT:    vmulps %ymm1, %ymm13, %ymm13
; AVX2-NEXT:    vaddps %ymm13, %ymm12, %ymm12
; AVX2-NEXT:    vbroadcastss 152(%rbp), %ymm13
; AVX2-NEXT:    vmulps %ymm2, %ymm13, %ymm13
; AVX2-NEXT:    vaddps %ymm13, %ymm12, %ymm12
; AVX2-NEXT:    vbroadcastss 156(%rbp), %ymm13
; AVX2-NEXT:    vmulps %ymm3, %ymm13, %ymm13
; AVX2-NEXT:    vaddps %ymm13, %ymm12, %ymm12
; AVX2-NEXT:    vbroadcastss 160(%rbp), %ymm13
; AVX2-NEXT:    vmulps %ymm4, %ymm13, %ymm13
; AVX2-NEXT:    vaddps %ymm13, %ymm12, %ymm12
; AVX2-NEXT:    vbroadcastss 164(%rbp), %ymm13
; AVX2-NEXT:    vmulps %ymm5, %ymm13, %ymm13
; AVX2-NEXT:    vaddps %ymm13, %ymm12, %ymm12
; AVX2-NEXT:    vbroadcastss 168(%rbp), %ymm13
; AVX2-NEXT:    vmulps %ymm6, %ymm13, %ymm13
; AVX2-NEXT:    vaddps %ymm13, %ymm12, %ymm12
; AVX2-NEXT:    vbroadcastss 172(%rbp), %ymm13
; AVX2-NEXT:    vmulps %ymm7, %ymm13, %ymm13
; AVX2-NEXT:    vaddps %ymm13, %ymm12, %ymm12
; AVX2-NEXT:    vbroadcastss 176(%rbp), %ymm13
; AVX2-NEXT:    vmulps %ymm0, %ymm13, %ymm13
; AVX2-NEXT:    vbroadcastss 180(%rbp), %ymm14
; AVX2-NEXT:    vmulps %ymm1, %ymm14, %ymm14
; AVX2-NEXT:    vaddps %ymm14, %ymm13, %ymm13
; AVX2-NEXT:    vbroadcastss 184(%rbp), %ymm14
; AVX2-NEXT:    vmulps %ymm2, %ymm14, %ymm14
; AVX2-NEXT:    vaddps %ymm14, %ymm13, %ymm13
; AVX2-NEXT:    vbroadcastss 188(%rbp), %ymm14
; AVX2-NEXT:    vmulps %ymm3, %ymm14, %ymm14
; AVX2-NEXT:    vaddps %ymm14, %ymm13, %ymm13
; AVX2-NEXT:    vbroadcastss 192(%rbp), %ymm14
; AVX2-NEXT:    vmulps %ymm4, %ymm14, %ymm14
; AVX2-NEXT:    vaddps %ymm14, %ymm13, %ymm13
; AVX2-NEXT:    vbroadcastss 196(%rbp), %ymm14
; AVX2-NEXT:    vmulps %ymm5, %ymm14, %ymm14
; AVX2-NEXT:    vaddps %ymm14, %ymm13, %ymm13
; AVX2-NEXT:    vbroadcastss 200(%rbp), %ymm14
; AVX2-NEXT:    vmulps %ymm6, %ymm14, %ymm14
; AVX2-NEXT:    vaddps %ymm14, %ymm13, %ymm13
; AVX2-NEXT:    vbroadcastss 204(%rbp), %ymm14
; AVX2-NEXT:    vmulps %ymm7, %ymm14, %ymm14
; AVX2-NEXT:    vaddps %ymm14, %ymm13, %ymm13
; AVX2-NEXT:    vbroadcastss 208(%rbp), %ymm14
; AVX2-NEXT:    vmulps %ymm0, %ymm14, %ymm14
; AVX2-NEXT:    vbroadcastss 212(%rbp), %ymm15
; AVX2-NEXT:    vmulps %ymm1, %ymm15, %ymm15
; AVX2-NEXT:    vaddps %ymm15, %ymm14, %ymm14
; AVX2-NEXT:    vbroadcastss 216(%rbp), %ymm15
; AVX2-NEXT:    vmulps %ymm2, %ymm15, %ymm15
; AVX2-NEXT:    vaddps %ymm15, %ymm14, %ymm14
; AVX2-NEXT:    vbroadcastss 220(%rbp), %ymm15
; AVX2-NEXT:    vmulps %ymm3, %ymm15, %ymm15
; AVX2-NEXT:    vaddps %ymm15, %ymm14, %ymm14
; AVX2-NEXT:    vbroadcastss 224(%rbp), %ymm15
; AVX2-NEXT:    vmulps %ymm4, %ymm15, %ymm15
; AVX2-NEXT:    vaddps %ymm15, %ymm14, %ymm14
; AVX2-NEXT:    vbroadcastss 228(%rbp), %ymm15
; AVX2-NEXT:    vmulps %ymm5, %ymm15, %ymm15
; AVX2-NEXT:    vaddps %ymm15, %ymm14, %ymm14
; AVX2-NEXT:    vbroadcastss 232(%rbp), %ymm15
; AVX2-NEXT:    vmulps %ymm6, %ymm15, %ymm15
; AVX2-NEXT:    vaddps %ymm15, %ymm14, %ymm14
; AVX2-NEXT:    vbroadcastss 236(%rbp), %ymm15
; AVX2-NEXT:    vmulps %ymm7, %ymm15, %ymm15
; AVX2-NEXT:    vaddps %ymm15, %ymm14, %ymm14
; AVX2-NEXT:    vbroadcastss 240(%rbp), %ymm15
; AVX2-NEXT:    vmulps %ymm0, %ymm15, %ymm0
; AVX2-NEXT:    vbroadcastss 244(%rbp), %ymm15
; AVX2-NEXT:    vmulps %ymm1, %ymm15, %ymm1
; AVX2-NEXT:    vaddps %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vbroadcastss 248(%rbp), %ymm1
; AVX2-NEXT:    vmulps %ymm1, %ymm2, %ymm1
; AVX2-NEXT:    vaddps %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vbroadcastss 252(%rbp), %ymm1
; AVX2-NEXT:    vmulps %ymm1, %ymm3, %ymm1
; AVX2-NEXT:    vaddps %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vbroadcastss 256(%rbp), %ymm1
; AVX2-NEXT:    vmulps %ymm1, %ymm4, %ymm1
; AVX2-NEXT:    vaddps %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vbroadcastss 260(%rbp), %ymm1
; AVX2-NEXT:    vmulps %ymm1, %ymm5, %ymm1
; AVX2-NEXT:    vaddps %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vbroadcastss 264(%rbp), %ymm1
; AVX2-NEXT:    vmulps %ymm1, %ymm6, %ymm1
; AVX2-NEXT:    vaddps %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vbroadcastss 268(%rbp), %ymm1
; AVX2-NEXT:    vmulps %ymm1, %ymm7, %ymm1
; AVX2-NEXT:    vaddps %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vmovaps %ymm0, 224(%rdi)
; AVX2-NEXT:    vmovaps %ymm14, 192(%rdi)
; AVX2-NEXT:    vmovaps %ymm13, 160(%rdi)
; AVX2-NEXT:    vmovaps %ymm12, 128(%rdi)
; AVX2-NEXT:    vmovaps %ymm11, 96(%rdi)
; AVX2-NEXT:    vmovaps %ymm10, 64(%rdi)
; AVX2-NEXT:    vmovaps %ymm9, 32(%rdi)
; AVX2-NEXT:    vmovaps %ymm8, (%rdi)
; AVX2-NEXT:    movq %rbp, %rsp
; AVX2-NEXT:    popq %rbp
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: test_mul8x8_f32:
; AVX512F:       # %bb.0: # %entry
; AVX512F-NEXT:    vextractf64x4 $1, %zmm0, %ymm11
; AVX512F-NEXT:    vextractf64x4 $1, %zmm1, %ymm10
; AVX512F-NEXT:    vextractf64x4 $1, %zmm2, %ymm9
; AVX512F-NEXT:    vextractf64x4 $1, %zmm3, %ymm8
; AVX512F-NEXT:    vbroadcastss %xmm4, %ymm12
; AVX512F-NEXT:    vmulps %ymm0, %ymm12, %ymm12
; AVX512F-NEXT:    vmovshdup {{.*#+}} xmm13 = xmm4[1,1,3,3]
; AVX512F-NEXT:    vbroadcastsd %xmm13, %ymm13
; AVX512F-NEXT:    vmulps %ymm13, %ymm11, %ymm13
; AVX512F-NEXT:    vaddps %ymm13, %ymm12, %ymm12
; AVX512F-NEXT:    vshufps {{.*#+}} xmm13 = xmm4[2,2,2,2]
; AVX512F-NEXT:    vbroadcastsd %xmm13, %ymm13
; AVX512F-NEXT:    vmulps %ymm1, %ymm13, %ymm13
; AVX512F-NEXT:    vaddps %ymm13, %ymm12, %ymm12
; AVX512F-NEXT:    vshufps {{.*#+}} xmm13 = xmm4[3,3,3,3]
; AVX512F-NEXT:    vbroadcastsd %xmm13, %ymm13
; AVX512F-NEXT:    vmulps %ymm13, %ymm10, %ymm13
; AVX512F-NEXT:    vaddps %ymm13, %ymm12, %ymm12
; AVX512F-NEXT:    vextractf128 $1, %ymm4, %xmm13
; AVX512F-NEXT:    vbroadcastss %xmm13, %ymm13
; AVX512F-NEXT:    vmulps %ymm2, %ymm13, %ymm13
; AVX512F-NEXT:    vaddps %ymm13, %ymm12, %ymm12
; AVX512F-NEXT:    vmovshdup {{.*#+}} ymm13 = ymm4[1,1,3,3,5,5,7,7]
; AVX512F-NEXT:    vpermpd {{.*#+}} ymm13 = ymm13[2,2,2,2]
; AVX512F-NEXT:    vmulps %ymm13, %ymm9, %ymm13
; AVX512F-NEXT:    vaddps %ymm13, %ymm12, %ymm12
; AVX512F-NEXT:    vshufps {{.*#+}} ymm13 = ymm4[2,2,2,2,6,6,6,6]
; AVX512F-NEXT:    vpermpd {{.*#+}} ymm13 = ymm13[2,2,2,2]
; AVX512F-NEXT:    vmulps %ymm3, %ymm13, %ymm13
; AVX512F-NEXT:    vaddps %ymm13, %ymm12, %ymm12
; AVX512F-NEXT:    vshufps {{.*#+}} ymm13 = ymm4[3,3,3,3,7,7,7,7]
; AVX512F-NEXT:    vpermpd {{.*#+}} ymm13 = ymm13[2,2,2,2]
; AVX512F-NEXT:    vmulps %ymm13, %ymm8, %ymm13
; AVX512F-NEXT:    vaddps %ymm13, %ymm12, %ymm12
; AVX512F-NEXT:    vmovups %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512F-NEXT:    vextractf64x4 $1, %zmm4, %ymm13
; AVX512F-NEXT:    vextractf32x4 $2, %zmm4, %xmm14
; AVX512F-NEXT:    vbroadcastss %xmm14, %ymm14
; AVX512F-NEXT:    vmulps %ymm0, %ymm14, %ymm14
; AVX512F-NEXT:    vmovshdup {{.*#+}} xmm15 = xmm13[1,1,3,3]
; AVX512F-NEXT:    vbroadcastsd %xmm15, %ymm15
; AVX512F-NEXT:    vmulps %ymm15, %ymm11, %ymm15
; AVX512F-NEXT:    vaddps %ymm15, %ymm14, %ymm14
; AVX512F-NEXT:    vshufps {{.*#+}} xmm15 = xmm13[2,2,2,2]
; AVX512F-NEXT:    vbroadcastsd %xmm15, %ymm15
; AVX512F-NEXT:    vmulps %ymm1, %ymm15, %ymm15
; AVX512F-NEXT:    vaddps %ymm15, %ymm14, %ymm14
; AVX512F-NEXT:    vshufps {{.*#+}} xmm15 = xmm13[3,3,3,3]
; AVX512F-NEXT:    vbroadcastsd %xmm15, %ymm15
; AVX512F-NEXT:    vmulps %ymm15, %ymm10, %ymm15
; AVX512F-NEXT:    vaddps %ymm15, %ymm14, %ymm14
; AVX512F-NEXT:    vextractf32x4 $3, %zmm4, %xmm4
; AVX512F-NEXT:    vbroadcastss %xmm4, %ymm4
; AVX512F-NEXT:    vmulps %ymm4, %ymm2, %ymm4
; AVX512F-NEXT:    vaddps %ymm4, %ymm14, %ymm4
; AVX512F-NEXT:    vmovshdup {{.*#+}} ymm14 = ymm13[1,1,3,3,5,5,7,7]
; AVX512F-NEXT:    vpermpd {{.*#+}} ymm14 = ymm14[2,2,2,2]
; AVX512F-NEXT:    vmulps %ymm14, %ymm9, %ymm14
; AVX512F-NEXT:    vaddps %ymm4, %ymm14, %ymm4
; AVX512F-NEXT:    vshufps {{.*#+}} ymm14 = ymm13[2,2,2,2,6,6,6,6]
; AVX512F-NEXT:    vpermpd {{.*#+}} ymm14 = ymm14[2,2,2,2]
; AVX512F-NEXT:    vmulps %ymm3, %ymm14, %ymm14
; AVX512F-NEXT:    vaddps %ymm4, %ymm14, %ymm4
; AVX512F-NEXT:    vshufps {{.*#+}} ymm13 = ymm13[3,3,3,3,7,7,7,7]
; AVX512F-NEXT:    vpermpd {{.*#+}} ymm13 = ymm13[2,2,2,2]
; AVX512F-NEXT:    vmulps %ymm13, %ymm8, %ymm13
; AVX512F-NEXT:    vaddps %ymm4, %ymm13, %ymm4
; AVX512F-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512F-NEXT:    vbroadcastss %xmm5, %ymm13
; AVX512F-NEXT:    vmulps %ymm0, %ymm13, %ymm13
; AVX512F-NEXT:    vmovshdup {{.*#+}} xmm14 = xmm5[1,1,3,3]
; AVX512F-NEXT:    vbroadcastsd %xmm14, %ymm14
; AVX512F-NEXT:    vmulps %ymm14, %ymm11, %ymm14
; AVX512F-NEXT:    vaddps %ymm14, %ymm13, %ymm13
; AVX512F-NEXT:    vshufps {{.*#+}} xmm14 = xmm5[2,2,2,2]
; AVX512F-NEXT:    vbroadcastsd %xmm14, %ymm14
; AVX512F-NEXT:    vmulps %ymm1, %ymm14, %ymm14
; AVX512F-NEXT:    vaddps %ymm14, %ymm13, %ymm13
; AVX512F-NEXT:    vshufps {{.*#+}} xmm14 = xmm5[3,3,3,3]
; AVX512F-NEXT:    vbroadcastsd %xmm14, %ymm14
; AVX512F-NEXT:    vmulps %ymm14, %ymm10, %ymm14
; AVX512F-NEXT:    vaddps %ymm14, %ymm13, %ymm13
; AVX512F-NEXT:    vextractf128 $1, %ymm5, %xmm14
; AVX512F-NEXT:    vbroadcastss %xmm14, %ymm14
; AVX512F-NEXT:    vmulps %ymm2, %ymm14, %ymm14
; AVX512F-NEXT:    vaddps %ymm14, %ymm13, %ymm13
; AVX512F-NEXT:    vmovshdup {{.*#+}} ymm14 = ymm5[1,1,3,3,5,5,7,7]
; AVX512F-NEXT:    vpermpd {{.*#+}} ymm14 = ymm14[2,2,2,2]
; AVX512F-NEXT:    vmulps %ymm14, %ymm9, %ymm14
; AVX512F-NEXT:    vaddps %ymm14, %ymm13, %ymm13
; AVX512F-NEXT:    vshufps {{.*#+}} ymm14 = ymm5[2,2,2,2,6,6,6,6]
; AVX512F-NEXT:    vpermpd {{.*#+}} ymm14 = ymm14[2,2,2,2]
; AVX512F-NEXT:    vmulps %ymm3, %ymm14, %ymm14
; AVX512F-NEXT:    vaddps %ymm14, %ymm13, %ymm13
; AVX512F-NEXT:    vshufps {{.*#+}} ymm14 = ymm5[3,3,3,3,7,7,7,7]
; AVX512F-NEXT:    vpermpd {{.*#+}} ymm14 = ymm14[2,2,2,2]
; AVX512F-NEXT:    vmulps %ymm14, %ymm8, %ymm14
; AVX512F-NEXT:    vaddps %ymm14, %ymm13, %ymm13
; AVX512F-NEXT:    vextractf64x4 $1, %zmm5, %ymm14
; AVX512F-NEXT:    vextractf32x4 $2, %zmm5, %xmm15
; AVX512F-NEXT:    vbroadcastss %xmm15, %ymm15
; AVX512F-NEXT:    vmulps %ymm0, %ymm15, %ymm15
; AVX512F-NEXT:    vmovshdup {{.*#+}} xmm12 = xmm14[1,1,3,3]
; AVX512F-NEXT:    vbroadcastsd %xmm12, %ymm12
; AVX512F-NEXT:    vmulps %ymm12, %ymm11, %ymm12
; AVX512F-NEXT:    vaddps %ymm12, %ymm15, %ymm12
; AVX512F-NEXT:    vshufps {{.*#+}} xmm15 = xmm14[2,2,2,2]
; AVX512F-NEXT:    vbroadcastsd %xmm15, %ymm15
; AVX512F-NEXT:    vmulps %ymm1, %ymm15, %ymm15
; AVX512F-NEXT:    vaddps %ymm15, %ymm12, %ymm12
; AVX512F-NEXT:    vshufps {{.*#+}} xmm15 = xmm14[3,3,3,3]
; AVX512F-NEXT:    vbroadcastsd %xmm15, %ymm15
; AVX512F-NEXT:    vmulps %ymm15, %ymm10, %ymm15
; AVX512F-NEXT:    vaddps %ymm15, %ymm12, %ymm12
; AVX512F-NEXT:    vextractf32x4 $3, %zmm5, %xmm5
; AVX512F-NEXT:    vbroadcastss %xmm5, %ymm5
; AVX512F-NEXT:    vmulps %ymm5, %ymm2, %ymm5
; AVX512F-NEXT:    vaddps %ymm5, %ymm12, %ymm5
; AVX512F-NEXT:    vmovshdup {{.*#+}} ymm12 = ymm14[1,1,3,3,5,5,7,7]
; AVX512F-NEXT:    vpermpd {{.*#+}} ymm12 = ymm12[2,2,2,2]
; AVX512F-NEXT:    vmulps %ymm12, %ymm9, %ymm12
; AVX512F-NEXT:    vaddps %ymm5, %ymm12, %ymm5
; AVX512F-NEXT:    vshufps {{.*#+}} ymm12 = ymm14[2,2,2,2,6,6,6,6]
; AVX512F-NEXT:    vpermpd {{.*#+}} ymm12 = ymm12[2,2,2,2]
; AVX512F-NEXT:    vmulps %ymm3, %ymm12, %ymm12
; AVX512F-NEXT:    vaddps %ymm5, %ymm12, %ymm5
; AVX512F-NEXT:    vshufps {{.*#+}} ymm12 = ymm14[3,3,3,3,7,7,7,7]
; AVX512F-NEXT:    vpermpd {{.*#+}} ymm12 = ymm12[2,2,2,2]
; AVX512F-NEXT:    vmulps %ymm12, %ymm8, %ymm12
; AVX512F-NEXT:    vaddps %ymm5, %ymm12, %ymm5
; AVX512F-NEXT:    vbroadcastss %xmm6, %ymm12
; AVX512F-NEXT:    vmulps %ymm0, %ymm12, %ymm12
; AVX512F-NEXT:    vmovshdup {{.*#+}} xmm14 = xmm6[1,1,3,3]
; AVX512F-NEXT:    vbroadcastsd %xmm14, %ymm14
; AVX512F-NEXT:    vmulps %ymm14, %ymm11, %ymm14
; AVX512F-NEXT:    vaddps %ymm14, %ymm12, %ymm12
; AVX512F-NEXT:    vshufps {{.*#+}} xmm14 = xmm6[2,2,2,2]
; AVX512F-NEXT:    vbroadcastsd %xmm14, %ymm14
; AVX512F-NEXT:    vmulps %ymm1, %ymm14, %ymm14
; AVX512F-NEXT:    vaddps %ymm14, %ymm12, %ymm12
; AVX512F-NEXT:    vshufps {{.*#+}} xmm14 = xmm6[3,3,3,3]
; AVX512F-NEXT:    vbroadcastsd %xmm14, %ymm14
; AVX512F-NEXT:    vmulps %ymm14, %ymm10, %ymm14
; AVX512F-NEXT:    vaddps %ymm14, %ymm12, %ymm12
; AVX512F-NEXT:    vextractf128 $1, %ymm6, %xmm14
; AVX512F-NEXT:    vbroadcastss %xmm14, %ymm14
; AVX512F-NEXT:    vmulps %ymm2, %ymm14, %ymm14
; AVX512F-NEXT:    vaddps %ymm14, %ymm12, %ymm12
; AVX512F-NEXT:    vmovshdup {{.*#+}} ymm14 = ymm6[1,1,3,3,5,5,7,7]
; AVX512F-NEXT:    vpermpd {{.*#+}} ymm14 = ymm14[2,2,2,2]
; AVX512F-NEXT:    vmulps %ymm14, %ymm9, %ymm14
; AVX512F-NEXT:    vaddps %ymm14, %ymm12, %ymm12
; AVX512F-NEXT:    vshufps {{.*#+}} ymm14 = ymm6[2,2,2,2,6,6,6,6]
; AVX512F-NEXT:    vpermpd {{.*#+}} ymm14 = ymm14[2,2,2,2]
; AVX512F-NEXT:    vmulps %ymm3, %ymm14, %ymm14
; AVX512F-NEXT:    vaddps %ymm14, %ymm12, %ymm12
; AVX512F-NEXT:    vshufps {{.*#+}} ymm14 = ymm6[3,3,3,3,7,7,7,7]
; AVX512F-NEXT:    vpermpd {{.*#+}} ymm14 = ymm14[2,2,2,2]
; AVX512F-NEXT:    vmulps %ymm14, %ymm8, %ymm14
; AVX512F-NEXT:    vaddps %ymm14, %ymm12, %ymm14
; AVX512F-NEXT:    vextractf32x4 $2, %zmm6, %xmm12
; AVX512F-NEXT:    vbroadcastss %xmm12, %ymm12
; AVX512F-NEXT:    vmulps %ymm0, %ymm12, %ymm12
; AVX512F-NEXT:    vextractf64x4 $1, %zmm6, %ymm15
; AVX512F-NEXT:    vmovshdup {{.*#+}} xmm4 = xmm15[1,1,3,3]
; AVX512F-NEXT:    vbroadcastsd %xmm4, %ymm4
; AVX512F-NEXT:    vmulps %ymm4, %ymm11, %ymm4
; AVX512F-NEXT:    vaddps %ymm4, %ymm12, %ymm4
; AVX512F-NEXT:    vshufps {{.*#+}} xmm12 = xmm15[2,2,2,2]
; AVX512F-NEXT:    vbroadcastsd %xmm12, %ymm12
; AVX512F-NEXT:    vmulps %ymm1, %ymm12, %ymm12
; AVX512F-NEXT:    vaddps %ymm4, %ymm12, %ymm4
; AVX512F-NEXT:    vshufps {{.*#+}} xmm12 = xmm15[3,3,3,3]
; AVX512F-NEXT:    vbroadcastsd %xmm12, %ymm12
; AVX512F-NEXT:    vmulps %ymm12, %ymm10, %ymm12
; AVX512F-NEXT:    vaddps %ymm4, %ymm12, %ymm4
; AVX512F-NEXT:    vextractf32x4 $3, %zmm6, %xmm6
; AVX512F-NEXT:    vbroadcastss %xmm6, %ymm6
; AVX512F-NEXT:    vmulps %ymm6, %ymm2, %ymm6
; AVX512F-NEXT:    vaddps %ymm6, %ymm4, %ymm4
; AVX512F-NEXT:    vmovshdup {{.*#+}} ymm6 = ymm15[1,1,3,3,5,5,7,7]
; AVX512F-NEXT:    vpermpd {{.*#+}} ymm6 = ymm6[2,2,2,2]
; AVX512F-NEXT:    vmulps %ymm6, %ymm9, %ymm6
; AVX512F-NEXT:    vaddps %ymm6, %ymm4, %ymm4
; AVX512F-NEXT:    vshufps {{.*#+}} ymm6 = ymm15[2,2,2,2,6,6,6,6]
; AVX512F-NEXT:    vpermpd {{.*#+}} ymm6 = ymm6[2,2,2,2]
; AVX512F-NEXT:    vmulps %ymm6, %ymm3, %ymm6
; AVX512F-NEXT:    vaddps %ymm6, %ymm4, %ymm4
; AVX512F-NEXT:    vshufps {{.*#+}} ymm6 = ymm15[3,3,3,3,7,7,7,7]
; AVX512F-NEXT:    vpermpd {{.*#+}} ymm6 = ymm6[2,2,2,2]
; AVX512F-NEXT:    vmulps %ymm6, %ymm8, %ymm6
; AVX512F-NEXT:    vaddps %ymm6, %ymm4, %ymm6
; AVX512F-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
; AVX512F-NEXT:    vinsertf64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 32-byte Folded Reload
; AVX512F-NEXT:    vbroadcastss %xmm7, %ymm12
; AVX512F-NEXT:    vmulps %ymm0, %ymm12, %ymm12
; AVX512F-NEXT:    vmovshdup {{.*#+}} xmm15 = xmm7[1,1,3,3]
; AVX512F-NEXT:    vbroadcastsd %xmm15, %ymm15
; AVX512F-NEXT:    vmulps %ymm15, %ymm11, %ymm15
; AVX512F-NEXT:    vaddps %ymm15, %ymm12, %ymm12
; AVX512F-NEXT:    vshufps {{.*#+}} xmm15 = xmm7[2,2,2,2]
; AVX512F-NEXT:    vbroadcastsd %xmm15, %ymm15
; AVX512F-NEXT:    vmulps %ymm1, %ymm15, %ymm15
; AVX512F-NEXT:    vaddps %ymm15, %ymm12, %ymm12
; AVX512F-NEXT:    vshufps {{.*#+}} xmm15 = xmm7[3,3,3,3]
; AVX512F-NEXT:    vbroadcastsd %xmm15, %ymm15
; AVX512F-NEXT:    vmulps %ymm15, %ymm10, %ymm15
; AVX512F-NEXT:    vaddps %ymm15, %ymm12, %ymm12
; AVX512F-NEXT:    vextractf128 $1, %ymm7, %xmm15
; AVX512F-NEXT:    vbroadcastss %xmm15, %ymm15
; AVX512F-NEXT:    vmulps %ymm2, %ymm15, %ymm15
; AVX512F-NEXT:    vaddps %ymm15, %ymm12, %ymm12
; AVX512F-NEXT:    vmovshdup {{.*#+}} ymm15 = ymm7[1,1,3,3,5,5,7,7]
; AVX512F-NEXT:    vpermpd {{.*#+}} ymm15 = ymm15[2,2,2,2]
; AVX512F-NEXT:    vmulps %ymm15, %ymm9, %ymm15
; AVX512F-NEXT:    vaddps %ymm15, %ymm12, %ymm12
; AVX512F-NEXT:    vshufps {{.*#+}} ymm15 = ymm7[2,2,2,2,6,6,6,6]
; AVX512F-NEXT:    vpermpd {{.*#+}} ymm15 = ymm15[2,2,2,2]
; AVX512F-NEXT:    vmulps %ymm3, %ymm15, %ymm15
; AVX512F-NEXT:    vaddps %ymm15, %ymm12, %ymm12
; AVX512F-NEXT:    vshufps {{.*#+}} ymm15 = ymm7[3,3,3,3,7,7,7,7]
; AVX512F-NEXT:    vpermpd {{.*#+}} ymm15 = ymm15[2,2,2,2]
; AVX512F-NEXT:    vmulps %ymm15, %ymm8, %ymm15
; AVX512F-NEXT:    vaddps %ymm15, %ymm12, %ymm12
; AVX512F-NEXT:    vinsertf64x4 $1, %ymm5, %zmm13, %zmm5
; AVX512F-NEXT:    vextractf64x4 $1, %zmm7, %ymm13
; AVX512F-NEXT:    vextractf32x4 $2, %zmm7, %xmm15
; AVX512F-NEXT:    vbroadcastss %xmm15, %ymm15
; AVX512F-NEXT:    vmulps %ymm0, %ymm15, %ymm0
; AVX512F-NEXT:    vmovshdup {{.*#+}} xmm15 = xmm13[1,1,3,3]
; AVX512F-NEXT:    vbroadcastsd %xmm15, %ymm15
; AVX512F-NEXT:    vmulps %ymm15, %ymm11, %ymm11
; AVX512F-NEXT:    vaddps %ymm0, %ymm11, %ymm0
; AVX512F-NEXT:    vshufps {{.*#+}} xmm11 = xmm13[2,2,2,2]
; AVX512F-NEXT:    vbroadcastsd %xmm11, %ymm11
; AVX512F-NEXT:    vmulps %ymm1, %ymm11, %ymm1
; AVX512F-NEXT:    vaddps %ymm1, %ymm0, %ymm0
; AVX512F-NEXT:    vshufps {{.*#+}} xmm1 = xmm13[3,3,3,3]
; AVX512F-NEXT:    vbroadcastsd %xmm1, %ymm1
; AVX512F-NEXT:    vmulps %ymm1, %ymm10, %ymm1
; AVX512F-NEXT:    vaddps %ymm1, %ymm0, %ymm0
; AVX512F-NEXT:    vextractf32x4 $3, %zmm7, %xmm1
; AVX512F-NEXT:    vbroadcastss %xmm1, %ymm1
; AVX512F-NEXT:    vmulps %ymm1, %ymm2, %ymm1
; AVX512F-NEXT:    vaddps %ymm1, %ymm0, %ymm0
; AVX512F-NEXT:    vmovshdup {{.*#+}} ymm1 = ymm13[1,1,3,3,5,5,7,7]
; AVX512F-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2]
; AVX512F-NEXT:    vmulps %ymm1, %ymm9, %ymm1
; AVX512F-NEXT:    vaddps %ymm1, %ymm0, %ymm0
; AVX512F-NEXT:    vshufps {{.*#+}} ymm1 = ymm13[2,2,2,2,6,6,6,6]
; AVX512F-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2]
; AVX512F-NEXT:    vmulps %ymm1, %ymm3, %ymm1
; AVX512F-NEXT:    vaddps %ymm1, %ymm0, %ymm0
; AVX512F-NEXT:    vshufps {{.*#+}} ymm1 = ymm13[3,3,3,3,7,7,7,7]
; AVX512F-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2]
; AVX512F-NEXT:    vmulps %ymm1, %ymm8, %ymm1
; AVX512F-NEXT:    vaddps %ymm1, %ymm0, %ymm0
; AVX512F-NEXT:    vinsertf64x4 $1, %ymm6, %zmm14, %zmm2
; AVX512F-NEXT:    vinsertf64x4 $1, %ymm0, %zmm12, %zmm3
; AVX512F-NEXT:    vmovaps %zmm4, %zmm0
; AVX512F-NEXT:    vmovaps %zmm5, %zmm1
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: test_mul8x8_f32:
; AVX512VL:       # %bb.0: # %entry
; AVX512VL-NEXT:    vextractf64x4 $1, %zmm0, %ymm11
; AVX512VL-NEXT:    vextractf64x4 $1, %zmm1, %ymm10
; AVX512VL-NEXT:    vextractf64x4 $1, %zmm2, %ymm9
; AVX512VL-NEXT:    vextractf64x4 $1, %zmm3, %ymm8
; AVX512VL-NEXT:    vbroadcastss %xmm4, %ymm12
; AVX512VL-NEXT:    vmulps %ymm0, %ymm12, %ymm12
; AVX512VL-NEXT:    vmovshdup {{.*#+}} xmm13 = xmm4[1,1,3,3]
; AVX512VL-NEXT:    vbroadcastsd %xmm13, %ymm13
; AVX512VL-NEXT:    vmulps %ymm13, %ymm11, %ymm13
; AVX512VL-NEXT:    vaddps %ymm13, %ymm12, %ymm12
; AVX512VL-NEXT:    vshufps {{.*#+}} xmm13 = xmm4[2,2,2,2]
; AVX512VL-NEXT:    vbroadcastsd %xmm13, %ymm13
; AVX512VL-NEXT:    vmulps %ymm1, %ymm13, %ymm13
; AVX512VL-NEXT:    vaddps %ymm13, %ymm12, %ymm12
; AVX512VL-NEXT:    vshufps {{.*#+}} xmm13 = xmm4[3,3,3,3]
; AVX512VL-NEXT:    vbroadcastsd %xmm13, %ymm13
; AVX512VL-NEXT:    vmulps %ymm13, %ymm10, %ymm13
; AVX512VL-NEXT:    vaddps %ymm13, %ymm12, %ymm12
; AVX512VL-NEXT:    vextractf128 $1, %ymm4, %xmm13
; AVX512VL-NEXT:    vbroadcastss %xmm13, %ymm13
; AVX512VL-NEXT:    vmulps %ymm2, %ymm13, %ymm13
; AVX512VL-NEXT:    vaddps %ymm13, %ymm12, %ymm12
; AVX512VL-NEXT:    vmovshdup {{.*#+}} ymm13 = ymm4[1,1,3,3,5,5,7,7]
; AVX512VL-NEXT:    vpermpd {{.*#+}} ymm13 = ymm13[2,2,2,2]
; AVX512VL-NEXT:    vmulps %ymm13, %ymm9, %ymm13
; AVX512VL-NEXT:    vaddps %ymm13, %ymm12, %ymm12
; AVX512VL-NEXT:    vshufps {{.*#+}} ymm13 = ymm4[2,2,2,2,6,6,6,6]
; AVX512VL-NEXT:    vpermpd {{.*#+}} ymm13 = ymm13[2,2,2,2]
; AVX512VL-NEXT:    vmulps %ymm3, %ymm13, %ymm13
; AVX512VL-NEXT:    vaddps %ymm13, %ymm12, %ymm12
; AVX512VL-NEXT:    vshufps {{.*#+}} ymm13 = ymm4[3,3,3,3,7,7,7,7]
; AVX512VL-NEXT:    vpermpd {{.*#+}} ymm13 = ymm13[2,2,2,2]
; AVX512VL-NEXT:    vmulps %ymm13, %ymm8, %ymm13
; AVX512VL-NEXT:    vaddps %ymm13, %ymm12, %ymm12
; AVX512VL-NEXT:    vextractf64x4 $1, %zmm4, %ymm13
; AVX512VL-NEXT:    vextractf32x4 $2, %zmm4, %xmm14
; AVX512VL-NEXT:    vbroadcastss %xmm14, %ymm14
; AVX512VL-NEXT:    vmulps %ymm0, %ymm14, %ymm14
; AVX512VL-NEXT:    vmovshdup {{.*#+}} xmm15 = xmm13[1,1,3,3]
; AVX512VL-NEXT:    vbroadcastsd %xmm15, %ymm15
; AVX512VL-NEXT:    vmulps %ymm15, %ymm11, %ymm15
; AVX512VL-NEXT:    vaddps %ymm15, %ymm14, %ymm14
; AVX512VL-NEXT:    vshufps {{.*#+}} xmm15 = xmm13[2,2,2,2]
; AVX512VL-NEXT:    vbroadcastsd %xmm15, %ymm15
; AVX512VL-NEXT:    vmulps %ymm1, %ymm15, %ymm15
; AVX512VL-NEXT:    vaddps %ymm15, %ymm14, %ymm14
; AVX512VL-NEXT:    vshufps {{.*#+}} xmm15 = xmm13[3,3,3,3]
; AVX512VL-NEXT:    vbroadcastsd %xmm15, %ymm15
; AVX512VL-NEXT:    vmulps %ymm15, %ymm10, %ymm15
; AVX512VL-NEXT:    vaddps %ymm15, %ymm14, %ymm14
; AVX512VL-NEXT:    vextractf32x4 $3, %zmm4, %xmm4
; AVX512VL-NEXT:    vbroadcastss %xmm4, %ymm4
; AVX512VL-NEXT:    vmulps %ymm4, %ymm2, %ymm4
; AVX512VL-NEXT:    vaddps %ymm4, %ymm14, %ymm4
; AVX512VL-NEXT:    vmovshdup {{.*#+}} ymm14 = ymm13[1,1,3,3,5,5,7,7]
; AVX512VL-NEXT:    vpermpd {{.*#+}} ymm14 = ymm14[2,2,2,2]
; AVX512VL-NEXT:    vmulps %ymm14, %ymm9, %ymm14
; AVX512VL-NEXT:    vaddps %ymm4, %ymm14, %ymm4
; AVX512VL-NEXT:    vshufps {{.*#+}} ymm14 = ymm13[2,2,2,2,6,6,6,6]
; AVX512VL-NEXT:    vpermpd {{.*#+}} ymm14 = ymm14[2,2,2,2]
; AVX512VL-NEXT:    vmulps %ymm3, %ymm14, %ymm14
; AVX512VL-NEXT:    vaddps %ymm4, %ymm14, %ymm4
; AVX512VL-NEXT:    vshufps {{.*#+}} ymm13 = ymm13[3,3,3,3,7,7,7,7]
; AVX512VL-NEXT:    vpermpd {{.*#+}} ymm13 = ymm13[2,2,2,2]
; AVX512VL-NEXT:    vmulps %ymm13, %ymm8, %ymm13
; AVX512VL-NEXT:    vaddps %ymm4, %ymm13, %ymm4
; AVX512VL-NEXT:    vbroadcastss %xmm5, %ymm13
; AVX512VL-NEXT:    vmulps %ymm0, %ymm13, %ymm13
; AVX512VL-NEXT:    vmovshdup {{.*#+}} xmm14 = xmm5[1,1,3,3]
; AVX512VL-NEXT:    vbroadcastsd %xmm14, %ymm14
; AVX512VL-NEXT:    vmulps %ymm14, %ymm11, %ymm14
; AVX512VL-NEXT:    vaddps %ymm14, %ymm13, %ymm13
; AVX512VL-NEXT:    vshufps {{.*#+}} xmm14 = xmm5[2,2,2,2]
; AVX512VL-NEXT:    vbroadcastsd %xmm14, %ymm14
; AVX512VL-NEXT:    vmulps %ymm1, %ymm14, %ymm14
; AVX512VL-NEXT:    vaddps %ymm14, %ymm13, %ymm13
; AVX512VL-NEXT:    vshufps {{.*#+}} xmm14 = xmm5[3,3,3,3]
; AVX512VL-NEXT:    vbroadcastsd %xmm14, %ymm14
; AVX512VL-NEXT:    vmulps %ymm14, %ymm10, %ymm14
; AVX512VL-NEXT:    vaddps %ymm14, %ymm13, %ymm13
; AVX512VL-NEXT:    vextractf128 $1, %ymm5, %xmm14
; AVX512VL-NEXT:    vbroadcastss %xmm14, %ymm14
; AVX512VL-NEXT:    vmulps %ymm2, %ymm14, %ymm14
; AVX512VL-NEXT:    vaddps %ymm14, %ymm13, %ymm13
; AVX512VL-NEXT:    vmovshdup {{.*#+}} ymm14 = ymm5[1,1,3,3,5,5,7,7]
; AVX512VL-NEXT:    vpermpd {{.*#+}} ymm14 = ymm14[2,2,2,2]
; AVX512VL-NEXT:    vmulps %ymm14, %ymm9, %ymm14
; AVX512VL-NEXT:    vaddps %ymm14, %ymm13, %ymm13
; AVX512VL-NEXT:    vshufps {{.*#+}} ymm14 = ymm5[2,2,2,2,6,6,6,6]
; AVX512VL-NEXT:    vpermpd {{.*#+}} ymm14 = ymm14[2,2,2,2]
; AVX512VL-NEXT:    vmulps %ymm3, %ymm14, %ymm14
; AVX512VL-NEXT:    vaddps %ymm14, %ymm13, %ymm13
; AVX512VL-NEXT:    vshufps {{.*#+}} ymm14 = ymm5[3,3,3,3,7,7,7,7]
; AVX512VL-NEXT:    vpermpd {{.*#+}} ymm14 = ymm14[2,2,2,2]
; AVX512VL-NEXT:    vmulps %ymm14, %ymm8, %ymm14
; AVX512VL-NEXT:    vaddps %ymm14, %ymm13, %ymm13
; AVX512VL-NEXT:    vextractf64x4 $1, %zmm5, %ymm14
; AVX512VL-NEXT:    vextractf32x4 $2, %zmm5, %xmm15
; AVX512VL-NEXT:    vbroadcastss %xmm15, %ymm15
; AVX512VL-NEXT:    vmulps %ymm0, %ymm15, %ymm15
; AVX512VL-NEXT:    vmovshdup {{.*#+}} xmm16 = xmm14[1,1,3,3]
; AVX512VL-NEXT:    vbroadcastsd %xmm16, %ymm16
; AVX512VL-NEXT:    vmulps %ymm16, %ymm11, %ymm16
; AVX512VL-NEXT:    vaddps %ymm16, %ymm15, %ymm15
; AVX512VL-NEXT:    vshufps {{.*#+}} xmm16 = xmm14[2,2,2,2]
; AVX512VL-NEXT:    vbroadcastsd %xmm16, %ymm16
; AVX512VL-NEXT:    vmulps %ymm16, %ymm1, %ymm16
; AVX512VL-NEXT:    vaddps %ymm16, %ymm15, %ymm15
; AVX512VL-NEXT:    vshufps {{.*#+}} xmm16 = xmm14[3,3,3,3]
; AVX512VL-NEXT:    vbroadcastsd %xmm16, %ymm16
; AVX512VL-NEXT:    vmulps %ymm16, %ymm10, %ymm16
; AVX512VL-NEXT:    vaddps %ymm16, %ymm15, %ymm15
; AVX512VL-NEXT:    vextractf32x4 $3, %zmm5, %xmm5
; AVX512VL-NEXT:    vbroadcastss %xmm5, %ymm5
; AVX512VL-NEXT:    vmulps %ymm5, %ymm2, %ymm5
; AVX512VL-NEXT:    vaddps %ymm5, %ymm15, %ymm5
; AVX512VL-NEXT:    vmovshdup {{.*#+}} ymm15 = ymm14[1,1,3,3,5,5,7,7]
; AVX512VL-NEXT:    vpermpd {{.*#+}} ymm15 = ymm15[2,2,2,2]
; AVX512VL-NEXT:    vmulps %ymm15, %ymm9, %ymm15
; AVX512VL-NEXT:    vaddps %ymm5, %ymm15, %ymm5
; AVX512VL-NEXT:    vshufps {{.*#+}} ymm15 = ymm14[2,2,2,2,6,6,6,6]
; AVX512VL-NEXT:    vpermpd {{.*#+}} ymm15 = ymm15[2,2,2,2]
; AVX512VL-NEXT:    vmulps %ymm3, %ymm15, %ymm15
; AVX512VL-NEXT:    vaddps %ymm5, %ymm15, %ymm5
; AVX512VL-NEXT:    vshufps {{.*#+}} ymm14 = ymm14[3,3,3,3,7,7,7,7]
; AVX512VL-NEXT:    vpermpd {{.*#+}} ymm14 = ymm14[2,2,2,2]
; AVX512VL-NEXT:    vmulps %ymm14, %ymm8, %ymm14
; AVX512VL-NEXT:    vaddps %ymm5, %ymm14, %ymm5
; AVX512VL-NEXT:    vbroadcastss %xmm6, %ymm14
; AVX512VL-NEXT:    vmulps %ymm0, %ymm14, %ymm14
; AVX512VL-NEXT:    vmovshdup {{.*#+}} xmm15 = xmm6[1,1,3,3]
; AVX512VL-NEXT:    vbroadcastsd %xmm15, %ymm15
; AVX512VL-NEXT:    vmulps %ymm15, %ymm11, %ymm15
; AVX512VL-NEXT:    vaddps %ymm15, %ymm14, %ymm14
; AVX512VL-NEXT:    vshufps {{.*#+}} xmm15 = xmm6[2,2,2,2]
; AVX512VL-NEXT:    vbroadcastsd %xmm15, %ymm15
; AVX512VL-NEXT:    vmulps %ymm1, %ymm15, %ymm15
; AVX512VL-NEXT:    vaddps %ymm15, %ymm14, %ymm14
; AVX512VL-NEXT:    vshufps {{.*#+}} xmm15 = xmm6[3,3,3,3]
; AVX512VL-NEXT:    vbroadcastsd %xmm15, %ymm15
; AVX512VL-NEXT:    vmulps %ymm15, %ymm10, %ymm15
; AVX512VL-NEXT:    vaddps %ymm15, %ymm14, %ymm14
; AVX512VL-NEXT:    vextractf128 $1, %ymm6, %xmm15
; AVX512VL-NEXT:    vbroadcastss %xmm15, %ymm15
; AVX512VL-NEXT:    vmulps %ymm2, %ymm15, %ymm15
; AVX512VL-NEXT:    vaddps %ymm15, %ymm14, %ymm14
; AVX512VL-NEXT:    vmovshdup {{.*#+}} ymm15 = ymm6[1,1,3,3,5,5,7,7]
; AVX512VL-NEXT:    vpermpd {{.*#+}} ymm15 = ymm15[2,2,2,2]
; AVX512VL-NEXT:    vmulps %ymm15, %ymm9, %ymm15
; AVX512VL-NEXT:    vaddps %ymm15, %ymm14, %ymm14
; AVX512VL-NEXT:    vshufps {{.*#+}} ymm15 = ymm6[2,2,2,2,6,6,6,6]
; AVX512VL-NEXT:    vpermpd {{.*#+}} ymm15 = ymm15[2,2,2,2]
; AVX512VL-NEXT:    vmulps %ymm3, %ymm15, %ymm15
; AVX512VL-NEXT:    vaddps %ymm15, %ymm14, %ymm14
; AVX512VL-NEXT:    vshufps {{.*#+}} ymm15 = ymm6[3,3,3,3,7,7,7,7]
; AVX512VL-NEXT:    vpermpd {{.*#+}} ymm15 = ymm15[2,2,2,2]
; AVX512VL-NEXT:    vmulps %ymm15, %ymm8, %ymm15
; AVX512VL-NEXT:    vaddps %ymm15, %ymm14, %ymm14
; AVX512VL-NEXT:    vextractf64x4 $1, %zmm6, %ymm15
; AVX512VL-NEXT:    vextractf32x4 $2, %zmm6, %xmm16
; AVX512VL-NEXT:    vbroadcastss %xmm16, %ymm16
; AVX512VL-NEXT:    vmulps %ymm16, %ymm0, %ymm16
; AVX512VL-NEXT:    vmovshdup {{.*#+}} xmm17 = xmm15[1,1,3,3]
; AVX512VL-NEXT:    vbroadcastsd %xmm17, %ymm17
; AVX512VL-NEXT:    vmulps %ymm17, %ymm11, %ymm17
; AVX512VL-NEXT:    vaddps %ymm17, %ymm16, %ymm16
; AVX512VL-NEXT:    vshufps {{.*#+}} xmm17 = xmm15[2,2,2,2]
; AVX512VL-NEXT:    vbroadcastsd %xmm17, %ymm17
; AVX512VL-NEXT:    vmulps %ymm17, %ymm1, %ymm17
; AVX512VL-NEXT:    vaddps %ymm17, %ymm16, %ymm16
; AVX512VL-NEXT:    vshufps {{.*#+}} xmm17 = xmm15[3,3,3,3]
; AVX512VL-NEXT:    vbroadcastsd %xmm17, %ymm17
; AVX512VL-NEXT:    vmulps %ymm17, %ymm10, %ymm17
; AVX512VL-NEXT:    vaddps %ymm17, %ymm16, %ymm16
; AVX512VL-NEXT:    vextractf32x4 $3, %zmm6, %xmm6
; AVX512VL-NEXT:    vbroadcastss %xmm6, %ymm6
; AVX512VL-NEXT:    vmulps %ymm6, %ymm2, %ymm6
; AVX512VL-NEXT:    vaddps %ymm6, %ymm16, %ymm6
; AVX512VL-NEXT:    vmovshdup {{.*#+}} ymm16 = ymm15[1,1,3,3,5,5,7,7]
; AVX512VL-NEXT:    vpermpd {{.*#+}} ymm16 = ymm16[2,2,2,2]
; AVX512VL-NEXT:    vmulps %ymm16, %ymm9, %ymm16
; AVX512VL-NEXT:    vaddps %ymm16, %ymm6, %ymm6
; AVX512VL-NEXT:    vshufps {{.*#+}} ymm16 = ymm15[2,2,2,2,6,6,6,6]
; AVX512VL-NEXT:    vpermpd {{.*#+}} ymm16 = ymm16[2,2,2,2]
; AVX512VL-NEXT:    vmulps %ymm16, %ymm3, %ymm16
; AVX512VL-NEXT:    vaddps %ymm16, %ymm6, %ymm6
; AVX512VL-NEXT:    vshufps {{.*#+}} ymm15 = ymm15[3,3,3,3,7,7,7,7]
; AVX512VL-NEXT:    vpermpd {{.*#+}} ymm15 = ymm15[2,2,2,2]
; AVX512VL-NEXT:    vmulps %ymm15, %ymm8, %ymm15
; AVX512VL-NEXT:    vaddps %ymm6, %ymm15, %ymm6
; AVX512VL-NEXT:    vbroadcastss %xmm7, %ymm15
; AVX512VL-NEXT:    vmulps %ymm0, %ymm15, %ymm15
; AVX512VL-NEXT:    vmovshdup {{.*#+}} xmm16 = xmm7[1,1,3,3]
; AVX512VL-NEXT:    vbroadcastsd %xmm16, %ymm16
; AVX512VL-NEXT:    vmulps %ymm16, %ymm11, %ymm16
; AVX512VL-NEXT:    vaddps %ymm16, %ymm15, %ymm15
; AVX512VL-NEXT:    vshufps {{.*#+}} xmm16 = xmm7[2,2,2,2]
; AVX512VL-NEXT:    vbroadcastsd %xmm16, %ymm16
; AVX512VL-NEXT:    vmulps %ymm16, %ymm1, %ymm16
; AVX512VL-NEXT:    vaddps %ymm16, %ymm15, %ymm15
; AVX512VL-NEXT:    vshufps {{.*#+}} xmm16 = xmm7[3,3,3,3]
; AVX512VL-NEXT:    vbroadcastsd %xmm16, %ymm16
; AVX512VL-NEXT:    vmulps %ymm16, %ymm10, %ymm16
; AVX512VL-NEXT:    vaddps %ymm16, %ymm15, %ymm15
; AVX512VL-NEXT:    vextractf32x4 $1, %ymm7, %xmm16
; AVX512VL-NEXT:    vbroadcastss %xmm16, %ymm16
; AVX512VL-NEXT:    vmulps %ymm16, %ymm2, %ymm16
; AVX512VL-NEXT:    vaddps %ymm16, %ymm15, %ymm15
; AVX512VL-NEXT:    vmovshdup {{.*#+}} ymm16 = ymm7[1,1,3,3,5,5,7,7]
; AVX512VL-NEXT:    vpermpd {{.*#+}} ymm16 = ymm16[2,2,2,2]
; AVX512VL-NEXT:    vmulps %ymm16, %ymm9, %ymm16
; AVX512VL-NEXT:    vaddps %ymm16, %ymm15, %ymm15
; AVX512VL-NEXT:    vshufps {{.*#+}} ymm16 = ymm7[2,2,2,2,6,6,6,6]
; AVX512VL-NEXT:    vpermpd {{.*#+}} ymm16 = ymm16[2,2,2,2]
; AVX512VL-NEXT:    vmulps %ymm16, %ymm3, %ymm16
; AVX512VL-NEXT:    vaddps %ymm16, %ymm15, %ymm15
; AVX512VL-NEXT:    vshufps {{.*#+}} ymm16 = ymm7[3,3,3,3,7,7,7,7]
; AVX512VL-NEXT:    vpermpd {{.*#+}} ymm16 = ymm16[2,2,2,2]
; AVX512VL-NEXT:    vmulps %ymm16, %ymm8, %ymm16
; AVX512VL-NEXT:    vaddps %ymm16, %ymm15, %ymm15
; AVX512VL-NEXT:    vextractf64x4 $1, %zmm7, %ymm16
; AVX512VL-NEXT:    vextractf32x4 $2, %zmm7, %xmm17
; AVX512VL-NEXT:    vbroadcastss %xmm17, %ymm17
; AVX512VL-NEXT:    vmulps %ymm17, %ymm0, %ymm0
; AVX512VL-NEXT:    vmovshdup {{.*#+}} xmm17 = xmm16[1,1,3,3]
; AVX512VL-NEXT:    vbroadcastsd %xmm17, %ymm17
; AVX512VL-NEXT:    vmulps %ymm17, %ymm11, %ymm11
; AVX512VL-NEXT:    vaddps %ymm0, %ymm11, %ymm0
; AVX512VL-NEXT:    vshufps {{.*#+}} xmm11 = xmm16[2,2,2,2]
; AVX512VL-NEXT:    vbroadcastsd %xmm11, %ymm11
; AVX512VL-NEXT:    vmulps %ymm1, %ymm11, %ymm1
; AVX512VL-NEXT:    vaddps %ymm1, %ymm0, %ymm0
; AVX512VL-NEXT:    vshufps {{.*#+}} xmm1 = xmm16[3,3,3,3]
; AVX512VL-NEXT:    vbroadcastsd %xmm1, %ymm1
; AVX512VL-NEXT:    vmulps %ymm1, %ymm10, %ymm1
; AVX512VL-NEXT:    vaddps %ymm1, %ymm0, %ymm0
; AVX512VL-NEXT:    vextractf32x4 $3, %zmm7, %xmm1
; AVX512VL-NEXT:    vbroadcastss %xmm1, %ymm1
; AVX512VL-NEXT:    vmulps %ymm1, %ymm2, %ymm1
; AVX512VL-NEXT:    vaddps %ymm1, %ymm0, %ymm0
; AVX512VL-NEXT:    vmovshdup {{.*#+}} ymm1 = ymm16[1,1,3,3,5,5,7,7]
; AVX512VL-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2]
; AVX512VL-NEXT:    vmulps %ymm1, %ymm9, %ymm1
; AVX512VL-NEXT:    vaddps %ymm1, %ymm0, %ymm0
; AVX512VL-NEXT:    vshufps {{.*#+}} ymm1 = ymm16[2,2,2,2,6,6,6,6]
; AVX512VL-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2]
; AVX512VL-NEXT:    vmulps %ymm1, %ymm3, %ymm1
; AVX512VL-NEXT:    vaddps %ymm1, %ymm0, %ymm0
; AVX512VL-NEXT:    vshufps {{.*#+}} ymm1 = ymm16[3,3,3,3,7,7,7,7]
; AVX512VL-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2]
; AVX512VL-NEXT:    vmulps %ymm1, %ymm8, %ymm1
; AVX512VL-NEXT:    vaddps %ymm1, %ymm0, %ymm3
; AVX512VL-NEXT:    vinsertf64x4 $1, %ymm4, %zmm12, %zmm0
; AVX512VL-NEXT:    vinsertf64x4 $1, %ymm5, %zmm13, %zmm1
; AVX512VL-NEXT:    vinsertf64x4 $1, %ymm6, %zmm14, %zmm2
; AVX512VL-NEXT:    vinsertf64x4 $1, %ymm3, %zmm15, %zmm3
; AVX512VL-NEXT:    retq
entry:
  %split = shufflevector <64 x float> %a0, <64 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %split1 = shufflevector <64 x float> %a0, <64 x float> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %split2 = shufflevector <64 x float> %a0, <64 x float> poison, <8 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
  %split3 = shufflevector <64 x float> %a0, <64 x float> poison, <8 x i32> <i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  %split4 = shufflevector <64 x float> %a0, <64 x float> poison, <8 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39>
  %split5 = shufflevector <64 x float> %a0, <64 x float> poison, <8 x i32> <i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
  %split6 = shufflevector <64 x float> %a0, <64 x float> poison, <8 x i32> <i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55>
  %split7 = shufflevector <64 x float> %a0, <64 x float> poison, <8 x i32> <i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
  %splat.splat = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> zeroinitializer
  %0 = fmul <8 x float> %split, %splat.splat
  %splat.splat18 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %1 = fmul <8 x float> %split1, %splat.splat18
  %2 = fadd <8 x float> %0, %1
  %splat.splat21 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
  %3 = fmul <8 x float> %split2, %splat.splat21
  %4 = fadd <8 x float> %2, %3
  %splat.splat24 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
  %5 = fmul <8 x float> %split3, %splat.splat24
  %6 = fadd <8 x float> %4, %5
  %splat.splat27 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
  %7 = fmul <8 x float> %split4, %splat.splat27
  %8 = fadd <8 x float> %6, %7
  %splat.splat30 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
  %9 = fmul <8 x float> %split5, %splat.splat30
  %10 = fadd <8 x float> %8, %9
  %splat.splat33 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6>
  %11 = fmul <8 x float> %split6, %splat.splat33
  %12 = fadd <8 x float> %10, %11
  %splat.splat36 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
  %13 = fmul <8 x float> %split7, %splat.splat36
  %14 = fadd <8 x float> %12, %13
  %splat.splat39 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
  %15 = fmul <8 x float> %split, %splat.splat39
  %splat.splat42 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9>
  %16 = fmul <8 x float> %split1, %splat.splat42
  %17 = fadd <8 x float> %15, %16
  %splat.splat45 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10>
  %18 = fmul <8 x float> %split2, %splat.splat45
  %19 = fadd <8 x float> %17, %18
  %splat.splat48 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11>
  %20 = fmul <8 x float> %split3, %splat.splat48
  %21 = fadd <8 x float> %19, %20
  %splat.splat51 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12>
  %22 = fmul <8 x float> %split4, %splat.splat51
  %23 = fadd <8 x float> %21, %22
  %splat.splat54 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13>
  %24 = fmul <8 x float> %split5, %splat.splat54
  %25 = fadd <8 x float> %23, %24
  %splat.splat57 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14>
  %26 = fmul <8 x float> %split6, %splat.splat57
  %27 = fadd <8 x float> %25, %26
  %splat.splat60 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
  %28 = fmul <8 x float> %split7, %splat.splat60
  %29 = fadd <8 x float> %27, %28
  %splat.splat63 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %30 = fmul <8 x float> %split, %splat.splat63
  %splat.splat66 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17>
  %31 = fmul <8 x float> %split1, %splat.splat66
  %32 = fadd <8 x float> %30, %31
  %splat.splat69 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18>
  %33 = fmul <8 x float> %split2, %splat.splat69
  %34 = fadd <8 x float> %32, %33
  %splat.splat72 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19>
  %35 = fmul <8 x float> %split3, %splat.splat72
  %36 = fadd <8 x float> %34, %35
  %splat.splat75 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20>
  %37 = fmul <8 x float> %split4, %splat.splat75
  %38 = fadd <8 x float> %36, %37
  %splat.splat78 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21>
  %39 = fmul <8 x float> %split5, %splat.splat78
  %40 = fadd <8 x float> %38, %39
  %splat.splat81 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22>
  %41 = fmul <8 x float> %split6, %splat.splat81
  %42 = fadd <8 x float> %40, %41
  %splat.splat84 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23>
  %43 = fmul <8 x float> %split7, %splat.splat84
  %44 = fadd <8 x float> %42, %43
  %splat.splat87 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24>
  %45 = fmul <8 x float> %split, %splat.splat87
  %splat.splat90 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25>
  %46 = fmul <8 x float> %split1, %splat.splat90
  %47 = fadd <8 x float> %45, %46
  %splat.splat93 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26>
  %48 = fmul <8 x float> %split2, %splat.splat93
  %49 = fadd <8 x float> %47, %48
  %splat.splat96 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27>
  %50 = fmul <8 x float> %split3, %splat.splat96
  %51 = fadd <8 x float> %49, %50
  %splat.splat99 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28>
  %52 = fmul <8 x float> %split4, %splat.splat99
  %53 = fadd <8 x float> %51, %52
  %splat.splat102 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29>
  %54 = fmul <8 x float> %split5, %splat.splat102
  %55 = fadd <8 x float> %53, %54
  %splat.splat105 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30>
  %56 = fmul <8 x float> %split6, %splat.splat105
  %57 = fadd <8 x float> %55, %56
  %splat.splat108 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
  %58 = fmul <8 x float> %split7, %splat.splat108
  %59 = fadd <8 x float> %57, %58
  %splat.splat111 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32>
  %60 = fmul <8 x float> %split, %splat.splat111
  %splat.splat114 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33>
  %61 = fmul <8 x float> %split1, %splat.splat114
  %62 = fadd <8 x float> %60, %61
  %splat.splat117 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34>
  %63 = fmul <8 x float> %split2, %splat.splat117
  %64 = fadd <8 x float> %62, %63
  %splat.splat120 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35>
  %65 = fmul <8 x float> %split3, %splat.splat120
  %66 = fadd <8 x float> %64, %65
  %splat.splat123 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36>
  %67 = fmul <8 x float> %split4, %splat.splat123
  %68 = fadd <8 x float> %66, %67
  %splat.splat126 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37>
  %69 = fmul <8 x float> %split5, %splat.splat126
  %70 = fadd <8 x float> %68, %69
  %splat.splat129 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38>
  %71 = fmul <8 x float> %split6, %splat.splat129
  %72 = fadd <8 x float> %70, %71
  %splat.splat132 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39>
  %73 = fmul <8 x float> %split7, %splat.splat132
  %74 = fadd <8 x float> %72, %73
  %splat.splat135 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40>
  %75 = fmul <8 x float> %split, %splat.splat135
  %splat.splat138 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41>
  %76 = fmul <8 x float> %split1, %splat.splat138
  %77 = fadd <8 x float> %75, %76
  %splat.splat141 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42>
  %78 = fmul <8 x float> %split2, %splat.splat141
  %79 = fadd <8 x float> %77, %78
  %splat.splat144 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43>
  %80 = fmul <8 x float> %split3, %splat.splat144
  %81 = fadd <8 x float> %79, %80
  %splat.splat147 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44>
  %82 = fmul <8 x float> %split4, %splat.splat147
  %83 = fadd <8 x float> %81, %82
  %splat.splat150 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45>
  %84 = fmul <8 x float> %split5, %splat.splat150
  %85 = fadd <8 x float> %83, %84
  %splat.splat153 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46>
  %86 = fmul <8 x float> %split6, %splat.splat153
  %87 = fadd <8 x float> %85, %86
  %splat.splat156 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47>
  %88 = fmul <8 x float> %split7, %splat.splat156
  %89 = fadd <8 x float> %87, %88
  %splat.splat159 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48>
  %90 = fmul <8 x float> %split, %splat.splat159
  %splat.splat162 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49>
  %91 = fmul <8 x float> %split1, %splat.splat162
  %92 = fadd <8 x float> %90, %91
  %splat.splat165 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50>
  %93 = fmul <8 x float> %split2, %splat.splat165
  %94 = fadd <8 x float> %92, %93
  %splat.splat168 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51>
  %95 = fmul <8 x float> %split3, %splat.splat168
  %96 = fadd <8 x float> %94, %95
  %splat.splat171 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52>
  %97 = fmul <8 x float> %split4, %splat.splat171
  %98 = fadd <8 x float> %96, %97
  %splat.splat174 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53>
  %99 = fmul <8 x float> %split5, %splat.splat174
  %100 = fadd <8 x float> %98, %99
  %splat.splat177 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54>
  %101 = fmul <8 x float> %split6, %splat.splat177
  %102 = fadd <8 x float> %100, %101
  %splat.splat180 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55>
  %103 = fmul <8 x float> %split7, %splat.splat180
  %104 = fadd <8 x float> %102, %103
  %splat.splat183 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56>
  %105 = fmul <8 x float> %split, %splat.splat183
  %splat.splat186 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57>
  %106 = fmul <8 x float> %split1, %splat.splat186
  %107 = fadd <8 x float> %105, %106
  %splat.splat189 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58>
  %108 = fmul <8 x float> %split2, %splat.splat189
  %109 = fadd <8 x float> %107, %108
  %splat.splat192 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59>
  %110 = fmul <8 x float> %split3, %splat.splat192
  %111 = fadd <8 x float> %109, %110
  %splat.splat195 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60>
  %112 = fmul <8 x float> %split4, %splat.splat195
  %113 = fadd <8 x float> %111, %112
  %splat.splat198 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61>
  %114 = fmul <8 x float> %split5, %splat.splat198
  %115 = fadd <8 x float> %113, %114
  %splat.splat201 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62>
  %116 = fmul <8 x float> %split6, %splat.splat201
  %117 = fadd <8 x float> %115, %116
  %splat.splat204 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> <i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
  %118 = fmul <8 x float> %split7, %splat.splat204
  %119 = fadd <8 x float> %117, %118
  %120 = shufflevector <8 x float> %14, <8 x float> %29, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %121 = shufflevector <8 x float> %44, <8 x float> %59, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %122 = shufflevector <8 x float> %74, <8 x float> %89, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %123 = shufflevector <8 x float> %104, <8 x float> %119, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %124 = shufflevector <16 x float> %120, <16 x float> %121, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  %125 = shufflevector <16 x float> %122, <16 x float> %123, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  %126 = shufflevector <32 x float> %124, <32 x float> %125, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
  ret <64 x float> %126
}

define <64 x double> @test_mul8x8_f64(<64 x double> %a0, <64 x double> %a1) nounwind {
; SSE-LABEL: test_mul8x8_f64:
; SSE:       # %bb.0: # %entry
; SSE-NEXT:    subq $328, %rsp # imm = 0x148
; SSE-NEXT:    movapd %xmm7, %xmm15
; SSE-NEXT:    movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movq %rdi, %rax
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm14
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm11
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm13
; SSE-NEXT:    movapd %xmm13, %xmm12
; SSE-NEXT:    unpcklpd {{.*#+}} xmm12 = xmm12[0],xmm13[0]
; SSE-NEXT:    movapd %xmm3, %xmm10
; SSE-NEXT:    mulpd %xmm12, %xmm10
; SSE-NEXT:    movapd %xmm2, %xmm8
; SSE-NEXT:    mulpd %xmm12, %xmm8
; SSE-NEXT:    movapd %xmm1, %xmm9
; SSE-NEXT:    mulpd %xmm12, %xmm9
; SSE-NEXT:    mulpd %xmm0, %xmm12
; SSE-NEXT:    unpckhpd {{.*#+}} xmm13 = xmm13[1,1]
; SSE-NEXT:    movapd %xmm7, %xmm2
; SSE-NEXT:    mulpd %xmm13, %xmm2
; SSE-NEXT:    addpd %xmm10, %xmm2
; SSE-NEXT:    movapd %xmm6, %xmm7
; SSE-NEXT:    movapd %xmm6, %xmm10
; SSE-NEXT:    mulpd %xmm13, %xmm7
; SSE-NEXT:    addpd %xmm8, %xmm7
; SSE-NEXT:    movapd %xmm5, %xmm8
; SSE-NEXT:    mulpd %xmm13, %xmm8
; SSE-NEXT:    addpd %xmm9, %xmm8
; SSE-NEXT:    mulpd %xmm4, %xmm13
; SSE-NEXT:    addpd %xmm12, %xmm13
; SSE-NEXT:    movapd %xmm11, %xmm6
; SSE-NEXT:    unpcklpd {{.*#+}} xmm6 = xmm6[0],xmm11[0]
; SSE-NEXT:    movapd %xmm14, %xmm1
; SSE-NEXT:    mulpd %xmm6, %xmm1
; SSE-NEXT:    addpd %xmm13, %xmm1
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm3
; SSE-NEXT:    mulpd %xmm6, %xmm3
; SSE-NEXT:    addpd %xmm8, %xmm3
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm5
; SSE-NEXT:    mulpd %xmm6, %xmm5
; SSE-NEXT:    addpd %xmm7, %xmm5
; SSE-NEXT:    mulpd {{[0-9]+}}(%rsp), %xmm6
; SSE-NEXT:    addpd %xmm2, %xmm6
; SSE-NEXT:    unpckhpd {{.*#+}} xmm11 = xmm11[1,1]
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm2
; SSE-NEXT:    mulpd %xmm11, %xmm2
; SSE-NEXT:    addpd %xmm6, %xmm2
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm4
; SSE-NEXT:    mulpd %xmm11, %xmm4
; SSE-NEXT:    addpd %xmm5, %xmm4
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm5
; SSE-NEXT:    mulpd %xmm11, %xmm5
; SSE-NEXT:    addpd %xmm3, %xmm5
; SSE-NEXT:    mulpd {{[0-9]+}}(%rsp), %xmm11
; SSE-NEXT:    addpd %xmm1, %xmm11
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT:    movapd %xmm1, %xmm6
; SSE-NEXT:    unpcklpd {{.*#+}} xmm6 = xmm6[0],xmm1[0]
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm3
; SSE-NEXT:    mulpd %xmm6, %xmm3
; SSE-NEXT:    addpd %xmm11, %xmm3
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm7
; SSE-NEXT:    mulpd %xmm6, %xmm7
; SSE-NEXT:    addpd %xmm5, %xmm7
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm5
; SSE-NEXT:    mulpd %xmm6, %xmm5
; SSE-NEXT:    addpd %xmm4, %xmm5
; SSE-NEXT:    mulpd {{[0-9]+}}(%rsp), %xmm6
; SSE-NEXT:    addpd %xmm2, %xmm6
; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm0
; SSE-NEXT:    mulpd %xmm1, %xmm0
; SSE-NEXT:    addpd %xmm6, %xmm0
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm4
; SSE-NEXT:    mulpd %xmm1, %xmm4
; SSE-NEXT:    addpd %xmm5, %xmm4
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm5
; SSE-NEXT:    mulpd %xmm1, %xmm5
; SSE-NEXT:    addpd %xmm7, %xmm5
; SSE-NEXT:    mulpd {{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT:    addpd %xmm3, %xmm1
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm6
; SSE-NEXT:    movapd %xmm6, %xmm3
; SSE-NEXT:    unpcklpd {{.*#+}} xmm3 = xmm3[0],xmm6[0]
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm2
; SSE-NEXT:    mulpd %xmm3, %xmm2
; SSE-NEXT:    addpd %xmm1, %xmm2
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT:    mulpd %xmm3, %xmm1
; SSE-NEXT:    addpd %xmm5, %xmm1
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm5
; SSE-NEXT:    mulpd %xmm3, %xmm5
; SSE-NEXT:    addpd %xmm4, %xmm5
; SSE-NEXT:    mulpd {{[0-9]+}}(%rsp), %xmm3
; SSE-NEXT:    addpd %xmm0, %xmm3
; SSE-NEXT:    unpckhpd {{.*#+}} xmm6 = xmm6[1,1]
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm0
; SSE-NEXT:    mulpd %xmm6, %xmm0
; SSE-NEXT:    addpd %xmm3, %xmm0
; SSE-NEXT:    movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm0
; SSE-NEXT:    mulpd %xmm6, %xmm0
; SSE-NEXT:    addpd %xmm5, %xmm0
; SSE-NEXT:    movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm0
; SSE-NEXT:    mulpd %xmm6, %xmm0
; SSE-NEXT:    addpd %xmm1, %xmm0
; SSE-NEXT:    movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    mulpd {{[0-9]+}}(%rsp), %xmm6
; SSE-NEXT:    addpd %xmm2, %xmm6
; SSE-NEXT:    movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT:    movapd %xmm1, %xmm0
; SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT:    movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
; SSE-NEXT:    movapd %xmm11, %xmm3
; SSE-NEXT:    mulpd %xmm0, %xmm3
; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
; SSE-NEXT:    movapd %xmm15, %xmm8
; SSE-NEXT:    movapd %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movapd %xmm15, %xmm2
; SSE-NEXT:    mulpd %xmm1, %xmm2
; SSE-NEXT:    addpd %xmm3, %xmm2
; SSE-NEXT:    movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
; SSE-NEXT:    movapd %xmm9, %xmm3
; SSE-NEXT:    mulpd %xmm0, %xmm3
; SSE-NEXT:    movapd %xmm10, %xmm15
; SSE-NEXT:    movapd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movapd %xmm10, %xmm4
; SSE-NEXT:    mulpd %xmm1, %xmm4
; SSE-NEXT:    addpd %xmm3, %xmm4
; SSE-NEXT:    movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
; SSE-NEXT:    movapd %xmm13, %xmm3
; SSE-NEXT:    mulpd %xmm0, %xmm3
; SSE-NEXT:    movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
; SSE-NEXT:    movapd %xmm10, %xmm5
; SSE-NEXT:    mulpd %xmm1, %xmm5
; SSE-NEXT:    addpd %xmm3, %xmm5
; SSE-NEXT:    movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
; SSE-NEXT:    mulpd %xmm12, %xmm0
; SSE-NEXT:    movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
; SSE-NEXT:    mulpd %xmm14, %xmm1
; SSE-NEXT:    addpd %xmm0, %xmm1
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm0
; SSE-NEXT:    movapd %xmm0, %xmm6
; SSE-NEXT:    unpcklpd {{.*#+}} xmm6 = xmm6[0],xmm0[0]
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm3
; SSE-NEXT:    mulpd %xmm6, %xmm3
; SSE-NEXT:    addpd %xmm1, %xmm3
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT:    mulpd %xmm6, %xmm1
; SSE-NEXT:    addpd %xmm5, %xmm1
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm5
; SSE-NEXT:    mulpd %xmm6, %xmm5
; SSE-NEXT:    addpd %xmm4, %xmm5
; SSE-NEXT:    mulpd {{[0-9]+}}(%rsp), %xmm6
; SSE-NEXT:    addpd %xmm2, %xmm6
; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1,1]
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm2
; SSE-NEXT:    mulpd %xmm0, %xmm2
; SSE-NEXT:    addpd %xmm6, %xmm2
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm4
; SSE-NEXT:    mulpd %xmm0, %xmm4
; SSE-NEXT:    addpd %xmm5, %xmm4
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm5
; SSE-NEXT:    mulpd %xmm0, %xmm5
; SSE-NEXT:    addpd %xmm1, %xmm5
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT:    mulpd %xmm1, %xmm0
; SSE-NEXT:    addpd %xmm3, %xmm0
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT:    movapd %xmm1, %xmm6
; SSE-NEXT:    unpcklpd {{.*#+}} xmm6 = xmm6[0],xmm1[0]
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm3
; SSE-NEXT:    mulpd %xmm6, %xmm3
; SSE-NEXT:    addpd %xmm0, %xmm3
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm7
; SSE-NEXT:    mulpd %xmm6, %xmm7
; SSE-NEXT:    addpd %xmm5, %xmm7
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm5
; SSE-NEXT:    mulpd %xmm6, %xmm5
; SSE-NEXT:    addpd %xmm4, %xmm5
; SSE-NEXT:    mulpd {{[0-9]+}}(%rsp), %xmm6
; SSE-NEXT:    addpd %xmm2, %xmm6
; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm0
; SSE-NEXT:    mulpd %xmm1, %xmm0
; SSE-NEXT:    addpd %xmm6, %xmm0
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm4
; SSE-NEXT:    mulpd %xmm1, %xmm4
; SSE-NEXT:    addpd %xmm5, %xmm4
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm5
; SSE-NEXT:    mulpd %xmm1, %xmm5
; SSE-NEXT:    addpd %xmm7, %xmm5
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm2
; SSE-NEXT:    mulpd %xmm2, %xmm1
; SSE-NEXT:    addpd %xmm3, %xmm1
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm6
; SSE-NEXT:    movapd %xmm6, %xmm3
; SSE-NEXT:    unpcklpd {{.*#+}} xmm3 = xmm3[0],xmm6[0]
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm2
; SSE-NEXT:    mulpd %xmm3, %xmm2
; SSE-NEXT:    addpd %xmm1, %xmm2
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT:    mulpd %xmm3, %xmm1
; SSE-NEXT:    addpd %xmm5, %xmm1
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm5
; SSE-NEXT:    mulpd %xmm3, %xmm5
; SSE-NEXT:    addpd %xmm4, %xmm5
; SSE-NEXT:    mulpd {{[0-9]+}}(%rsp), %xmm3
; SSE-NEXT:    addpd %xmm0, %xmm3
; SSE-NEXT:    unpckhpd {{.*#+}} xmm6 = xmm6[1,1]
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm0
; SSE-NEXT:    mulpd %xmm6, %xmm0
; SSE-NEXT:    addpd %xmm3, %xmm0
; SSE-NEXT:    movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm0
; SSE-NEXT:    mulpd %xmm6, %xmm0
; SSE-NEXT:    addpd %xmm5, %xmm0
; SSE-NEXT:    movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm0
; SSE-NEXT:    mulpd %xmm6, %xmm0
; SSE-NEXT:    addpd %xmm1, %xmm0
; SSE-NEXT:    movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    mulpd {{[0-9]+}}(%rsp), %xmm6
; SSE-NEXT:    addpd %xmm2, %xmm6
; SSE-NEXT:    movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT:    movapd %xmm1, %xmm0
; SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT:    movapd %xmm11, %xmm3
; SSE-NEXT:    mulpd %xmm0, %xmm3
; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
; SSE-NEXT:    movapd %xmm8, %xmm2
; SSE-NEXT:    mulpd %xmm1, %xmm2
; SSE-NEXT:    addpd %xmm3, %xmm2
; SSE-NEXT:    movapd %xmm9, %xmm3
; SSE-NEXT:    mulpd %xmm0, %xmm3
; SSE-NEXT:    movapd %xmm15, %xmm4
; SSE-NEXT:    mulpd %xmm1, %xmm4
; SSE-NEXT:    addpd %xmm3, %xmm4
; SSE-NEXT:    movapd %xmm13, %xmm8
; SSE-NEXT:    movapd %xmm13, %xmm3
; SSE-NEXT:    mulpd %xmm0, %xmm3
; SSE-NEXT:    movapd %xmm10, %xmm5
; SSE-NEXT:    movapd %xmm10, %xmm15
; SSE-NEXT:    mulpd %xmm1, %xmm5
; SSE-NEXT:    addpd %xmm3, %xmm5
; SSE-NEXT:    movapd %xmm12, %xmm10
; SSE-NEXT:    mulpd %xmm12, %xmm0
; SSE-NEXT:    movapd %xmm14, %xmm9
; SSE-NEXT:    mulpd %xmm14, %xmm1
; SSE-NEXT:    addpd %xmm0, %xmm1
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm0
; SSE-NEXT:    movapd %xmm0, %xmm6
; SSE-NEXT:    unpcklpd {{.*#+}} xmm6 = xmm6[0],xmm0[0]
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm3
; SSE-NEXT:    mulpd %xmm6, %xmm3
; SSE-NEXT:    addpd %xmm1, %xmm3
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT:    mulpd %xmm6, %xmm1
; SSE-NEXT:    addpd %xmm5, %xmm1
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm5
; SSE-NEXT:    mulpd %xmm6, %xmm5
; SSE-NEXT:    addpd %xmm4, %xmm5
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm4
; SSE-NEXT:    mulpd %xmm4, %xmm6
; SSE-NEXT:    addpd %xmm2, %xmm6
; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1,1]
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm2
; SSE-NEXT:    mulpd %xmm0, %xmm2
; SSE-NEXT:    addpd %xmm6, %xmm2
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm4
; SSE-NEXT:    mulpd %xmm0, %xmm4
; SSE-NEXT:    addpd %xmm5, %xmm4
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm5
; SSE-NEXT:    mulpd %xmm0, %xmm5
; SSE-NEXT:    addpd %xmm1, %xmm5
; SSE-NEXT:    mulpd {{[0-9]+}}(%rsp), %xmm0
; SSE-NEXT:    addpd %xmm3, %xmm0
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT:    movapd %xmm1, %xmm6
; SSE-NEXT:    unpcklpd {{.*#+}} xmm6 = xmm6[0],xmm1[0]
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm3
; SSE-NEXT:    mulpd %xmm6, %xmm3
; SSE-NEXT:    addpd %xmm0, %xmm3
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm7
; SSE-NEXT:    mulpd %xmm6, %xmm7
; SSE-NEXT:    addpd %xmm5, %xmm7
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm5
; SSE-NEXT:    mulpd %xmm6, %xmm5
; SSE-NEXT:    addpd %xmm4, %xmm5
; SSE-NEXT:    mulpd {{[0-9]+}}(%rsp), %xmm6
; SSE-NEXT:    addpd %xmm2, %xmm6
; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm0
; SSE-NEXT:    mulpd %xmm1, %xmm0
; SSE-NEXT:    addpd %xmm6, %xmm0
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm4
; SSE-NEXT:    mulpd %xmm1, %xmm4
; SSE-NEXT:    addpd %xmm5, %xmm4
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm5
; SSE-NEXT:    mulpd %xmm1, %xmm5
; SSE-NEXT:    addpd %xmm7, %xmm5
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm2
; SSE-NEXT:    mulpd %xmm2, %xmm1
; SSE-NEXT:    addpd %xmm3, %xmm1
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm7
; SSE-NEXT:    movapd %xmm7, %xmm3
; SSE-NEXT:    unpcklpd {{.*#+}} xmm3 = xmm3[0],xmm7[0]
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm2
; SSE-NEXT:    mulpd %xmm3, %xmm2
; SSE-NEXT:    addpd %xmm1, %xmm2
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT:    mulpd %xmm3, %xmm1
; SSE-NEXT:    addpd %xmm5, %xmm1
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm5
; SSE-NEXT:    mulpd %xmm3, %xmm5
; SSE-NEXT:    addpd %xmm4, %xmm5
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm4
; SSE-NEXT:    mulpd %xmm4, %xmm3
; SSE-NEXT:    addpd %xmm0, %xmm3
; SSE-NEXT:    unpckhpd {{.*#+}} xmm7 = xmm7[1,1]
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm0
; SSE-NEXT:    mulpd %xmm7, %xmm0
; SSE-NEXT:    addpd %xmm3, %xmm0
; SSE-NEXT:    movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm0
; SSE-NEXT:    mulpd %xmm7, %xmm0
; SSE-NEXT:    addpd %xmm5, %xmm0
; SSE-NEXT:    movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm0
; SSE-NEXT:    mulpd %xmm7, %xmm0
; SSE-NEXT:    addpd %xmm1, %xmm0
; SSE-NEXT:    movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm0
; SSE-NEXT:    mulpd %xmm0, %xmm7
; SSE-NEXT:    addpd %xmm2, %xmm7
; SSE-NEXT:    movapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT:    movapd %xmm1, %xmm0
; SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT:    movapd %xmm11, %xmm3
; SSE-NEXT:    movapd %xmm11, %xmm12
; SSE-NEXT:    mulpd %xmm0, %xmm3
; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
; SSE-NEXT:    movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
; SSE-NEXT:    movapd %xmm6, %xmm2
; SSE-NEXT:    mulpd %xmm1, %xmm2
; SSE-NEXT:    addpd %xmm3, %xmm2
; SSE-NEXT:    movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
; SSE-NEXT:    movapd %xmm11, %xmm3
; SSE-NEXT:    mulpd %xmm0, %xmm3
; SSE-NEXT:    movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
; SSE-NEXT:    movapd %xmm13, %xmm4
; SSE-NEXT:    mulpd %xmm1, %xmm4
; SSE-NEXT:    addpd %xmm3, %xmm4
; SSE-NEXT:    movapd %xmm8, %xmm3
; SSE-NEXT:    movapd %xmm8, %xmm14
; SSE-NEXT:    mulpd %xmm0, %xmm3
; SSE-NEXT:    movapd %xmm15, %xmm8
; SSE-NEXT:    movapd %xmm15, %xmm5
; SSE-NEXT:    mulpd %xmm1, %xmm5
; SSE-NEXT:    addpd %xmm3, %xmm5
; SSE-NEXT:    mulpd %xmm10, %xmm0
; SSE-NEXT:    mulpd %xmm9, %xmm1
; SSE-NEXT:    movapd %xmm9, %xmm10
; SSE-NEXT:    addpd %xmm0, %xmm1
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm0
; SSE-NEXT:    movapd %xmm0, %xmm7
; SSE-NEXT:    unpcklpd {{.*#+}} xmm7 = xmm7[0],xmm0[0]
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm3
; SSE-NEXT:    mulpd %xmm7, %xmm3
; SSE-NEXT:    addpd %xmm1, %xmm3
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT:    mulpd %xmm7, %xmm1
; SSE-NEXT:    addpd %xmm5, %xmm1
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm5
; SSE-NEXT:    mulpd %xmm7, %xmm5
; SSE-NEXT:    addpd %xmm4, %xmm5
; SSE-NEXT:    mulpd {{[0-9]+}}(%rsp), %xmm7
; SSE-NEXT:    addpd %xmm2, %xmm7
; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1,1]
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm2
; SSE-NEXT:    mulpd %xmm0, %xmm2
; SSE-NEXT:    addpd %xmm7, %xmm2
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm4
; SSE-NEXT:    mulpd %xmm0, %xmm4
; SSE-NEXT:    addpd %xmm5, %xmm4
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm5
; SSE-NEXT:    mulpd %xmm0, %xmm5
; SSE-NEXT:    addpd %xmm1, %xmm5
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT:    mulpd %xmm1, %xmm0
; SSE-NEXT:    addpd %xmm3, %xmm0
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT:    movapd %xmm1, %xmm7
; SSE-NEXT:    unpcklpd {{.*#+}} xmm7 = xmm7[0],xmm1[0]
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm3
; SSE-NEXT:    mulpd %xmm7, %xmm3
; SSE-NEXT:    addpd %xmm0, %xmm3
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm9
; SSE-NEXT:    mulpd %xmm7, %xmm9
; SSE-NEXT:    addpd %xmm5, %xmm9
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm5
; SSE-NEXT:    mulpd %xmm7, %xmm5
; SSE-NEXT:    addpd %xmm4, %xmm5
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm0
; SSE-NEXT:    mulpd %xmm0, %xmm7
; SSE-NEXT:    addpd %xmm2, %xmm7
; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm0
; SSE-NEXT:    mulpd %xmm1, %xmm0
; SSE-NEXT:    addpd %xmm7, %xmm0
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm4
; SSE-NEXT:    mulpd %xmm1, %xmm4
; SSE-NEXT:    addpd %xmm5, %xmm4
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm7
; SSE-NEXT:    mulpd %xmm1, %xmm7
; SSE-NEXT:    addpd %xmm9, %xmm7
; SSE-NEXT:    mulpd {{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT:    addpd %xmm3, %xmm1
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm15
; SSE-NEXT:    movapd %xmm15, %xmm3
; SSE-NEXT:    unpcklpd {{.*#+}} xmm3 = xmm3[0],xmm15[0]
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm2
; SSE-NEXT:    mulpd %xmm3, %xmm2
; SSE-NEXT:    addpd %xmm1, %xmm2
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT:    mulpd %xmm3, %xmm1
; SSE-NEXT:    addpd %xmm7, %xmm1
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm7
; SSE-NEXT:    mulpd %xmm3, %xmm7
; SSE-NEXT:    addpd %xmm4, %xmm7
; SSE-NEXT:    mulpd {{[0-9]+}}(%rsp), %xmm3
; SSE-NEXT:    addpd %xmm0, %xmm3
; SSE-NEXT:    unpckhpd {{.*#+}} xmm15 = xmm15[1,1]
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm0
; SSE-NEXT:    mulpd %xmm15, %xmm0
; SSE-NEXT:    addpd %xmm3, %xmm0
; SSE-NEXT:    movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm0
; SSE-NEXT:    mulpd %xmm15, %xmm0
; SSE-NEXT:    addpd %xmm7, %xmm0
; SSE-NEXT:    movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm0
; SSE-NEXT:    mulpd %xmm15, %xmm0
; SSE-NEXT:    addpd %xmm1, %xmm0
; SSE-NEXT:    movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    mulpd {{[0-9]+}}(%rsp), %xmm15
; SSE-NEXT:    addpd %xmm2, %xmm15
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT:    movapd %xmm1, %xmm0
; SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT:    movapd %xmm12, %xmm3
; SSE-NEXT:    mulpd %xmm0, %xmm3
; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
; SSE-NEXT:    movapd %xmm6, %xmm2
; SSE-NEXT:    movapd %xmm6, %xmm12
; SSE-NEXT:    mulpd %xmm1, %xmm2
; SSE-NEXT:    addpd %xmm3, %xmm2
; SSE-NEXT:    mulpd %xmm0, %xmm11
; SSE-NEXT:    movapd %xmm13, %xmm6
; SSE-NEXT:    movapd %xmm13, %xmm4
; SSE-NEXT:    mulpd %xmm1, %xmm4
; SSE-NEXT:    addpd %xmm11, %xmm4
; SSE-NEXT:    mulpd %xmm0, %xmm14
; SSE-NEXT:    movapd %xmm8, %xmm7
; SSE-NEXT:    mulpd %xmm1, %xmm7
; SSE-NEXT:    addpd %xmm14, %xmm7
; SSE-NEXT:    movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
; SSE-NEXT:    mulpd %xmm8, %xmm0
; SSE-NEXT:    movapd %xmm10, %xmm5
; SSE-NEXT:    mulpd %xmm10, %xmm1
; SSE-NEXT:    addpd %xmm0, %xmm1
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm0
; SSE-NEXT:    movapd %xmm0, %xmm9
; SSE-NEXT:    unpcklpd {{.*#+}} xmm9 = xmm9[0],xmm0[0]
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm3
; SSE-NEXT:    mulpd %xmm9, %xmm3
; SSE-NEXT:    addpd %xmm1, %xmm3
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT:    mulpd %xmm9, %xmm1
; SSE-NEXT:    addpd %xmm7, %xmm1
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm7
; SSE-NEXT:    mulpd %xmm9, %xmm7
; SSE-NEXT:    addpd %xmm4, %xmm7
; SSE-NEXT:    mulpd {{[0-9]+}}(%rsp), %xmm9
; SSE-NEXT:    addpd %xmm2, %xmm9
; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1,1]
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm2
; SSE-NEXT:    mulpd %xmm0, %xmm2
; SSE-NEXT:    addpd %xmm9, %xmm2
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm4
; SSE-NEXT:    mulpd %xmm0, %xmm4
; SSE-NEXT:    addpd %xmm7, %xmm4
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm7
; SSE-NEXT:    mulpd %xmm0, %xmm7
; SSE-NEXT:    addpd %xmm1, %xmm7
; SSE-NEXT:    mulpd {{[0-9]+}}(%rsp), %xmm0
; SSE-NEXT:    addpd %xmm3, %xmm0
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT:    movapd %xmm1, %xmm9
; SSE-NEXT:    unpcklpd {{.*#+}} xmm9 = xmm9[0],xmm1[0]
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm3
; SSE-NEXT:    mulpd %xmm9, %xmm3
; SSE-NEXT:    addpd %xmm0, %xmm3
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm10
; SSE-NEXT:    mulpd %xmm9, %xmm10
; SSE-NEXT:    addpd %xmm7, %xmm10
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm7
; SSE-NEXT:    mulpd %xmm9, %xmm7
; SSE-NEXT:    addpd %xmm4, %xmm7
; SSE-NEXT:    mulpd {{[0-9]+}}(%rsp), %xmm9
; SSE-NEXT:    addpd %xmm2, %xmm9
; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm0
; SSE-NEXT:    mulpd %xmm1, %xmm0
; SSE-NEXT:    addpd %xmm9, %xmm0
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm9
; SSE-NEXT:    mulpd %xmm1, %xmm9
; SSE-NEXT:    addpd %xmm7, %xmm9
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm7
; SSE-NEXT:    mulpd %xmm1, %xmm7
; SSE-NEXT:    addpd %xmm10, %xmm7
; SSE-NEXT:    mulpd {{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT:    addpd %xmm3, %xmm1
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm11
; SSE-NEXT:    movapd %xmm11, %xmm3
; SSE-NEXT:    unpcklpd {{.*#+}} xmm3 = xmm3[0],xmm11[0]
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm2
; SSE-NEXT:    mulpd %xmm3, %xmm2
; SSE-NEXT:    addpd %xmm1, %xmm2
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT:    mulpd %xmm3, %xmm1
; SSE-NEXT:    addpd %xmm7, %xmm1
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm7
; SSE-NEXT:    mulpd %xmm3, %xmm7
; SSE-NEXT:    addpd %xmm9, %xmm7
; SSE-NEXT:    mulpd {{[0-9]+}}(%rsp), %xmm3
; SSE-NEXT:    addpd %xmm0, %xmm3
; SSE-NEXT:    unpckhpd {{.*#+}} xmm11 = xmm11[1,1]
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm0
; SSE-NEXT:    mulpd %xmm11, %xmm0
; SSE-NEXT:    addpd %xmm3, %xmm0
; SSE-NEXT:    movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm0
; SSE-NEXT:    mulpd %xmm11, %xmm0
; SSE-NEXT:    addpd %xmm7, %xmm0
; SSE-NEXT:    movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm0
; SSE-NEXT:    mulpd %xmm11, %xmm0
; SSE-NEXT:    addpd %xmm1, %xmm0
; SSE-NEXT:    movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    mulpd {{[0-9]+}}(%rsp), %xmm11
; SSE-NEXT:    addpd %xmm2, %xmm11
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT:    movapd %xmm1, %xmm0
; SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT:    movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
; SSE-NEXT:    movapd %xmm13, %xmm3
; SSE-NEXT:    mulpd %xmm0, %xmm3
; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
; SSE-NEXT:    movapd %xmm12, %xmm2
; SSE-NEXT:    mulpd %xmm1, %xmm2
; SSE-NEXT:    addpd %xmm3, %xmm2
; SSE-NEXT:    movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
; SSE-NEXT:    movapd %xmm14, %xmm3
; SSE-NEXT:    mulpd %xmm0, %xmm3
; SSE-NEXT:    movapd %xmm6, %xmm7
; SSE-NEXT:    mulpd %xmm1, %xmm7
; SSE-NEXT:    addpd %xmm3, %xmm7
; SSE-NEXT:    movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
; SSE-NEXT:    movapd %xmm4, %xmm3
; SSE-NEXT:    mulpd %xmm0, %xmm3
; SSE-NEXT:    movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
; SSE-NEXT:    movapd %xmm6, %xmm9
; SSE-NEXT:    mulpd %xmm1, %xmm9
; SSE-NEXT:    addpd %xmm3, %xmm9
; SSE-NEXT:    mulpd %xmm8, %xmm0
; SSE-NEXT:    mulpd %xmm5, %xmm1
; SSE-NEXT:    addpd %xmm0, %xmm1
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm0
; SSE-NEXT:    movapd %xmm0, %xmm10
; SSE-NEXT:    unpcklpd {{.*#+}} xmm10 = xmm10[0],xmm0[0]
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm3
; SSE-NEXT:    mulpd %xmm10, %xmm3
; SSE-NEXT:    addpd %xmm1, %xmm3
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm12
; SSE-NEXT:    mulpd %xmm10, %xmm12
; SSE-NEXT:    addpd %xmm9, %xmm12
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm9
; SSE-NEXT:    mulpd %xmm10, %xmm9
; SSE-NEXT:    addpd %xmm7, %xmm9
; SSE-NEXT:    mulpd {{[0-9]+}}(%rsp), %xmm10
; SSE-NEXT:    addpd %xmm2, %xmm10
; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1,1]
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT:    mulpd %xmm0, %xmm1
; SSE-NEXT:    addpd %xmm10, %xmm1
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm10
; SSE-NEXT:    mulpd %xmm0, %xmm10
; SSE-NEXT:    addpd %xmm9, %xmm10
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm9
; SSE-NEXT:    mulpd %xmm0, %xmm9
; SSE-NEXT:    addpd %xmm12, %xmm9
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm2
; SSE-NEXT:    mulpd %xmm2, %xmm0
; SSE-NEXT:    addpd %xmm3, %xmm0
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm7
; SSE-NEXT:    movapd %xmm7, %xmm3
; SSE-NEXT:    unpcklpd {{.*#+}} xmm3 = xmm3[0],xmm7[0]
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm2
; SSE-NEXT:    mulpd %xmm3, %xmm2
; SSE-NEXT:    addpd %xmm0, %xmm2
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm12
; SSE-NEXT:    mulpd %xmm3, %xmm12
; SSE-NEXT:    addpd %xmm9, %xmm12
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm9
; SSE-NEXT:    mulpd %xmm3, %xmm9
; SSE-NEXT:    addpd %xmm10, %xmm9
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm0
; SSE-NEXT:    mulpd %xmm0, %xmm3
; SSE-NEXT:    addpd %xmm1, %xmm3
; SSE-NEXT:    unpckhpd {{.*#+}} xmm7 = xmm7[1,1]
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm0
; SSE-NEXT:    mulpd %xmm7, %xmm0
; SSE-NEXT:    addpd %xmm3, %xmm0
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm10
; SSE-NEXT:    mulpd %xmm7, %xmm10
; SSE-NEXT:    addpd %xmm9, %xmm10
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm9
; SSE-NEXT:    mulpd %xmm7, %xmm9
; SSE-NEXT:    addpd %xmm12, %xmm9
; SSE-NEXT:    mulpd {{[0-9]+}}(%rsp), %xmm7
; SSE-NEXT:    addpd %xmm2, %xmm7
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm8
; SSE-NEXT:    movapd %xmm8, %xmm2
; SSE-NEXT:    unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm8[0]
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT:    mulpd %xmm2, %xmm1
; SSE-NEXT:    addpd %xmm7, %xmm1
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm12
; SSE-NEXT:    mulpd %xmm2, %xmm12
; SSE-NEXT:    addpd %xmm9, %xmm12
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm7
; SSE-NEXT:    mulpd %xmm2, %xmm7
; SSE-NEXT:    addpd %xmm10, %xmm7
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm3
; SSE-NEXT:    mulpd %xmm3, %xmm2
; SSE-NEXT:    addpd %xmm0, %xmm2
; SSE-NEXT:    unpckhpd {{.*#+}} xmm8 = xmm8[1,1]
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm0
; SSE-NEXT:    mulpd %xmm8, %xmm0
; SSE-NEXT:    addpd %xmm2, %xmm0
; SSE-NEXT:    movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm0
; SSE-NEXT:    mulpd %xmm8, %xmm0
; SSE-NEXT:    addpd %xmm7, %xmm0
; SSE-NEXT:    movapd %xmm0, (%rsp) # 16-byte Spill
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm9
; SSE-NEXT:    mulpd %xmm8, %xmm9
; SSE-NEXT:    addpd %xmm12, %xmm9
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm0
; SSE-NEXT:    mulpd %xmm0, %xmm8
; SSE-NEXT:    addpd %xmm1, %xmm8
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT:    movapd %xmm1, %xmm0
; SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT:    movapd %xmm13, %xmm12
; SSE-NEXT:    mulpd %xmm0, %xmm12
; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
; SSE-NEXT:    movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
; SSE-NEXT:    mulpd %xmm1, %xmm3
; SSE-NEXT:    addpd %xmm12, %xmm3
; SSE-NEXT:    movapd %xmm14, %xmm12
; SSE-NEXT:    movapd %xmm14, %xmm5
; SSE-NEXT:    mulpd %xmm0, %xmm12
; SSE-NEXT:    movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
; SSE-NEXT:    mulpd %xmm1, %xmm13
; SSE-NEXT:    addpd %xmm12, %xmm13
; SSE-NEXT:    mulpd %xmm0, %xmm4
; SSE-NEXT:    movapd %xmm6, %xmm14
; SSE-NEXT:    mulpd %xmm1, %xmm14
; SSE-NEXT:    addpd %xmm4, %xmm14
; SSE-NEXT:    movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
; SSE-NEXT:    mulpd %xmm6, %xmm0
; SSE-NEXT:    movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
; SSE-NEXT:    mulpd %xmm10, %xmm1
; SSE-NEXT:    addpd %xmm0, %xmm1
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm2
; SSE-NEXT:    movapd %xmm2, %xmm0
; SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm12
; SSE-NEXT:    mulpd %xmm0, %xmm12
; SSE-NEXT:    addpd %xmm1, %xmm12
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT:    mulpd %xmm0, %xmm1
; SSE-NEXT:    addpd %xmm14, %xmm1
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm14
; SSE-NEXT:    mulpd %xmm0, %xmm14
; SSE-NEXT:    addpd %xmm13, %xmm14
; SSE-NEXT:    mulpd {{[0-9]+}}(%rsp), %xmm0
; SSE-NEXT:    addpd %xmm3, %xmm0
; SSE-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1,1]
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm13
; SSE-NEXT:    mulpd %xmm2, %xmm13
; SSE-NEXT:    addpd %xmm0, %xmm13
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm0
; SSE-NEXT:    mulpd %xmm2, %xmm0
; SSE-NEXT:    addpd %xmm14, %xmm0
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm14
; SSE-NEXT:    mulpd %xmm2, %xmm14
; SSE-NEXT:    addpd %xmm1, %xmm14
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT:    mulpd %xmm1, %xmm2
; SSE-NEXT:    addpd %xmm12, %xmm2
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm12
; SSE-NEXT:    movapd %xmm12, %xmm1
; SSE-NEXT:    unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm12[0]
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm3
; SSE-NEXT:    mulpd %xmm1, %xmm3
; SSE-NEXT:    addpd %xmm2, %xmm3
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm2
; SSE-NEXT:    mulpd %xmm1, %xmm2
; SSE-NEXT:    addpd %xmm14, %xmm2
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm14
; SSE-NEXT:    mulpd %xmm1, %xmm14
; SSE-NEXT:    addpd %xmm0, %xmm14
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm0
; SSE-NEXT:    mulpd %xmm0, %xmm1
; SSE-NEXT:    addpd %xmm13, %xmm1
; SSE-NEXT:    unpckhpd {{.*#+}} xmm12 = xmm12[1,1]
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm4
; SSE-NEXT:    mulpd %xmm12, %xmm4
; SSE-NEXT:    addpd %xmm1, %xmm4
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm13
; SSE-NEXT:    mulpd %xmm12, %xmm13
; SSE-NEXT:    addpd %xmm14, %xmm13
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm14
; SSE-NEXT:    mulpd %xmm12, %xmm14
; SSE-NEXT:    addpd %xmm2, %xmm14
; SSE-NEXT:    mulpd {{[0-9]+}}(%rsp), %xmm12
; SSE-NEXT:    addpd %xmm3, %xmm12
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm2
; SSE-NEXT:    movapd %xmm2, %xmm3
; SSE-NEXT:    unpcklpd {{.*#+}} xmm3 = xmm3[0],xmm2[0]
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT:    mulpd %xmm3, %xmm1
; SSE-NEXT:    addpd %xmm12, %xmm1
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm12
; SSE-NEXT:    mulpd %xmm3, %xmm12
; SSE-NEXT:    addpd %xmm14, %xmm12
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm0
; SSE-NEXT:    mulpd %xmm3, %xmm0
; SSE-NEXT:    addpd %xmm13, %xmm0
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm7
; SSE-NEXT:    mulpd %xmm7, %xmm3
; SSE-NEXT:    addpd %xmm4, %xmm3
; SSE-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1,1]
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm14
; SSE-NEXT:    mulpd %xmm2, %xmm14
; SSE-NEXT:    addpd %xmm3, %xmm14
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm13
; SSE-NEXT:    mulpd %xmm2, %xmm13
; SSE-NEXT:    addpd %xmm0, %xmm13
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm7
; SSE-NEXT:    mulpd %xmm2, %xmm7
; SSE-NEXT:    addpd %xmm12, %xmm7
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm0
; SSE-NEXT:    mulpd %xmm0, %xmm2
; SSE-NEXT:    addpd %xmm1, %xmm2
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT:    movapd %xmm1, %xmm0
; SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT:    movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
; SSE-NEXT:    mulpd %xmm0, %xmm12
; SSE-NEXT:    mulpd %xmm0, %xmm5
; SSE-NEXT:    movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
; SSE-NEXT:    mulpd %xmm0, %xmm3
; SSE-NEXT:    mulpd %xmm6, %xmm0
; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
; SSE-NEXT:    movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
; SSE-NEXT:    mulpd %xmm1, %xmm4
; SSE-NEXT:    addpd %xmm12, %xmm4
; SSE-NEXT:    movapd %xmm4, %xmm12
; SSE-NEXT:    movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
; SSE-NEXT:    mulpd %xmm1, %xmm4
; SSE-NEXT:    addpd %xmm5, %xmm4
; SSE-NEXT:    movapd %xmm4, %xmm5
; SSE-NEXT:    movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
; SSE-NEXT:    mulpd %xmm1, %xmm4
; SSE-NEXT:    addpd %xmm3, %xmm4
; SSE-NEXT:    movapd %xmm4, %xmm3
; SSE-NEXT:    mulpd %xmm10, %xmm1
; SSE-NEXT:    addpd %xmm0, %xmm1
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm0
; SSE-NEXT:    movapd %xmm0, %xmm4
; SSE-NEXT:    unpcklpd {{.*#+}} xmm4 = xmm4[0],xmm0[0]
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm10
; SSE-NEXT:    mulpd %xmm4, %xmm10
; SSE-NEXT:    addpd %xmm1, %xmm10
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT:    mulpd %xmm4, %xmm1
; SSE-NEXT:    addpd %xmm3, %xmm1
; SSE-NEXT:    movapd %xmm1, %xmm3
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT:    mulpd %xmm4, %xmm1
; SSE-NEXT:    addpd %xmm5, %xmm1
; SSE-NEXT:    movapd %xmm1, %xmm5
; SSE-NEXT:    mulpd {{[0-9]+}}(%rsp), %xmm4
; SSE-NEXT:    addpd %xmm12, %xmm4
; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1,1]
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT:    mulpd %xmm0, %xmm1
; SSE-NEXT:    addpd %xmm4, %xmm1
; SSE-NEXT:    movapd %xmm1, %xmm12
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm6
; SSE-NEXT:    mulpd %xmm0, %xmm6
; SSE-NEXT:    addpd %xmm5, %xmm6
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT:    mulpd %xmm0, %xmm1
; SSE-NEXT:    addpd %xmm3, %xmm1
; SSE-NEXT:    movapd %xmm1, %xmm3
; SSE-NEXT:    mulpd {{[0-9]+}}(%rsp), %xmm0
; SSE-NEXT:    addpd %xmm10, %xmm0
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT:    movapd %xmm1, %xmm4
; SSE-NEXT:    unpcklpd {{.*#+}} xmm4 = xmm4[0],xmm1[0]
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm5
; SSE-NEXT:    mulpd %xmm4, %xmm5
; SSE-NEXT:    addpd %xmm0, %xmm5
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm0
; SSE-NEXT:    mulpd %xmm4, %xmm0
; SSE-NEXT:    addpd %xmm3, %xmm0
; SSE-NEXT:    movapd %xmm0, %xmm10
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm0
; SSE-NEXT:    mulpd %xmm4, %xmm0
; SSE-NEXT:    addpd %xmm6, %xmm0
; SSE-NEXT:    movapd %xmm0, %xmm6
; SSE-NEXT:    mulpd {{[0-9]+}}(%rsp), %xmm4
; SSE-NEXT:    addpd %xmm12, %xmm4
; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm0
; SSE-NEXT:    mulpd %xmm1, %xmm0
; SSE-NEXT:    addpd %xmm4, %xmm0
; SSE-NEXT:    movapd %xmm0, %xmm3
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm0
; SSE-NEXT:    mulpd %xmm1, %xmm0
; SSE-NEXT:    addpd %xmm6, %xmm0
; SSE-NEXT:    movapd %xmm0, %xmm6
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm0
; SSE-NEXT:    mulpd %xmm1, %xmm0
; SSE-NEXT:    addpd %xmm10, %xmm0
; SSE-NEXT:    movapd %xmm0, %xmm10
; SSE-NEXT:    mulpd {{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT:    addpd %xmm5, %xmm1
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm0
; SSE-NEXT:    movapd %xmm0, %xmm4
; SSE-NEXT:    unpcklpd {{.*#+}} xmm4 = xmm4[0],xmm0[0]
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm5
; SSE-NEXT:    mulpd %xmm4, %xmm5
; SSE-NEXT:    addpd %xmm1, %xmm5
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT:    mulpd %xmm4, %xmm1
; SSE-NEXT:    addpd %xmm10, %xmm1
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm10
; SSE-NEXT:    mulpd %xmm4, %xmm10
; SSE-NEXT:    addpd %xmm6, %xmm10
; SSE-NEXT:    mulpd {{[0-9]+}}(%rsp), %xmm4
; SSE-NEXT:    addpd %xmm3, %xmm4
; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1,1]
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm3
; SSE-NEXT:    mulpd %xmm0, %xmm3
; SSE-NEXT:    addpd %xmm4, %xmm3
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm4
; SSE-NEXT:    mulpd %xmm0, %xmm4
; SSE-NEXT:    addpd %xmm10, %xmm4
; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm6
; SSE-NEXT:    mulpd %xmm0, %xmm6
; SSE-NEXT:    addpd %xmm1, %xmm6
; SSE-NEXT:    mulpd {{[0-9]+}}(%rsp), %xmm0
; SSE-NEXT:    addpd %xmm5, %xmm0
; SSE-NEXT:    movapd %xmm3, 496(%rdi)
; SSE-NEXT:    movapd %xmm4, 480(%rdi)
; SSE-NEXT:    movapd %xmm6, 464(%rdi)
; SSE-NEXT:    movapd %xmm0, 448(%rdi)
; SSE-NEXT:    movapd %xmm14, 432(%rdi)
; SSE-NEXT:    movapd %xmm13, 416(%rdi)
; SSE-NEXT:    movapd %xmm7, 400(%rdi)
; SSE-NEXT:    movapd %xmm2, 384(%rdi)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, 368(%rdi)
; SSE-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, 352(%rdi)
; SSE-NEXT:    movapd %xmm9, 336(%rdi)
; SSE-NEXT:    movapd %xmm8, 320(%rdi)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, 304(%rdi)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, 288(%rdi)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, 272(%rdi)
; SSE-NEXT:    movapd %xmm11, 256(%rdi)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, 240(%rdi)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, 224(%rdi)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, 208(%rdi)
; SSE-NEXT:    movapd %xmm15, 192(%rdi)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, 176(%rdi)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, 160(%rdi)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, 144(%rdi)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, 128(%rdi)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, 112(%rdi)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, 96(%rdi)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, 80(%rdi)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, 64(%rdi)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, 48(%rdi)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, 32(%rdi)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, 16(%rdi)
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps %xmm0, (%rdi)
; SSE-NEXT:    addq $328, %rsp # imm = 0x148
; SSE-NEXT:    retq
;
; AVX1-LABEL: test_mul8x8_f64:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    pushq %rbp
; AVX1-NEXT:    movq %rsp, %rbp
; AVX1-NEXT:    andq $-32, %rsp
; AVX1-NEXT:    subq $448, %rsp # imm = 0x1C0
; AVX1-NEXT:    vmovapd %ymm2, %ymm12
; AVX1-NEXT:    vmovapd %ymm0, (%rsp) # 32-byte Spill
; AVX1-NEXT:    movq %rdi, %rax
; AVX1-NEXT:    vmovapd 144(%rbp), %ymm2
; AVX1-NEXT:    vmovapd 112(%rbp), %ymm13
; AVX1-NEXT:    vbroadcastsd 272(%rbp), %ymm10
; AVX1-NEXT:    vmulpd %ymm1, %ymm10, %ymm8
; AVX1-NEXT:    vmovapd %ymm1, %ymm9
; AVX1-NEXT:    vmulpd %ymm0, %ymm10, %ymm0
; AVX1-NEXT:    vbroadcastsd 280(%rbp), %ymm10
; AVX1-NEXT:    vmulpd %ymm3, %ymm10, %ymm11
; AVX1-NEXT:    vaddpd %ymm11, %ymm8, %ymm1
; AVX1-NEXT:    vmulpd %ymm10, %ymm12, %ymm10
; AVX1-NEXT:    vaddpd %ymm0, %ymm10, %ymm0
; AVX1-NEXT:    vbroadcastsd 288(%rbp), %ymm10
; AVX1-NEXT:    vmulpd %ymm4, %ymm10, %ymm11
; AVX1-NEXT:    vaddpd %ymm0, %ymm11, %ymm0
; AVX1-NEXT:    vmulpd %ymm5, %ymm10, %ymm10
; AVX1-NEXT:    vaddpd %ymm1, %ymm10, %ymm1
; AVX1-NEXT:    vbroadcastsd 296(%rbp), %ymm10
; AVX1-NEXT:    vmulpd %ymm7, %ymm10, %ymm11
; AVX1-NEXT:    vaddpd %ymm1, %ymm11, %ymm1
; AVX1-NEXT:    vmulpd %ymm6, %ymm10, %ymm10
; AVX1-NEXT:    vaddpd %ymm0, %ymm10, %ymm0
; AVX1-NEXT:    vbroadcastsd 304(%rbp), %ymm10
; AVX1-NEXT:    vmulpd 16(%rbp), %ymm10, %ymm11
; AVX1-NEXT:    vaddpd %ymm0, %ymm11, %ymm0
; AVX1-NEXT:    vmulpd 48(%rbp), %ymm10, %ymm10
; AVX1-NEXT:    vaddpd %ymm1, %ymm10, %ymm1
; AVX1-NEXT:    vbroadcastsd 312(%rbp), %ymm10
; AVX1-NEXT:    vmulpd %ymm10, %ymm13, %ymm11
; AVX1-NEXT:    vmovapd %ymm13, %ymm14
; AVX1-NEXT:    vaddpd %ymm1, %ymm11, %ymm1
; AVX1-NEXT:    vmulpd 80(%rbp), %ymm10, %ymm10
; AVX1-NEXT:    vaddpd %ymm0, %ymm10, %ymm0
; AVX1-NEXT:    vbroadcastsd 320(%rbp), %ymm10
; AVX1-NEXT:    vmulpd %ymm2, %ymm10, %ymm11
; AVX1-NEXT:    vmovapd %ymm2, %ymm13
; AVX1-NEXT:    vaddpd %ymm0, %ymm11, %ymm0
; AVX1-NEXT:    vmulpd 176(%rbp), %ymm10, %ymm10
; AVX1-NEXT:    vaddpd %ymm1, %ymm10, %ymm1
; AVX1-NEXT:    vbroadcastsd 328(%rbp), %ymm10
; AVX1-NEXT:    vmulpd 240(%rbp), %ymm10, %ymm11
; AVX1-NEXT:    vaddpd %ymm1, %ymm11, %ymm1
; AVX1-NEXT:    vmovapd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-NEXT:    vmulpd 208(%rbp), %ymm10, %ymm1
; AVX1-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; AVX1-NEXT:    vmovapd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-NEXT:    vbroadcastsd 336(%rbp), %ymm0
; AVX1-NEXT:    vmulpd %ymm0, %ymm9, %ymm1
; AVX1-NEXT:    vbroadcastsd 344(%rbp), %ymm10
; AVX1-NEXT:    vmulpd %ymm3, %ymm10, %ymm11
; AVX1-NEXT:    vmovapd %ymm3, %ymm8
; AVX1-NEXT:    vaddpd %ymm1, %ymm11, %ymm1
; AVX1-NEXT:    vmovapd (%rsp), %ymm15 # 32-byte Reload
; AVX1-NEXT:    vmulpd %ymm0, %ymm15, %ymm0
; AVX1-NEXT:    vmulpd %ymm10, %ymm12, %ymm10
; AVX1-NEXT:    vaddpd %ymm0, %ymm10, %ymm0
; AVX1-NEXT:    vbroadcastsd 352(%rbp), %ymm10
; AVX1-NEXT:    vmulpd %ymm4, %ymm10, %ymm11
; AVX1-NEXT:    vaddpd %ymm0, %ymm11, %ymm0
; AVX1-NEXT:    vmulpd %ymm5, %ymm10, %ymm10
; AVX1-NEXT:    vmovapd %ymm5, %ymm3
; AVX1-NEXT:    vaddpd %ymm1, %ymm10, %ymm1
; AVX1-NEXT:    vbroadcastsd 360(%rbp), %ymm10
; AVX1-NEXT:    vmulpd %ymm7, %ymm10, %ymm11
; AVX1-NEXT:    vaddpd %ymm1, %ymm11, %ymm1
; AVX1-NEXT:    vmulpd %ymm6, %ymm10, %ymm10
; AVX1-NEXT:    vaddpd %ymm0, %ymm10, %ymm0
; AVX1-NEXT:    vbroadcastsd 368(%rbp), %ymm10
; AVX1-NEXT:    vmovapd 16(%rbp), %ymm2
; AVX1-NEXT:    vmulpd %ymm2, %ymm10, %ymm11
; AVX1-NEXT:    vaddpd %ymm0, %ymm11, %ymm0
; AVX1-NEXT:    vmulpd 48(%rbp), %ymm10, %ymm10
; AVX1-NEXT:    vaddpd %ymm1, %ymm10, %ymm1
; AVX1-NEXT:    vbroadcastsd 376(%rbp), %ymm10
; AVX1-NEXT:    vmulpd %ymm10, %ymm14, %ymm11
; AVX1-NEXT:    vaddpd %ymm1, %ymm11, %ymm1
; AVX1-NEXT:    vmovapd 80(%rbp), %ymm2
; AVX1-NEXT:    vmulpd %ymm2, %ymm10, %ymm10
; AVX1-NEXT:    vaddpd %ymm0, %ymm10, %ymm0
; AVX1-NEXT:    vbroadcastsd 384(%rbp), %ymm10
; AVX1-NEXT:    vmulpd %ymm10, %ymm13, %ymm11
; AVX1-NEXT:    vaddpd %ymm0, %ymm11, %ymm0
; AVX1-NEXT:    vmovapd 176(%rbp), %ymm14
; AVX1-NEXT:    vmulpd %ymm10, %ymm14, %ymm10
; AVX1-NEXT:    vaddpd %ymm1, %ymm10, %ymm1
; AVX1-NEXT:    vbroadcastsd 392(%rbp), %ymm10
; AVX1-NEXT:    vmovapd 240(%rbp), %ymm2
; AVX1-NEXT:    vmulpd %ymm2, %ymm10, %ymm11
; AVX1-NEXT:    vaddpd %ymm1, %ymm11, %ymm1
; AVX1-NEXT:    vmovapd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-NEXT:    vmulpd 208(%rbp), %ymm10, %ymm1
; AVX1-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; AVX1-NEXT:    vmovapd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-NEXT:    vbroadcastsd 400(%rbp), %ymm0
; AVX1-NEXT:    vmulpd %ymm0, %ymm9, %ymm1
; AVX1-NEXT:    vbroadcastsd 408(%rbp), %ymm10
; AVX1-NEXT:    vmovapd %ymm8, %ymm5
; AVX1-NEXT:    vmulpd %ymm10, %ymm8, %ymm11
; AVX1-NEXT:    vaddpd %ymm1, %ymm11, %ymm1
; AVX1-NEXT:    vmulpd %ymm0, %ymm15, %ymm0
; AVX1-NEXT:    vmulpd %ymm10, %ymm12, %ymm10
; AVX1-NEXT:    vaddpd %ymm0, %ymm10, %ymm0
; AVX1-NEXT:    vbroadcastsd 416(%rbp), %ymm10
; AVX1-NEXT:    vmulpd %ymm4, %ymm10, %ymm11
; AVX1-NEXT:    vaddpd %ymm0, %ymm11, %ymm0
; AVX1-NEXT:    vmulpd %ymm3, %ymm10, %ymm10
; AVX1-NEXT:    vmovapd %ymm3, %ymm2
; AVX1-NEXT:    vaddpd %ymm1, %ymm10, %ymm1
; AVX1-NEXT:    vbroadcastsd 424(%rbp), %ymm10
; AVX1-NEXT:    vmulpd %ymm7, %ymm10, %ymm11
; AVX1-NEXT:    vaddpd %ymm1, %ymm11, %ymm1
; AVX1-NEXT:    vmulpd %ymm6, %ymm10, %ymm10
; AVX1-NEXT:    vaddpd %ymm0, %ymm10, %ymm0
; AVX1-NEXT:    vbroadcastsd 432(%rbp), %ymm10
; AVX1-NEXT:    vmulpd 16(%rbp), %ymm10, %ymm11
; AVX1-NEXT:    vaddpd %ymm0, %ymm11, %ymm0
; AVX1-NEXT:    vmulpd 48(%rbp), %ymm10, %ymm10
; AVX1-NEXT:    vaddpd %ymm1, %ymm10, %ymm1
; AVX1-NEXT:    vbroadcastsd 440(%rbp), %ymm10
; AVX1-NEXT:    vmulpd 112(%rbp), %ymm10, %ymm11
; AVX1-NEXT:    vaddpd %ymm1, %ymm11, %ymm1
; AVX1-NEXT:    vmulpd 80(%rbp), %ymm10, %ymm10
; AVX1-NEXT:    vaddpd %ymm0, %ymm10, %ymm0
; AVX1-NEXT:    vbroadcastsd 448(%rbp), %ymm10
; AVX1-NEXT:    vmulpd %ymm10, %ymm13, %ymm11
; AVX1-NEXT:    vaddpd %ymm0, %ymm11, %ymm0
; AVX1-NEXT:    vmulpd %ymm10, %ymm14, %ymm10
; AVX1-NEXT:    vaddpd %ymm1, %ymm10, %ymm1
; AVX1-NEXT:    vbroadcastsd 456(%rbp), %ymm10
; AVX1-NEXT:    vmulpd 240(%rbp), %ymm10, %ymm11
; AVX1-NEXT:    vaddpd %ymm1, %ymm11, %ymm1
; AVX1-NEXT:    vmovapd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-NEXT:    vmulpd 208(%rbp), %ymm10, %ymm1
; AVX1-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; AVX1-NEXT:    vmovapd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-NEXT:    vbroadcastsd 464(%rbp), %ymm0
; AVX1-NEXT:    vmulpd %ymm0, %ymm9, %ymm1
; AVX1-NEXT:    vmovapd %ymm9, %ymm13
; AVX1-NEXT:    vbroadcastsd 472(%rbp), %ymm10
; AVX1-NEXT:    vmulpd %ymm10, %ymm8, %ymm11
; AVX1-NEXT:    vaddpd %ymm1, %ymm11, %ymm1
; AVX1-NEXT:    vmulpd %ymm0, %ymm15, %ymm0
; AVX1-NEXT:    vmovapd %ymm15, %ymm9
; AVX1-NEXT:    vmulpd %ymm10, %ymm12, %ymm10
; AVX1-NEXT:    vaddpd %ymm0, %ymm10, %ymm0
; AVX1-NEXT:    vbroadcastsd 480(%rbp), %ymm10
; AVX1-NEXT:    vmulpd %ymm4, %ymm10, %ymm11
; AVX1-NEXT:    vmovapd %ymm4, %ymm3
; AVX1-NEXT:    vaddpd %ymm0, %ymm11, %ymm0
; AVX1-NEXT:    vmovapd %ymm2, %ymm15
; AVX1-NEXT:    vmulpd %ymm2, %ymm10, %ymm10
; AVX1-NEXT:    vaddpd %ymm1, %ymm10, %ymm1
; AVX1-NEXT:    vbroadcastsd 488(%rbp), %ymm10
; AVX1-NEXT:    vmovapd %ymm7, %ymm8
; AVX1-NEXT:    vmulpd %ymm7, %ymm10, %ymm11
; AVX1-NEXT:    vaddpd %ymm1, %ymm11, %ymm1
; AVX1-NEXT:    vmovapd %ymm6, %ymm7
; AVX1-NEXT:    vmulpd %ymm6, %ymm10, %ymm10
; AVX1-NEXT:    vaddpd %ymm0, %ymm10, %ymm0
; AVX1-NEXT:    vbroadcastsd 496(%rbp), %ymm10
; AVX1-NEXT:    vmulpd 16(%rbp), %ymm10, %ymm11
; AVX1-NEXT:    vaddpd %ymm0, %ymm11, %ymm0
; AVX1-NEXT:    vmovapd 48(%rbp), %ymm4
; AVX1-NEXT:    vmulpd %ymm4, %ymm10, %ymm10
; AVX1-NEXT:    vaddpd %ymm1, %ymm10, %ymm1
; AVX1-NEXT:    vbroadcastsd 504(%rbp), %ymm10
; AVX1-NEXT:    vmovapd 112(%rbp), %ymm2
; AVX1-NEXT:    vmulpd %ymm2, %ymm10, %ymm11
; AVX1-NEXT:    vaddpd %ymm1, %ymm11, %ymm1
; AVX1-NEXT:    vmovapd 80(%rbp), %ymm14
; AVX1-NEXT:    vmulpd %ymm10, %ymm14, %ymm10
; AVX1-NEXT:    vaddpd %ymm0, %ymm10, %ymm0
; AVX1-NEXT:    vbroadcastsd 512(%rbp), %ymm10
; AVX1-NEXT:    vmulpd 144(%rbp), %ymm10, %ymm11
; AVX1-NEXT:    vaddpd %ymm0, %ymm11, %ymm0
; AVX1-NEXT:    vmovapd 176(%rbp), %ymm2
; AVX1-NEXT:    vmulpd %ymm2, %ymm10, %ymm10
; AVX1-NEXT:    vaddpd %ymm1, %ymm10, %ymm1
; AVX1-NEXT:    vbroadcastsd 520(%rbp), %ymm10
; AVX1-NEXT:    vmulpd 240(%rbp), %ymm10, %ymm11
; AVX1-NEXT:    vaddpd %ymm1, %ymm11, %ymm1
; AVX1-NEXT:    vmovapd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-NEXT:    vmulpd 208(%rbp), %ymm10, %ymm1
; AVX1-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; AVX1-NEXT:    vmovapd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-NEXT:    vbroadcastsd 528(%rbp), %ymm0
; AVX1-NEXT:    vmulpd %ymm0, %ymm13, %ymm1
; AVX1-NEXT:    vbroadcastsd 536(%rbp), %ymm10
; AVX1-NEXT:    vmulpd %ymm5, %ymm10, %ymm11
; AVX1-NEXT:    vmovapd %ymm5, %ymm6
; AVX1-NEXT:    vaddpd %ymm1, %ymm11, %ymm1
; AVX1-NEXT:    vmulpd %ymm0, %ymm9, %ymm0
; AVX1-NEXT:    vmulpd %ymm10, %ymm12, %ymm10
; AVX1-NEXT:    vmovapd %ymm12, %ymm5
; AVX1-NEXT:    vaddpd %ymm0, %ymm10, %ymm0
; AVX1-NEXT:    vbroadcastsd 544(%rbp), %ymm10
; AVX1-NEXT:    vmulpd %ymm3, %ymm10, %ymm11
; AVX1-NEXT:    vmovapd %ymm3, %ymm12
; AVX1-NEXT:    vaddpd %ymm0, %ymm11, %ymm0
; AVX1-NEXT:    vmulpd %ymm10, %ymm15, %ymm10
; AVX1-NEXT:    vaddpd %ymm1, %ymm10, %ymm1
; AVX1-NEXT:    vbroadcastsd 552(%rbp), %ymm10
; AVX1-NEXT:    vmulpd %ymm10, %ymm8, %ymm11
; AVX1-NEXT:    vaddpd %ymm1, %ymm11, %ymm1
; AVX1-NEXT:    vmulpd %ymm7, %ymm10, %ymm10
; AVX1-NEXT:    vaddpd %ymm0, %ymm10, %ymm0
; AVX1-NEXT:    vbroadcastsd 560(%rbp), %ymm10
; AVX1-NEXT:    vmulpd 16(%rbp), %ymm10, %ymm11
; AVX1-NEXT:    vaddpd %ymm0, %ymm11, %ymm0
; AVX1-NEXT:    vmulpd %ymm4, %ymm10, %ymm10
; AVX1-NEXT:    vmovapd %ymm4, %ymm3
; AVX1-NEXT:    vaddpd %ymm1, %ymm10, %ymm1
; AVX1-NEXT:    vbroadcastsd 568(%rbp), %ymm10
; AVX1-NEXT:    vmulpd 112(%rbp), %ymm10, %ymm11
; AVX1-NEXT:    vaddpd %ymm1, %ymm11, %ymm1
; AVX1-NEXT:    vmulpd %ymm10, %ymm14, %ymm10
; AVX1-NEXT:    vaddpd %ymm0, %ymm10, %ymm0
; AVX1-NEXT:    vbroadcastsd 576(%rbp), %ymm10
; AVX1-NEXT:    vmovapd 144(%rbp), %ymm4
; AVX1-NEXT:    vmulpd %ymm4, %ymm10, %ymm11
; AVX1-NEXT:    vaddpd %ymm0, %ymm11, %ymm0
; AVX1-NEXT:    vmulpd %ymm2, %ymm10, %ymm10
; AVX1-NEXT:    vaddpd %ymm1, %ymm10, %ymm1
; AVX1-NEXT:    vbroadcastsd 584(%rbp), %ymm10
; AVX1-NEXT:    vmovapd 240(%rbp), %ymm14
; AVX1-NEXT:    vmulpd %ymm10, %ymm14, %ymm11
; AVX1-NEXT:    vaddpd %ymm1, %ymm11, %ymm1
; AVX1-NEXT:    vmovapd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-NEXT:    vmovapd 208(%rbp), %ymm2
; AVX1-NEXT:    vmulpd %ymm2, %ymm10, %ymm1
; AVX1-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; AVX1-NEXT:    vmovapd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-NEXT:    vbroadcastsd 592(%rbp), %ymm0
; AVX1-NEXT:    vmulpd %ymm0, %ymm13, %ymm1
; AVX1-NEXT:    vbroadcastsd 600(%rbp), %ymm10
; AVX1-NEXT:    vmulpd %ymm6, %ymm10, %ymm11
; AVX1-NEXT:    vaddpd %ymm1, %ymm11, %ymm1
; AVX1-NEXT:    vmulpd %ymm0, %ymm9, %ymm0
; AVX1-NEXT:    vmulpd %ymm5, %ymm10, %ymm10
; AVX1-NEXT:    vaddpd %ymm0, %ymm10, %ymm0
; AVX1-NEXT:    vbroadcastsd 608(%rbp), %ymm10
; AVX1-NEXT:    vmulpd %ymm10, %ymm12, %ymm11
; AVX1-NEXT:    vaddpd %ymm0, %ymm11, %ymm0
; AVX1-NEXT:    vmulpd %ymm10, %ymm15, %ymm10
; AVX1-NEXT:    vaddpd %ymm1, %ymm10, %ymm1
; AVX1-NEXT:    vbroadcastsd 616(%rbp), %ymm10
; AVX1-NEXT:    vmulpd %ymm10, %ymm8, %ymm11
; AVX1-NEXT:    vaddpd %ymm1, %ymm11, %ymm1
; AVX1-NEXT:    vmulpd %ymm7, %ymm10, %ymm10
; AVX1-NEXT:    vaddpd %ymm0, %ymm10, %ymm0
; AVX1-NEXT:    vbroadcastsd 624(%rbp), %ymm10
; AVX1-NEXT:    vmulpd 16(%rbp), %ymm10, %ymm11
; AVX1-NEXT:    vaddpd %ymm0, %ymm11, %ymm0
; AVX1-NEXT:    vmulpd %ymm3, %ymm10, %ymm10
; AVX1-NEXT:    vaddpd %ymm1, %ymm10, %ymm1
; AVX1-NEXT:    vbroadcastsd 632(%rbp), %ymm10
; AVX1-NEXT:    vmovapd 112(%rbp), %ymm3
; AVX1-NEXT:    vmulpd %ymm3, %ymm10, %ymm11
; AVX1-NEXT:    vaddpd %ymm1, %ymm11, %ymm1
; AVX1-NEXT:    vmovapd 80(%rbp), %ymm3
; AVX1-NEXT:    vmulpd %ymm3, %ymm10, %ymm10
; AVX1-NEXT:    vaddpd %ymm0, %ymm10, %ymm0
; AVX1-NEXT:    vbroadcastsd 640(%rbp), %ymm10
; AVX1-NEXT:    vmulpd %ymm4, %ymm10, %ymm11
; AVX1-NEXT:    vaddpd %ymm0, %ymm11, %ymm0
; AVX1-NEXT:    vmovapd 176(%rbp), %ymm3
; AVX1-NEXT:    vmulpd %ymm3, %ymm10, %ymm10
; AVX1-NEXT:    vaddpd %ymm1, %ymm10, %ymm1
; AVX1-NEXT:    vbroadcastsd 648(%rbp), %ymm10
; AVX1-NEXT:    vmovapd %ymm14, %ymm4
; AVX1-NEXT:    vmulpd %ymm10, %ymm14, %ymm11
; AVX1-NEXT:    vaddpd %ymm1, %ymm11, %ymm1
; AVX1-NEXT:    vmovapd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-NEXT:    vmulpd %ymm2, %ymm10, %ymm1
; AVX1-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; AVX1-NEXT:    vmovapd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-NEXT:    vbroadcastsd 656(%rbp), %ymm2
; AVX1-NEXT:    vmovapd %ymm13, %ymm3
; AVX1-NEXT:    vmulpd %ymm2, %ymm13, %ymm1
; AVX1-NEXT:    vbroadcastsd 664(%rbp), %ymm0
; AVX1-NEXT:    vmulpd %ymm0, %ymm6, %ymm14
; AVX1-NEXT:    vmovapd %ymm6, %ymm10
; AVX1-NEXT:    vaddpd %ymm1, %ymm14, %ymm1
; AVX1-NEXT:    vmulpd %ymm2, %ymm9, %ymm2
; AVX1-NEXT:    vmulpd %ymm0, %ymm5, %ymm0
; AVX1-NEXT:    vmovapd %ymm5, %ymm6
; AVX1-NEXT:    vaddpd %ymm0, %ymm2, %ymm0
; AVX1-NEXT:    vbroadcastsd 672(%rbp), %ymm2
; AVX1-NEXT:    vmulpd %ymm2, %ymm12, %ymm14
; AVX1-NEXT:    vaddpd %ymm0, %ymm14, %ymm0
; AVX1-NEXT:    vmulpd %ymm2, %ymm15, %ymm2
; AVX1-NEXT:    vaddpd %ymm2, %ymm1, %ymm1
; AVX1-NEXT:    vbroadcastsd 680(%rbp), %ymm2
; AVX1-NEXT:    vmulpd %ymm2, %ymm8, %ymm14
; AVX1-NEXT:    vaddpd %ymm1, %ymm14, %ymm1
; AVX1-NEXT:    vmulpd %ymm2, %ymm7, %ymm2
; AVX1-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
; AVX1-NEXT:    vbroadcastsd 688(%rbp), %ymm2
; AVX1-NEXT:    vmovapd 16(%rbp), %ymm11
; AVX1-NEXT:    vmulpd %ymm2, %ymm11, %ymm14
; AVX1-NEXT:    vaddpd %ymm0, %ymm14, %ymm0
; AVX1-NEXT:    vmulpd 48(%rbp), %ymm2, %ymm2
; AVX1-NEXT:    vaddpd %ymm2, %ymm1, %ymm1
; AVX1-NEXT:    vbroadcastsd 696(%rbp), %ymm2
; AVX1-NEXT:    vmovapd 112(%rbp), %ymm5
; AVX1-NEXT:    vmulpd %ymm2, %ymm5, %ymm14
; AVX1-NEXT:    vaddpd %ymm1, %ymm14, %ymm1
; AVX1-NEXT:    vmovapd 80(%rbp), %ymm5
; AVX1-NEXT:    vmulpd %ymm2, %ymm5, %ymm2
; AVX1-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
; AVX1-NEXT:    vbroadcastsd 704(%rbp), %ymm2
; AVX1-NEXT:    vmulpd 144(%rbp), %ymm2, %ymm14
; AVX1-NEXT:    vaddpd %ymm0, %ymm14, %ymm0
; AVX1-NEXT:    vmovapd 176(%rbp), %ymm13
; AVX1-NEXT:    vmulpd %ymm2, %ymm13, %ymm2
; AVX1-NEXT:    vaddpd %ymm2, %ymm1, %ymm1
; AVX1-NEXT:    vbroadcastsd 712(%rbp), %ymm2
; AVX1-NEXT:    vmulpd %ymm2, %ymm4, %ymm14
; AVX1-NEXT:    vaddpd %ymm1, %ymm14, %ymm1
; AVX1-NEXT:    vmovapd 208(%rbp), %ymm14
; AVX1-NEXT:    vmulpd %ymm2, %ymm14, %ymm2
; AVX1-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
; AVX1-NEXT:    vbroadcastsd 720(%rbp), %ymm2
; AVX1-NEXT:    vmulpd %ymm2, %ymm3, %ymm3
; AVX1-NEXT:    vmulpd %ymm2, %ymm9, %ymm2
; AVX1-NEXT:    vbroadcastsd 728(%rbp), %ymm4
; AVX1-NEXT:    vmulpd %ymm4, %ymm10, %ymm5
; AVX1-NEXT:    vaddpd %ymm5, %ymm3, %ymm3
; AVX1-NEXT:    vmulpd %ymm4, %ymm6, %ymm4
; AVX1-NEXT:    vaddpd %ymm4, %ymm2, %ymm2
; AVX1-NEXT:    vbroadcastsd 736(%rbp), %ymm4
; AVX1-NEXT:    vmulpd %ymm4, %ymm12, %ymm5
; AVX1-NEXT:    vaddpd %ymm5, %ymm2, %ymm2
; AVX1-NEXT:    vmulpd %ymm4, %ymm15, %ymm4
; AVX1-NEXT:    vaddpd %ymm4, %ymm3, %ymm3
; AVX1-NEXT:    vbroadcastsd 744(%rbp), %ymm4
; AVX1-NEXT:    vmulpd %ymm4, %ymm8, %ymm5
; AVX1-NEXT:    vaddpd %ymm5, %ymm3, %ymm3
; AVX1-NEXT:    vmulpd %ymm4, %ymm7, %ymm4
; AVX1-NEXT:    vaddpd %ymm4, %ymm2, %ymm2
; AVX1-NEXT:    vbroadcastsd 752(%rbp), %ymm4
; AVX1-NEXT:    vmulpd %ymm4, %ymm11, %ymm5
; AVX1-NEXT:    vaddpd %ymm5, %ymm2, %ymm2
; AVX1-NEXT:    vmulpd 48(%rbp), %ymm4, %ymm4
; AVX1-NEXT:    vaddpd %ymm4, %ymm3, %ymm3
; AVX1-NEXT:    vbroadcastsd 760(%rbp), %ymm4
; AVX1-NEXT:    vmulpd 112(%rbp), %ymm4, %ymm5
; AVX1-NEXT:    vaddpd %ymm5, %ymm3, %ymm3
; AVX1-NEXT:    vmulpd 80(%rbp), %ymm4, %ymm4
; AVX1-NEXT:    vaddpd %ymm4, %ymm2, %ymm2
; AVX1-NEXT:    vbroadcastsd 768(%rbp), %ymm4
; AVX1-NEXT:    vmulpd 144(%rbp), %ymm4, %ymm5
; AVX1-NEXT:    vaddpd %ymm5, %ymm2, %ymm2
; AVX1-NEXT:    vmulpd %ymm4, %ymm13, %ymm4
; AVX1-NEXT:    vaddpd %ymm4, %ymm3, %ymm3
; AVX1-NEXT:    vbroadcastsd 776(%rbp), %ymm4
; AVX1-NEXT:    vmulpd 240(%rbp), %ymm4, %ymm5
; AVX1-NEXT:    vaddpd %ymm5, %ymm3, %ymm3
; AVX1-NEXT:    vmulpd %ymm4, %ymm14, %ymm4
; AVX1-NEXT:    vaddpd %ymm4, %ymm2, %ymm2
; AVX1-NEXT:    vmovapd %ymm3, 480(%rdi)
; AVX1-NEXT:    vmovapd %ymm2, 448(%rdi)
; AVX1-NEXT:    vmovapd %ymm1, 416(%rdi)
; AVX1-NEXT:    vmovapd %ymm0, 384(%rdi)
; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-NEXT:    vmovaps %ymm0, 352(%rdi)
; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-NEXT:    vmovaps %ymm0, 320(%rdi)
; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-NEXT:    vmovaps %ymm0, 288(%rdi)
; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-NEXT:    vmovaps %ymm0, 256(%rdi)
; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-NEXT:    vmovaps %ymm0, 224(%rdi)
; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-NEXT:    vmovaps %ymm0, 192(%rdi)
; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-NEXT:    vmovaps %ymm0, 160(%rdi)
; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-NEXT:    vmovaps %ymm0, 128(%rdi)
; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-NEXT:    vmovaps %ymm0, 96(%rdi)
; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-NEXT:    vmovaps %ymm0, 64(%rdi)
; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-NEXT:    vmovaps %ymm0, 32(%rdi)
; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-NEXT:    vmovaps %ymm0, (%rdi)
; AVX1-NEXT:    movq %rbp, %rsp
; AVX1-NEXT:    popq %rbp
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_mul8x8_f64:
; AVX2:       # %bb.0: # %entry
; AVX2-NEXT:    pushq %rbp
; AVX2-NEXT:    movq %rsp, %rbp
; AVX2-NEXT:    andq $-32, %rsp
; AVX2-NEXT:    subq $448, %rsp # imm = 0x1C0
; AVX2-NEXT:    vmovapd %ymm2, %ymm12
; AVX2-NEXT:    vmovapd %ymm0, (%rsp) # 32-byte Spill
; AVX2-NEXT:    movq %rdi, %rax
; AVX2-NEXT:    vmovapd 144(%rbp), %ymm2
; AVX2-NEXT:    vmovapd 112(%rbp), %ymm13
; AVX2-NEXT:    vbroadcastsd 272(%rbp), %ymm10
; AVX2-NEXT:    vmulpd %ymm1, %ymm10, %ymm8
; AVX2-NEXT:    vmovapd %ymm1, %ymm9
; AVX2-NEXT:    vmulpd %ymm0, %ymm10, %ymm0
; AVX2-NEXT:    vbroadcastsd 280(%rbp), %ymm10
; AVX2-NEXT:    vmulpd %ymm3, %ymm10, %ymm11
; AVX2-NEXT:    vaddpd %ymm11, %ymm8, %ymm1
; AVX2-NEXT:    vmulpd %ymm10, %ymm12, %ymm10
; AVX2-NEXT:    vaddpd %ymm0, %ymm10, %ymm0
; AVX2-NEXT:    vbroadcastsd 288(%rbp), %ymm10
; AVX2-NEXT:    vmulpd %ymm4, %ymm10, %ymm11
; AVX2-NEXT:    vaddpd %ymm0, %ymm11, %ymm0
; AVX2-NEXT:    vmulpd %ymm5, %ymm10, %ymm10
; AVX2-NEXT:    vaddpd %ymm1, %ymm10, %ymm1
; AVX2-NEXT:    vbroadcastsd 296(%rbp), %ymm10
; AVX2-NEXT:    vmulpd %ymm7, %ymm10, %ymm11
; AVX2-NEXT:    vaddpd %ymm1, %ymm11, %ymm1
; AVX2-NEXT:    vmulpd %ymm6, %ymm10, %ymm10
; AVX2-NEXT:    vaddpd %ymm0, %ymm10, %ymm0
; AVX2-NEXT:    vbroadcastsd 304(%rbp), %ymm10
; AVX2-NEXT:    vmulpd 16(%rbp), %ymm10, %ymm11
; AVX2-NEXT:    vaddpd %ymm0, %ymm11, %ymm0
; AVX2-NEXT:    vmulpd 48(%rbp), %ymm10, %ymm10
; AVX2-NEXT:    vaddpd %ymm1, %ymm10, %ymm1
; AVX2-NEXT:    vbroadcastsd 312(%rbp), %ymm10
; AVX2-NEXT:    vmulpd %ymm10, %ymm13, %ymm11
; AVX2-NEXT:    vmovapd %ymm13, %ymm14
; AVX2-NEXT:    vaddpd %ymm1, %ymm11, %ymm1
; AVX2-NEXT:    vmulpd 80(%rbp), %ymm10, %ymm10
; AVX2-NEXT:    vaddpd %ymm0, %ymm10, %ymm0
; AVX2-NEXT:    vbroadcastsd 320(%rbp), %ymm10
; AVX2-NEXT:    vmulpd %ymm2, %ymm10, %ymm11
; AVX2-NEXT:    vmovapd %ymm2, %ymm13
; AVX2-NEXT:    vaddpd %ymm0, %ymm11, %ymm0
; AVX2-NEXT:    vmulpd 176(%rbp), %ymm10, %ymm10
; AVX2-NEXT:    vaddpd %ymm1, %ymm10, %ymm1
; AVX2-NEXT:    vbroadcastsd 328(%rbp), %ymm10
; AVX2-NEXT:    vmulpd 240(%rbp), %ymm10, %ymm11
; AVX2-NEXT:    vaddpd %ymm1, %ymm11, %ymm1
; AVX2-NEXT:    vmovapd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vmulpd 208(%rbp), %ymm10, %ymm1
; AVX2-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vmovapd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vbroadcastsd 336(%rbp), %ymm0
; AVX2-NEXT:    vmulpd %ymm0, %ymm9, %ymm1
; AVX2-NEXT:    vbroadcastsd 344(%rbp), %ymm10
; AVX2-NEXT:    vmulpd %ymm3, %ymm10, %ymm11
; AVX2-NEXT:    vmovapd %ymm3, %ymm8
; AVX2-NEXT:    vaddpd %ymm1, %ymm11, %ymm1
; AVX2-NEXT:    vmovapd (%rsp), %ymm15 # 32-byte Reload
; AVX2-NEXT:    vmulpd %ymm0, %ymm15, %ymm0
; AVX2-NEXT:    vmulpd %ymm10, %ymm12, %ymm10
; AVX2-NEXT:    vaddpd %ymm0, %ymm10, %ymm0
; AVX2-NEXT:    vbroadcastsd 352(%rbp), %ymm10
; AVX2-NEXT:    vmulpd %ymm4, %ymm10, %ymm11
; AVX2-NEXT:    vaddpd %ymm0, %ymm11, %ymm0
; AVX2-NEXT:    vmulpd %ymm5, %ymm10, %ymm10
; AVX2-NEXT:    vmovapd %ymm5, %ymm3
; AVX2-NEXT:    vaddpd %ymm1, %ymm10, %ymm1
; AVX2-NEXT:    vbroadcastsd 360(%rbp), %ymm10
; AVX2-NEXT:    vmulpd %ymm7, %ymm10, %ymm11
; AVX2-NEXT:    vaddpd %ymm1, %ymm11, %ymm1
; AVX2-NEXT:    vmulpd %ymm6, %ymm10, %ymm10
; AVX2-NEXT:    vaddpd %ymm0, %ymm10, %ymm0
; AVX2-NEXT:    vbroadcastsd 368(%rbp), %ymm10
; AVX2-NEXT:    vmovapd 16(%rbp), %ymm2
; AVX2-NEXT:    vmulpd %ymm2, %ymm10, %ymm11
; AVX2-NEXT:    vaddpd %ymm0, %ymm11, %ymm0
; AVX2-NEXT:    vmulpd 48(%rbp), %ymm10, %ymm10
; AVX2-NEXT:    vaddpd %ymm1, %ymm10, %ymm1
; AVX2-NEXT:    vbroadcastsd 376(%rbp), %ymm10
; AVX2-NEXT:    vmulpd %ymm10, %ymm14, %ymm11
; AVX2-NEXT:    vaddpd %ymm1, %ymm11, %ymm1
; AVX2-NEXT:    vmovapd 80(%rbp), %ymm2
; AVX2-NEXT:    vmulpd %ymm2, %ymm10, %ymm10
; AVX2-NEXT:    vaddpd %ymm0, %ymm10, %ymm0
; AVX2-NEXT:    vbroadcastsd 384(%rbp), %ymm10
; AVX2-NEXT:    vmulpd %ymm10, %ymm13, %ymm11
; AVX2-NEXT:    vaddpd %ymm0, %ymm11, %ymm0
; AVX2-NEXT:    vmovapd 176(%rbp), %ymm14
; AVX2-NEXT:    vmulpd %ymm10, %ymm14, %ymm10
; AVX2-NEXT:    vaddpd %ymm1, %ymm10, %ymm1
; AVX2-NEXT:    vbroadcastsd 392(%rbp), %ymm10
; AVX2-NEXT:    vmovapd 240(%rbp), %ymm2
; AVX2-NEXT:    vmulpd %ymm2, %ymm10, %ymm11
; AVX2-NEXT:    vaddpd %ymm1, %ymm11, %ymm1
; AVX2-NEXT:    vmovapd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vmulpd 208(%rbp), %ymm10, %ymm1
; AVX2-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vmovapd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vbroadcastsd 400(%rbp), %ymm0
; AVX2-NEXT:    vmulpd %ymm0, %ymm9, %ymm1
; AVX2-NEXT:    vbroadcastsd 408(%rbp), %ymm10
; AVX2-NEXT:    vmovapd %ymm8, %ymm5
; AVX2-NEXT:    vmulpd %ymm10, %ymm8, %ymm11
; AVX2-NEXT:    vaddpd %ymm1, %ymm11, %ymm1
; AVX2-NEXT:    vmulpd %ymm0, %ymm15, %ymm0
; AVX2-NEXT:    vmulpd %ymm10, %ymm12, %ymm10
; AVX2-NEXT:    vaddpd %ymm0, %ymm10, %ymm0
; AVX2-NEXT:    vbroadcastsd 416(%rbp), %ymm10
; AVX2-NEXT:    vmulpd %ymm4, %ymm10, %ymm11
; AVX2-NEXT:    vaddpd %ymm0, %ymm11, %ymm0
; AVX2-NEXT:    vmulpd %ymm3, %ymm10, %ymm10
; AVX2-NEXT:    vmovapd %ymm3, %ymm2
; AVX2-NEXT:    vaddpd %ymm1, %ymm10, %ymm1
; AVX2-NEXT:    vbroadcastsd 424(%rbp), %ymm10
; AVX2-NEXT:    vmulpd %ymm7, %ymm10, %ymm11
; AVX2-NEXT:    vaddpd %ymm1, %ymm11, %ymm1
; AVX2-NEXT:    vmulpd %ymm6, %ymm10, %ymm10
; AVX2-NEXT:    vaddpd %ymm0, %ymm10, %ymm0
; AVX2-NEXT:    vbroadcastsd 432(%rbp), %ymm10
; AVX2-NEXT:    vmulpd 16(%rbp), %ymm10, %ymm11
; AVX2-NEXT:    vaddpd %ymm0, %ymm11, %ymm0
; AVX2-NEXT:    vmulpd 48(%rbp), %ymm10, %ymm10
; AVX2-NEXT:    vaddpd %ymm1, %ymm10, %ymm1
; AVX2-NEXT:    vbroadcastsd 440(%rbp), %ymm10
; AVX2-NEXT:    vmulpd 112(%rbp), %ymm10, %ymm11
; AVX2-NEXT:    vaddpd %ymm1, %ymm11, %ymm1
; AVX2-NEXT:    vmulpd 80(%rbp), %ymm10, %ymm10
; AVX2-NEXT:    vaddpd %ymm0, %ymm10, %ymm0
; AVX2-NEXT:    vbroadcastsd 448(%rbp), %ymm10
; AVX2-NEXT:    vmulpd %ymm10, %ymm13, %ymm11
; AVX2-NEXT:    vaddpd %ymm0, %ymm11, %ymm0
; AVX2-NEXT:    vmulpd %ymm10, %ymm14, %ymm10
; AVX2-NEXT:    vaddpd %ymm1, %ymm10, %ymm1
; AVX2-NEXT:    vbroadcastsd 456(%rbp), %ymm10
; AVX2-NEXT:    vmulpd 240(%rbp), %ymm10, %ymm11
; AVX2-NEXT:    vaddpd %ymm1, %ymm11, %ymm1
; AVX2-NEXT:    vmovapd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vmulpd 208(%rbp), %ymm10, %ymm1
; AVX2-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vmovapd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vbroadcastsd 464(%rbp), %ymm0
; AVX2-NEXT:    vmulpd %ymm0, %ymm9, %ymm1
; AVX2-NEXT:    vmovapd %ymm9, %ymm13
; AVX2-NEXT:    vbroadcastsd 472(%rbp), %ymm10
; AVX2-NEXT:    vmulpd %ymm10, %ymm8, %ymm11
; AVX2-NEXT:    vaddpd %ymm1, %ymm11, %ymm1
; AVX2-NEXT:    vmulpd %ymm0, %ymm15, %ymm0
; AVX2-NEXT:    vmovapd %ymm15, %ymm9
; AVX2-NEXT:    vmulpd %ymm10, %ymm12, %ymm10
; AVX2-NEXT:    vaddpd %ymm0, %ymm10, %ymm0
; AVX2-NEXT:    vbroadcastsd 480(%rbp), %ymm10
; AVX2-NEXT:    vmulpd %ymm4, %ymm10, %ymm11
; AVX2-NEXT:    vmovapd %ymm4, %ymm3
; AVX2-NEXT:    vaddpd %ymm0, %ymm11, %ymm0
; AVX2-NEXT:    vmovapd %ymm2, %ymm15
; AVX2-NEXT:    vmulpd %ymm2, %ymm10, %ymm10
; AVX2-NEXT:    vaddpd %ymm1, %ymm10, %ymm1
; AVX2-NEXT:    vbroadcastsd 488(%rbp), %ymm10
; AVX2-NEXT:    vmovapd %ymm7, %ymm8
; AVX2-NEXT:    vmulpd %ymm7, %ymm10, %ymm11
; AVX2-NEXT:    vaddpd %ymm1, %ymm11, %ymm1
; AVX2-NEXT:    vmovapd %ymm6, %ymm7
; AVX2-NEXT:    vmulpd %ymm6, %ymm10, %ymm10
; AVX2-NEXT:    vaddpd %ymm0, %ymm10, %ymm0
; AVX2-NEXT:    vbroadcastsd 496(%rbp), %ymm10
; AVX2-NEXT:    vmulpd 16(%rbp), %ymm10, %ymm11
; AVX2-NEXT:    vaddpd %ymm0, %ymm11, %ymm0
; AVX2-NEXT:    vmovapd 48(%rbp), %ymm4
; AVX2-NEXT:    vmulpd %ymm4, %ymm10, %ymm10
; AVX2-NEXT:    vaddpd %ymm1, %ymm10, %ymm1
; AVX2-NEXT:    vbroadcastsd 504(%rbp), %ymm10
; AVX2-NEXT:    vmovapd 112(%rbp), %ymm2
; AVX2-NEXT:    vmulpd %ymm2, %ymm10, %ymm11
; AVX2-NEXT:    vaddpd %ymm1, %ymm11, %ymm1
; AVX2-NEXT:    vmovapd 80(%rbp), %ymm14
; AVX2-NEXT:    vmulpd %ymm10, %ymm14, %ymm10
; AVX2-NEXT:    vaddpd %ymm0, %ymm10, %ymm0
; AVX2-NEXT:    vbroadcastsd 512(%rbp), %ymm10
; AVX2-NEXT:    vmulpd 144(%rbp), %ymm10, %ymm11
; AVX2-NEXT:    vaddpd %ymm0, %ymm11, %ymm0
; AVX2-NEXT:    vmovapd 176(%rbp), %ymm2
; AVX2-NEXT:    vmulpd %ymm2, %ymm10, %ymm10
; AVX2-NEXT:    vaddpd %ymm1, %ymm10, %ymm1
; AVX2-NEXT:    vbroadcastsd 520(%rbp), %ymm10
; AVX2-NEXT:    vmulpd 240(%rbp), %ymm10, %ymm11
; AVX2-NEXT:    vaddpd %ymm1, %ymm11, %ymm1
; AVX2-NEXT:    vmovapd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vmulpd 208(%rbp), %ymm10, %ymm1
; AVX2-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vmovapd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vbroadcastsd 528(%rbp), %ymm0
; AVX2-NEXT:    vmulpd %ymm0, %ymm13, %ymm1
; AVX2-NEXT:    vbroadcastsd 536(%rbp), %ymm10
; AVX2-NEXT:    vmulpd %ymm5, %ymm10, %ymm11
; AVX2-NEXT:    vmovapd %ymm5, %ymm6
; AVX2-NEXT:    vaddpd %ymm1, %ymm11, %ymm1
; AVX2-NEXT:    vmulpd %ymm0, %ymm9, %ymm0
; AVX2-NEXT:    vmulpd %ymm10, %ymm12, %ymm10
; AVX2-NEXT:    vmovapd %ymm12, %ymm5
; AVX2-NEXT:    vaddpd %ymm0, %ymm10, %ymm0
; AVX2-NEXT:    vbroadcastsd 544(%rbp), %ymm10
; AVX2-NEXT:    vmulpd %ymm3, %ymm10, %ymm11
; AVX2-NEXT:    vmovapd %ymm3, %ymm12
; AVX2-NEXT:    vaddpd %ymm0, %ymm11, %ymm0
; AVX2-NEXT:    vmulpd %ymm10, %ymm15, %ymm10
; AVX2-NEXT:    vaddpd %ymm1, %ymm10, %ymm1
; AVX2-NEXT:    vbroadcastsd 552(%rbp), %ymm10
; AVX2-NEXT:    vmulpd %ymm10, %ymm8, %ymm11
; AVX2-NEXT:    vaddpd %ymm1, %ymm11, %ymm1
; AVX2-NEXT:    vmulpd %ymm7, %ymm10, %ymm10
; AVX2-NEXT:    vaddpd %ymm0, %ymm10, %ymm0
; AVX2-NEXT:    vbroadcastsd 560(%rbp), %ymm10
; AVX2-NEXT:    vmulpd 16(%rbp), %ymm10, %ymm11
; AVX2-NEXT:    vaddpd %ymm0, %ymm11, %ymm0
; AVX2-NEXT:    vmulpd %ymm4, %ymm10, %ymm10
; AVX2-NEXT:    vmovapd %ymm4, %ymm3
; AVX2-NEXT:    vaddpd %ymm1, %ymm10, %ymm1
; AVX2-NEXT:    vbroadcastsd 568(%rbp), %ymm10
; AVX2-NEXT:    vmulpd 112(%rbp), %ymm10, %ymm11
; AVX2-NEXT:    vaddpd %ymm1, %ymm11, %ymm1
; AVX2-NEXT:    vmulpd %ymm10, %ymm14, %ymm10
; AVX2-NEXT:    vaddpd %ymm0, %ymm10, %ymm0
; AVX2-NEXT:    vbroadcastsd 576(%rbp), %ymm10
; AVX2-NEXT:    vmovapd 144(%rbp), %ymm4
; AVX2-NEXT:    vmulpd %ymm4, %ymm10, %ymm11
; AVX2-NEXT:    vaddpd %ymm0, %ymm11, %ymm0
; AVX2-NEXT:    vmulpd %ymm2, %ymm10, %ymm10
; AVX2-NEXT:    vaddpd %ymm1, %ymm10, %ymm1
; AVX2-NEXT:    vbroadcastsd 584(%rbp), %ymm10
; AVX2-NEXT:    vmovapd 240(%rbp), %ymm14
; AVX2-NEXT:    vmulpd %ymm10, %ymm14, %ymm11
; AVX2-NEXT:    vaddpd %ymm1, %ymm11, %ymm1
; AVX2-NEXT:    vmovapd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vmovapd 208(%rbp), %ymm2
; AVX2-NEXT:    vmulpd %ymm2, %ymm10, %ymm1
; AVX2-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vmovapd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vbroadcastsd 592(%rbp), %ymm0
; AVX2-NEXT:    vmulpd %ymm0, %ymm13, %ymm1
; AVX2-NEXT:    vbroadcastsd 600(%rbp), %ymm10
; AVX2-NEXT:    vmulpd %ymm6, %ymm10, %ymm11
; AVX2-NEXT:    vaddpd %ymm1, %ymm11, %ymm1
; AVX2-NEXT:    vmulpd %ymm0, %ymm9, %ymm0
; AVX2-NEXT:    vmulpd %ymm5, %ymm10, %ymm10
; AVX2-NEXT:    vaddpd %ymm0, %ymm10, %ymm0
; AVX2-NEXT:    vbroadcastsd 608(%rbp), %ymm10
; AVX2-NEXT:    vmulpd %ymm10, %ymm12, %ymm11
; AVX2-NEXT:    vaddpd %ymm0, %ymm11, %ymm0
; AVX2-NEXT:    vmulpd %ymm10, %ymm15, %ymm10
; AVX2-NEXT:    vaddpd %ymm1, %ymm10, %ymm1
; AVX2-NEXT:    vbroadcastsd 616(%rbp), %ymm10
; AVX2-NEXT:    vmulpd %ymm10, %ymm8, %ymm11
; AVX2-NEXT:    vaddpd %ymm1, %ymm11, %ymm1
; AVX2-NEXT:    vmulpd %ymm7, %ymm10, %ymm10
; AVX2-NEXT:    vaddpd %ymm0, %ymm10, %ymm0
; AVX2-NEXT:    vbroadcastsd 624(%rbp), %ymm10
; AVX2-NEXT:    vmulpd 16(%rbp), %ymm10, %ymm11
; AVX2-NEXT:    vaddpd %ymm0, %ymm11, %ymm0
; AVX2-NEXT:    vmulpd %ymm3, %ymm10, %ymm10
; AVX2-NEXT:    vaddpd %ymm1, %ymm10, %ymm1
; AVX2-NEXT:    vbroadcastsd 632(%rbp), %ymm10
; AVX2-NEXT:    vmovapd 112(%rbp), %ymm3
; AVX2-NEXT:    vmulpd %ymm3, %ymm10, %ymm11
; AVX2-NEXT:    vaddpd %ymm1, %ymm11, %ymm1
; AVX2-NEXT:    vmovapd 80(%rbp), %ymm3
; AVX2-NEXT:    vmulpd %ymm3, %ymm10, %ymm10
; AVX2-NEXT:    vaddpd %ymm0, %ymm10, %ymm0
; AVX2-NEXT:    vbroadcastsd 640(%rbp), %ymm10
; AVX2-NEXT:    vmulpd %ymm4, %ymm10, %ymm11
; AVX2-NEXT:    vaddpd %ymm0, %ymm11, %ymm0
; AVX2-NEXT:    vmovapd 176(%rbp), %ymm3
; AVX2-NEXT:    vmulpd %ymm3, %ymm10, %ymm10
; AVX2-NEXT:    vaddpd %ymm1, %ymm10, %ymm1
; AVX2-NEXT:    vbroadcastsd 648(%rbp), %ymm10
; AVX2-NEXT:    vmovapd %ymm14, %ymm4
; AVX2-NEXT:    vmulpd %ymm10, %ymm14, %ymm11
; AVX2-NEXT:    vaddpd %ymm1, %ymm11, %ymm1
; AVX2-NEXT:    vmovapd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vmulpd %ymm2, %ymm10, %ymm1
; AVX2-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vmovapd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT:    vbroadcastsd 656(%rbp), %ymm2
; AVX2-NEXT:    vmovapd %ymm13, %ymm3
; AVX2-NEXT:    vmulpd %ymm2, %ymm13, %ymm1
; AVX2-NEXT:    vbroadcastsd 664(%rbp), %ymm0
; AVX2-NEXT:    vmulpd %ymm0, %ymm6, %ymm14
; AVX2-NEXT:    vmovapd %ymm6, %ymm10
; AVX2-NEXT:    vaddpd %ymm1, %ymm14, %ymm1
; AVX2-NEXT:    vmulpd %ymm2, %ymm9, %ymm2
; AVX2-NEXT:    vmulpd %ymm0, %ymm5, %ymm0
; AVX2-NEXT:    vmovapd %ymm5, %ymm6
; AVX2-NEXT:    vaddpd %ymm0, %ymm2, %ymm0
; AVX2-NEXT:    vbroadcastsd 672(%rbp), %ymm2
; AVX2-NEXT:    vmulpd %ymm2, %ymm12, %ymm14
; AVX2-NEXT:    vaddpd %ymm0, %ymm14, %ymm0
; AVX2-NEXT:    vmulpd %ymm2, %ymm15, %ymm2
; AVX2-NEXT:    vaddpd %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    vbroadcastsd 680(%rbp), %ymm2
; AVX2-NEXT:    vmulpd %ymm2, %ymm8, %ymm14
; AVX2-NEXT:    vaddpd %ymm1, %ymm14, %ymm1
; AVX2-NEXT:    vmulpd %ymm2, %ymm7, %ymm2
; AVX2-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vbroadcastsd 688(%rbp), %ymm2
; AVX2-NEXT:    vmovapd 16(%rbp), %ymm11
; AVX2-NEXT:    vmulpd %ymm2, %ymm11, %ymm14
; AVX2-NEXT:    vaddpd %ymm0, %ymm14, %ymm0
; AVX2-NEXT:    vmulpd 48(%rbp), %ymm2, %ymm2
; AVX2-NEXT:    vaddpd %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    vbroadcastsd 696(%rbp), %ymm2
; AVX2-NEXT:    vmovapd 112(%rbp), %ymm5
; AVX2-NEXT:    vmulpd %ymm2, %ymm5, %ymm14
; AVX2-NEXT:    vaddpd %ymm1, %ymm14, %ymm1
; AVX2-NEXT:    vmovapd 80(%rbp), %ymm5
; AVX2-NEXT:    vmulpd %ymm2, %ymm5, %ymm2
; AVX2-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vbroadcastsd 704(%rbp), %ymm2
; AVX2-NEXT:    vmulpd 144(%rbp), %ymm2, %ymm14
; AVX2-NEXT:    vaddpd %ymm0, %ymm14, %ymm0
; AVX2-NEXT:    vmovapd 176(%rbp), %ymm13
; AVX2-NEXT:    vmulpd %ymm2, %ymm13, %ymm2
; AVX2-NEXT:    vaddpd %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    vbroadcastsd 712(%rbp), %ymm2
; AVX2-NEXT:    vmulpd %ymm2, %ymm4, %ymm14
; AVX2-NEXT:    vaddpd %ymm1, %ymm14, %ymm1
; AVX2-NEXT:    vmovapd 208(%rbp), %ymm14
; AVX2-NEXT:    vmulpd %ymm2, %ymm14, %ymm2
; AVX2-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vbroadcastsd 720(%rbp), %ymm2
; AVX2-NEXT:    vmulpd %ymm2, %ymm3, %ymm3
; AVX2-NEXT:    vmulpd %ymm2, %ymm9, %ymm2
; AVX2-NEXT:    vbroadcastsd 728(%rbp), %ymm4
; AVX2-NEXT:    vmulpd %ymm4, %ymm10, %ymm5
; AVX2-NEXT:    vaddpd %ymm5, %ymm3, %ymm3
; AVX2-NEXT:    vmulpd %ymm4, %ymm6, %ymm4
; AVX2-NEXT:    vaddpd %ymm4, %ymm2, %ymm2
; AVX2-NEXT:    vbroadcastsd 736(%rbp), %ymm4
; AVX2-NEXT:    vmulpd %ymm4, %ymm12, %ymm5
; AVX2-NEXT:    vaddpd %ymm5, %ymm2, %ymm2
; AVX2-NEXT:    vmulpd %ymm4, %ymm15, %ymm4
; AVX2-NEXT:    vaddpd %ymm4, %ymm3, %ymm3
; AVX2-NEXT:    vbroadcastsd 744(%rbp), %ymm4
; AVX2-NEXT:    vmulpd %ymm4, %ymm8, %ymm5
; AVX2-NEXT:    vaddpd %ymm5, %ymm3, %ymm3
; AVX2-NEXT:    vmulpd %ymm4, %ymm7, %ymm4
; AVX2-NEXT:    vaddpd %ymm4, %ymm2, %ymm2
; AVX2-NEXT:    vbroadcastsd 752(%rbp), %ymm4
; AVX2-NEXT:    vmulpd %ymm4, %ymm11, %ymm5
; AVX2-NEXT:    vaddpd %ymm5, %ymm2, %ymm2
; AVX2-NEXT:    vmulpd 48(%rbp), %ymm4, %ymm4
; AVX2-NEXT:    vaddpd %ymm4, %ymm3, %ymm3
; AVX2-NEXT:    vbroadcastsd 760(%rbp), %ymm4
; AVX2-NEXT:    vmulpd 112(%rbp), %ymm4, %ymm5
; AVX2-NEXT:    vaddpd %ymm5, %ymm3, %ymm3
; AVX2-NEXT:    vmulpd 80(%rbp), %ymm4, %ymm4
; AVX2-NEXT:    vaddpd %ymm4, %ymm2, %ymm2
; AVX2-NEXT:    vbroadcastsd 768(%rbp), %ymm4
; AVX2-NEXT:    vmulpd 144(%rbp), %ymm4, %ymm5
; AVX2-NEXT:    vaddpd %ymm5, %ymm2, %ymm2
; AVX2-NEXT:    vmulpd %ymm4, %ymm13, %ymm4
; AVX2-NEXT:    vaddpd %ymm4, %ymm3, %ymm3
; AVX2-NEXT:    vbroadcastsd 776(%rbp), %ymm4
; AVX2-NEXT:    vmulpd 240(%rbp), %ymm4, %ymm5
; AVX2-NEXT:    vaddpd %ymm5, %ymm3, %ymm3
; AVX2-NEXT:    vmulpd %ymm4, %ymm14, %ymm4
; AVX2-NEXT:    vaddpd %ymm4, %ymm2, %ymm2
; AVX2-NEXT:    vmovapd %ymm3, 480(%rdi)
; AVX2-NEXT:    vmovapd %ymm2, 448(%rdi)
; AVX2-NEXT:    vmovapd %ymm1, 416(%rdi)
; AVX2-NEXT:    vmovapd %ymm0, 384(%rdi)
; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT:    vmovaps %ymm0, 352(%rdi)
; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT:    vmovaps %ymm0, 320(%rdi)
; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT:    vmovaps %ymm0, 288(%rdi)
; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT:    vmovaps %ymm0, 256(%rdi)
; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT:    vmovaps %ymm0, 224(%rdi)
; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT:    vmovaps %ymm0, 192(%rdi)
; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT:    vmovaps %ymm0, 160(%rdi)
; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT:    vmovaps %ymm0, 128(%rdi)
; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT:    vmovaps %ymm0, 96(%rdi)
; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT:    vmovaps %ymm0, 64(%rdi)
; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT:    vmovaps %ymm0, 32(%rdi)
; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT:    vmovaps %ymm0, (%rdi)
; AVX2-NEXT:    movq %rbp, %rsp
; AVX2-NEXT:    popq %rbp
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_mul8x8_f64:
; AVX512:       # %bb.0: # %entry
; AVX512-NEXT:    pushq %rbp
; AVX512-NEXT:    movq %rsp, %rbp
; AVX512-NEXT:    andq $-64, %rsp
; AVX512-NEXT:    subq $64, %rsp
; AVX512-NEXT:    movq %rdi, %rax
; AVX512-NEXT:    vmulpd 16(%rbp){1to8}, %zmm0, %zmm8
; AVX512-NEXT:    vmulpd 24(%rbp){1to8}, %zmm1, %zmm9
; AVX512-NEXT:    vaddpd %zmm9, %zmm8, %zmm8
; AVX512-NEXT:    vmulpd 32(%rbp){1to8}, %zmm2, %zmm9
; AVX512-NEXT:    vaddpd %zmm9, %zmm8, %zmm8
; AVX512-NEXT:    vmulpd 40(%rbp){1to8}, %zmm3, %zmm9
; AVX512-NEXT:    vaddpd %zmm9, %zmm8, %zmm8
; AVX512-NEXT:    vmulpd 48(%rbp){1to8}, %zmm4, %zmm9
; AVX512-NEXT:    vaddpd %zmm9, %zmm8, %zmm8
; AVX512-NEXT:    vmulpd 56(%rbp){1to8}, %zmm5, %zmm9
; AVX512-NEXT:    vaddpd %zmm9, %zmm8, %zmm8
; AVX512-NEXT:    vmulpd 64(%rbp){1to8}, %zmm6, %zmm9
; AVX512-NEXT:    vaddpd %zmm9, %zmm8, %zmm8
; AVX512-NEXT:    vmulpd 72(%rbp){1to8}, %zmm7, %zmm9
; AVX512-NEXT:    vaddpd %zmm9, %zmm8, %zmm8
; AVX512-NEXT:    vmulpd 80(%rbp){1to8}, %zmm0, %zmm9
; AVX512-NEXT:    vmulpd 88(%rbp){1to8}, %zmm1, %zmm10
; AVX512-NEXT:    vaddpd %zmm10, %zmm9, %zmm9
; AVX512-NEXT:    vmulpd 96(%rbp){1to8}, %zmm2, %zmm10
; AVX512-NEXT:    vaddpd %zmm10, %zmm9, %zmm9
; AVX512-NEXT:    vmulpd 104(%rbp){1to8}, %zmm3, %zmm10
; AVX512-NEXT:    vaddpd %zmm10, %zmm9, %zmm9
; AVX512-NEXT:    vmulpd 112(%rbp){1to8}, %zmm4, %zmm10
; AVX512-NEXT:    vaddpd %zmm10, %zmm9, %zmm9
; AVX512-NEXT:    vmulpd 120(%rbp){1to8}, %zmm5, %zmm10
; AVX512-NEXT:    vaddpd %zmm10, %zmm9, %zmm9
; AVX512-NEXT:    vmulpd 128(%rbp){1to8}, %zmm6, %zmm10
; AVX512-NEXT:    vaddpd %zmm10, %zmm9, %zmm9
; AVX512-NEXT:    vmulpd 136(%rbp){1to8}, %zmm7, %zmm10
; AVX512-NEXT:    vaddpd %zmm10, %zmm9, %zmm9
; AVX512-NEXT:    vmulpd 144(%rbp){1to8}, %zmm0, %zmm10
; AVX512-NEXT:    vmulpd 152(%rbp){1to8}, %zmm1, %zmm11
; AVX512-NEXT:    vaddpd %zmm11, %zmm10, %zmm10
; AVX512-NEXT:    vmulpd 160(%rbp){1to8}, %zmm2, %zmm11
; AVX512-NEXT:    vaddpd %zmm11, %zmm10, %zmm10
; AVX512-NEXT:    vmulpd 168(%rbp){1to8}, %zmm3, %zmm11
; AVX512-NEXT:    vaddpd %zmm11, %zmm10, %zmm10
; AVX512-NEXT:    vmulpd 176(%rbp){1to8}, %zmm4, %zmm11
; AVX512-NEXT:    vaddpd %zmm11, %zmm10, %zmm10
; AVX512-NEXT:    vmulpd 184(%rbp){1to8}, %zmm5, %zmm11
; AVX512-NEXT:    vaddpd %zmm11, %zmm10, %zmm10
; AVX512-NEXT:    vmulpd 192(%rbp){1to8}, %zmm6, %zmm11
; AVX512-NEXT:    vaddpd %zmm11, %zmm10, %zmm10
; AVX512-NEXT:    vmulpd 200(%rbp){1to8}, %zmm7, %zmm11
; AVX512-NEXT:    vaddpd %zmm11, %zmm10, %zmm10
; AVX512-NEXT:    vmulpd 208(%rbp){1to8}, %zmm0, %zmm11
; AVX512-NEXT:    vmulpd 216(%rbp){1to8}, %zmm1, %zmm12
; AVX512-NEXT:    vaddpd %zmm12, %zmm11, %zmm11
; AVX512-NEXT:    vmulpd 224(%rbp){1to8}, %zmm2, %zmm12
; AVX512-NEXT:    vaddpd %zmm12, %zmm11, %zmm11
; AVX512-NEXT:    vmulpd 232(%rbp){1to8}, %zmm3, %zmm12
; AVX512-NEXT:    vaddpd %zmm12, %zmm11, %zmm11
; AVX512-NEXT:    vmulpd 240(%rbp){1to8}, %zmm4, %zmm12
; AVX512-NEXT:    vaddpd %zmm12, %zmm11, %zmm11
; AVX512-NEXT:    vmulpd 248(%rbp){1to8}, %zmm5, %zmm12
; AVX512-NEXT:    vaddpd %zmm12, %zmm11, %zmm11
; AVX512-NEXT:    vmulpd 256(%rbp){1to8}, %zmm6, %zmm12
; AVX512-NEXT:    vaddpd %zmm12, %zmm11, %zmm11
; AVX512-NEXT:    vmulpd 264(%rbp){1to8}, %zmm7, %zmm12
; AVX512-NEXT:    vaddpd %zmm12, %zmm11, %zmm11
; AVX512-NEXT:    vmulpd 272(%rbp){1to8}, %zmm0, %zmm12
; AVX512-NEXT:    vmulpd 280(%rbp){1to8}, %zmm1, %zmm13
; AVX512-NEXT:    vaddpd %zmm13, %zmm12, %zmm12
; AVX512-NEXT:    vmulpd 288(%rbp){1to8}, %zmm2, %zmm13
; AVX512-NEXT:    vaddpd %zmm13, %zmm12, %zmm12
; AVX512-NEXT:    vmulpd 296(%rbp){1to8}, %zmm3, %zmm13
; AVX512-NEXT:    vaddpd %zmm13, %zmm12, %zmm12
; AVX512-NEXT:    vmulpd 304(%rbp){1to8}, %zmm4, %zmm13
; AVX512-NEXT:    vaddpd %zmm13, %zmm12, %zmm12
; AVX512-NEXT:    vmulpd 312(%rbp){1to8}, %zmm5, %zmm13
; AVX512-NEXT:    vaddpd %zmm13, %zmm12, %zmm12
; AVX512-NEXT:    vmulpd 320(%rbp){1to8}, %zmm6, %zmm13
; AVX512-NEXT:    vaddpd %zmm13, %zmm12, %zmm12
; AVX512-NEXT:    vmulpd 328(%rbp){1to8}, %zmm7, %zmm13
; AVX512-NEXT:    vaddpd %zmm13, %zmm12, %zmm12
; AVX512-NEXT:    vmulpd 336(%rbp){1to8}, %zmm0, %zmm13
; AVX512-NEXT:    vmulpd 344(%rbp){1to8}, %zmm1, %zmm14
; AVX512-NEXT:    vaddpd %zmm14, %zmm13, %zmm13
; AVX512-NEXT:    vmulpd 352(%rbp){1to8}, %zmm2, %zmm14
; AVX512-NEXT:    vaddpd %zmm14, %zmm13, %zmm13
; AVX512-NEXT:    vmulpd 360(%rbp){1to8}, %zmm3, %zmm14
; AVX512-NEXT:    vaddpd %zmm14, %zmm13, %zmm13
; AVX512-NEXT:    vmulpd 368(%rbp){1to8}, %zmm4, %zmm14
; AVX512-NEXT:    vaddpd %zmm14, %zmm13, %zmm13
; AVX512-NEXT:    vmulpd 376(%rbp){1to8}, %zmm5, %zmm14
; AVX512-NEXT:    vaddpd %zmm14, %zmm13, %zmm13
; AVX512-NEXT:    vmulpd 384(%rbp){1to8}, %zmm6, %zmm14
; AVX512-NEXT:    vaddpd %zmm14, %zmm13, %zmm13
; AVX512-NEXT:    vmulpd 392(%rbp){1to8}, %zmm7, %zmm14
; AVX512-NEXT:    vaddpd %zmm14, %zmm13, %zmm13
; AVX512-NEXT:    vmulpd 400(%rbp){1to8}, %zmm0, %zmm14
; AVX512-NEXT:    vmulpd 408(%rbp){1to8}, %zmm1, %zmm15
; AVX512-NEXT:    vaddpd %zmm15, %zmm14, %zmm14
; AVX512-NEXT:    vmulpd 416(%rbp){1to8}, %zmm2, %zmm15
; AVX512-NEXT:    vaddpd %zmm15, %zmm14, %zmm14
; AVX512-NEXT:    vmulpd 424(%rbp){1to8}, %zmm3, %zmm15
; AVX512-NEXT:    vaddpd %zmm15, %zmm14, %zmm14
; AVX512-NEXT:    vmulpd 432(%rbp){1to8}, %zmm4, %zmm15
; AVX512-NEXT:    vaddpd %zmm15, %zmm14, %zmm14
; AVX512-NEXT:    vmulpd 440(%rbp){1to8}, %zmm5, %zmm15
; AVX512-NEXT:    vaddpd %zmm15, %zmm14, %zmm14
; AVX512-NEXT:    vmulpd 448(%rbp){1to8}, %zmm6, %zmm15
; AVX512-NEXT:    vaddpd %zmm15, %zmm14, %zmm14
; AVX512-NEXT:    vmulpd 456(%rbp){1to8}, %zmm7, %zmm15
; AVX512-NEXT:    vaddpd %zmm15, %zmm14, %zmm14
; AVX512-NEXT:    vmulpd 464(%rbp){1to8}, %zmm0, %zmm0
; AVX512-NEXT:    vmulpd 472(%rbp){1to8}, %zmm1, %zmm1
; AVX512-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vmulpd 480(%rbp){1to8}, %zmm2, %zmm1
; AVX512-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vmulpd 488(%rbp){1to8}, %zmm3, %zmm1
; AVX512-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vmulpd 496(%rbp){1to8}, %zmm4, %zmm1
; AVX512-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vmulpd 504(%rbp){1to8}, %zmm5, %zmm1
; AVX512-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vmulpd 512(%rbp){1to8}, %zmm6, %zmm1
; AVX512-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vmulpd 520(%rbp){1to8}, %zmm7, %zmm1
; AVX512-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vmovapd %zmm0, 448(%rdi)
; AVX512-NEXT:    vmovapd %zmm14, 384(%rdi)
; AVX512-NEXT:    vmovapd %zmm13, 320(%rdi)
; AVX512-NEXT:    vmovapd %zmm12, 256(%rdi)
; AVX512-NEXT:    vmovapd %zmm11, 192(%rdi)
; AVX512-NEXT:    vmovapd %zmm10, 128(%rdi)
; AVX512-NEXT:    vmovapd %zmm9, 64(%rdi)
; AVX512-NEXT:    vmovapd %zmm8, (%rdi)
; AVX512-NEXT:    movq %rbp, %rsp
; AVX512-NEXT:    popq %rbp
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
entry:
  %split = shufflevector <64 x double> %a0, <64 x double> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %split1 = shufflevector <64 x double> %a0, <64 x double> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %split2 = shufflevector <64 x double> %a0, <64 x double> poison, <8 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
  %split3 = shufflevector <64 x double> %a0, <64 x double> poison, <8 x i32> <i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  %split4 = shufflevector <64 x double> %a0, <64 x double> poison, <8 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39>
  %split5 = shufflevector <64 x double> %a0, <64 x double> poison, <8 x i32> <i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
  %split6 = shufflevector <64 x double> %a0, <64 x double> poison, <8 x i32> <i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55>
  %split7 = shufflevector <64 x double> %a0, <64 x double> poison, <8 x i32> <i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
  %splat.splat = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> zeroinitializer
  %0 = fmul <8 x double> %split, %splat.splat
  %splat.splat18 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %1 = fmul <8 x double> %split1, %splat.splat18
  %2 = fadd <8 x double> %0, %1
  %splat.splat21 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
  %3 = fmul <8 x double> %split2, %splat.splat21
  %4 = fadd <8 x double> %2, %3
  %splat.splat24 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
  %5 = fmul <8 x double> %split3, %splat.splat24
  %6 = fadd <8 x double> %4, %5
  %splat.splat27 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
  %7 = fmul <8 x double> %split4, %splat.splat27
  %8 = fadd <8 x double> %6, %7
  %splat.splat30 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
  %9 = fmul <8 x double> %split5, %splat.splat30
  %10 = fadd <8 x double> %8, %9
  %splat.splat33 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6>
  %11 = fmul <8 x double> %split6, %splat.splat33
  %12 = fadd <8 x double> %10, %11
  %splat.splat36 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
  %13 = fmul <8 x double> %split7, %splat.splat36
  %14 = fadd <8 x double> %12, %13
  %splat.splat39 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
  %15 = fmul <8 x double> %split, %splat.splat39
  %splat.splat42 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9>
  %16 = fmul <8 x double> %split1, %splat.splat42
  %17 = fadd <8 x double> %15, %16
  %splat.splat45 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10>
  %18 = fmul <8 x double> %split2, %splat.splat45
  %19 = fadd <8 x double> %17, %18
  %splat.splat48 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11>
  %20 = fmul <8 x double> %split3, %splat.splat48
  %21 = fadd <8 x double> %19, %20
  %splat.splat51 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12>
  %22 = fmul <8 x double> %split4, %splat.splat51
  %23 = fadd <8 x double> %21, %22
  %splat.splat54 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13>
  %24 = fmul <8 x double> %split5, %splat.splat54
  %25 = fadd <8 x double> %23, %24
  %splat.splat57 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14>
  %26 = fmul <8 x double> %split6, %splat.splat57
  %27 = fadd <8 x double> %25, %26
  %splat.splat60 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
  %28 = fmul <8 x double> %split7, %splat.splat60
  %29 = fadd <8 x double> %27, %28
  %splat.splat63 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %30 = fmul <8 x double> %split, %splat.splat63
  %splat.splat66 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17>
  %31 = fmul <8 x double> %split1, %splat.splat66
  %32 = fadd <8 x double> %30, %31
  %splat.splat69 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18>
  %33 = fmul <8 x double> %split2, %splat.splat69
  %34 = fadd <8 x double> %32, %33
  %splat.splat72 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19>
  %35 = fmul <8 x double> %split3, %splat.splat72
  %36 = fadd <8 x double> %34, %35
  %splat.splat75 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20, i32 20>
  %37 = fmul <8 x double> %split4, %splat.splat75
  %38 = fadd <8 x double> %36, %37
  %splat.splat78 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21, i32 21>
  %39 = fmul <8 x double> %split5, %splat.splat78
  %40 = fadd <8 x double> %38, %39
  %splat.splat81 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22>
  %41 = fmul <8 x double> %split6, %splat.splat81
  %42 = fadd <8 x double> %40, %41
  %splat.splat84 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23>
  %43 = fmul <8 x double> %split7, %splat.splat84
  %44 = fadd <8 x double> %42, %43
  %splat.splat87 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24>
  %45 = fmul <8 x double> %split, %splat.splat87
  %splat.splat90 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25>
  %46 = fmul <8 x double> %split1, %splat.splat90
  %47 = fadd <8 x double> %45, %46
  %splat.splat93 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26>
  %48 = fmul <8 x double> %split2, %splat.splat93
  %49 = fadd <8 x double> %47, %48
  %splat.splat96 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27>
  %50 = fmul <8 x double> %split3, %splat.splat96
  %51 = fadd <8 x double> %49, %50
  %splat.splat99 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28>
  %52 = fmul <8 x double> %split4, %splat.splat99
  %53 = fadd <8 x double> %51, %52
  %splat.splat102 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29>
  %54 = fmul <8 x double> %split5, %splat.splat102
  %55 = fadd <8 x double> %53, %54
  %splat.splat105 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30, i32 30>
  %56 = fmul <8 x double> %split6, %splat.splat105
  %57 = fadd <8 x double> %55, %56
  %splat.splat108 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
  %58 = fmul <8 x double> %split7, %splat.splat108
  %59 = fadd <8 x double> %57, %58
  %splat.splat111 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32>
  %60 = fmul <8 x double> %split, %splat.splat111
  %splat.splat114 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33, i32 33>
  %61 = fmul <8 x double> %split1, %splat.splat114
  %62 = fadd <8 x double> %60, %61
  %splat.splat117 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34, i32 34>
  %63 = fmul <8 x double> %split2, %splat.splat117
  %64 = fadd <8 x double> %62, %63
  %splat.splat120 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35, i32 35>
  %65 = fmul <8 x double> %split3, %splat.splat120
  %66 = fadd <8 x double> %64, %65
  %splat.splat123 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36, i32 36>
  %67 = fmul <8 x double> %split4, %splat.splat123
  %68 = fadd <8 x double> %66, %67
  %splat.splat126 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37, i32 37>
  %69 = fmul <8 x double> %split5, %splat.splat126
  %70 = fadd <8 x double> %68, %69
  %splat.splat129 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38, i32 38>
  %71 = fmul <8 x double> %split6, %splat.splat129
  %72 = fadd <8 x double> %70, %71
  %splat.splat132 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39, i32 39>
  %73 = fmul <8 x double> %split7, %splat.splat132
  %74 = fadd <8 x double> %72, %73
  %splat.splat135 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40>
  %75 = fmul <8 x double> %split, %splat.splat135
  %splat.splat138 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41, i32 41>
  %76 = fmul <8 x double> %split1, %splat.splat138
  %77 = fadd <8 x double> %75, %76
  %splat.splat141 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42>
  %78 = fmul <8 x double> %split2, %splat.splat141
  %79 = fadd <8 x double> %77, %78
  %splat.splat144 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43, i32 43>
  %80 = fmul <8 x double> %split3, %splat.splat144
  %81 = fadd <8 x double> %79, %80
  %splat.splat147 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44, i32 44>
  %82 = fmul <8 x double> %split4, %splat.splat147
  %83 = fadd <8 x double> %81, %82
  %splat.splat150 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45, i32 45>
  %84 = fmul <8 x double> %split5, %splat.splat150
  %85 = fadd <8 x double> %83, %84
  %splat.splat153 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46, i32 46>
  %86 = fmul <8 x double> %split6, %splat.splat153
  %87 = fadd <8 x double> %85, %86
  %splat.splat156 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47, i32 47>
  %88 = fmul <8 x double> %split7, %splat.splat156
  %89 = fadd <8 x double> %87, %88
  %splat.splat159 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48>
  %90 = fmul <8 x double> %split, %splat.splat159
  %splat.splat162 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49, i32 49>
  %91 = fmul <8 x double> %split1, %splat.splat162
  %92 = fadd <8 x double> %90, %91
  %splat.splat165 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50, i32 50>
  %93 = fmul <8 x double> %split2, %splat.splat165
  %94 = fadd <8 x double> %92, %93
  %splat.splat168 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51, i32 51>
  %95 = fmul <8 x double> %split3, %splat.splat168
  %96 = fadd <8 x double> %94, %95
  %splat.splat171 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52, i32 52>
  %97 = fmul <8 x double> %split4, %splat.splat171
  %98 = fadd <8 x double> %96, %97
  %splat.splat174 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53, i32 53>
  %99 = fmul <8 x double> %split5, %splat.splat174
  %100 = fadd <8 x double> %98, %99
  %splat.splat177 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54, i32 54>
  %101 = fmul <8 x double> %split6, %splat.splat177
  %102 = fadd <8 x double> %100, %101
  %splat.splat180 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55, i32 55>
  %103 = fmul <8 x double> %split7, %splat.splat180
  %104 = fadd <8 x double> %102, %103
  %splat.splat183 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56>
  %105 = fmul <8 x double> %split, %splat.splat183
  %splat.splat186 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57, i32 57>
  %106 = fmul <8 x double> %split1, %splat.splat186
  %107 = fadd <8 x double> %105, %106
  %splat.splat189 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58, i32 58>
  %108 = fmul <8 x double> %split2, %splat.splat189
  %109 = fadd <8 x double> %107, %108
  %splat.splat192 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59, i32 59>
  %110 = fmul <8 x double> %split3, %splat.splat192
  %111 = fadd <8 x double> %109, %110
  %splat.splat195 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60, i32 60>
  %112 = fmul <8 x double> %split4, %splat.splat195
  %113 = fadd <8 x double> %111, %112
  %splat.splat198 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61, i32 61>
  %114 = fmul <8 x double> %split5, %splat.splat198
  %115 = fadd <8 x double> %113, %114
  %splat.splat201 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62, i32 62>
  %116 = fmul <8 x double> %split6, %splat.splat201
  %117 = fadd <8 x double> %115, %116
  %splat.splat204 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> <i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63, i32 63>
  %118 = fmul <8 x double> %split7, %splat.splat204
  %119 = fadd <8 x double> %117, %118
  %120 = shufflevector <8 x double> %14, <8 x double> %29, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %121 = shufflevector <8 x double> %44, <8 x double> %59, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %122 = shufflevector <8 x double> %74, <8 x double> %89, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %123 = shufflevector <8 x double> %104, <8 x double> %119, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %124 = shufflevector <16 x double> %120, <16 x double> %121, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  %125 = shufflevector <16 x double> %122, <16 x double> %123, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  %126 = shufflevector <32 x double> %124, <32 x double> %125, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
  ret <64 x double> %126
}