# llvm/llvm/test/tools/llvm-mca/AArch64/Neoverse/V1-forwarding.s

# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
# RUN: llvm-mca -mtriple=aarch64 -mcpu=neoverse-v1 -mattr=+sve --instruction-info=0 --resource-pressure=0 --timeline --timeline-max-iterations=2 < %s | FileCheck %s

# Scalar integer multiply-accumulate. The mul produces x0; the first two madd
# read x0 only as the addend (last operand), and the final madd also reads it
# as both multiplicands. The expected timeline recorded in this file has the
# second madd issuing 1 cycle after the first, and the final madd 2 cycles
# after the second.
# LLVM-MCA-BEGIN madd
mul  x0, x0, x0
madd x0, x1, x2, x0
madd x0, x1, x2, x0
madd x0, x0, x0, x0
# LLVM-MCA-END

# Widening integer multiply-accumulate with the same shape as the madd region:
# x0 is only the addend for the first two smaddl, then w0 is additionally used
# as both multiplicands. The expected timeline recorded in this file shows the
# second smaddl issuing 1 cycle after the first and the final one 2 cycles
# after the second.
# LLVM-MCA-BEGIN smaddl
mul    x0, x0, x0
smaddl x0, w1, w2, x0
smaddl x0, w1, w2, x0
smaddl x0, w0, w0, x0
# LLVM-MCA-END

# Scalar FP fused multiply-add fed by two different producers (fadd and fmul).
# d0 is the addend (last operand) of the first three fmadd and the first
# multiplicand of the final one. In the expected timeline recorded in this
# file, fmadd issues 2 cycles after the fadd, 1 cycle after the fmul, 2 cycles
# after an addend-chained fmadd, and 4 cycles after its predecessor when d0 is
# a multiplicand.
# LLVM-MCA-BEGIN fmadd
fadd  d0, d0, d0
fmadd d0, d1, d2, d0
fmul  d0, d0, d0
fmadd d0, d1, d2, d0
fmadd d0, d1, d2, d0
fmadd d0, d0, d1, d2
# LLVM-MCA-END

# ASIMD absolute-difference-and-accumulate. v0 is only the accumulator
# (destination) for the first two saba and is additionally a source operand in
# the last one. Expected timeline in this file: the accumulator-chained saba
# issues 1 cycle after its predecessor, the source-dependent one 4 cycles
# after.
# LLVM-MCA-BEGIN saba
mul  v0.4s, v0.4s, v0.4s
saba v0.4s, v1.4s, v2.4s
saba v0.4s, v1.4s, v2.4s
saba v0.4s, v0.4s, v1.4s
# LLVM-MCA-END

# ASIMD pairwise add-and-accumulate. The first two sadalp depend on v0 only
# through the accumulating destination; the last one also reads v0 as its
# source vector. Expected timeline in this file: 1 cycle between the two
# accumulator-chained sadalp, 4 cycles before the source-dependent one.
# LLVM-MCA-BEGIN sadalp
mul    v0.4s, v0.4s, v0.4s
sadalp v0.2d, v1.4s
sadalp v0.2d, v1.4s
sadalp v0.2d, v0.4s
# LLVM-MCA-END

# ASIMD signed dot product. v0 is only the accumulator for the first two sdot
# and is additionally the first source vector in the last one. Expected
# timeline in this file: the accumulator-chained sdot issues 1 cycle after its
# predecessor, the source-dependent one 3 cycles after.
# LLVM-MCA-BEGIN sdot
mul  v0.4s, v0.4s,  v0.4s
sdot v0.4s, v1.16b, v2.16b
sdot v0.4s, v1.16b, v2.16b
sdot v0.4s, v0.16b, v1.16b
# LLVM-MCA-END

# ASIMD signed 8-bit matrix multiply-accumulate, same shape as the sdot region:
# accumulator-only dependency twice, then v0 also as the first source vector.
# Expected timeline in this file: 1 cycle between the accumulator-chained
# smmla, 3 cycles before the source-dependent one.
# LLVM-MCA-BEGIN smmla
mul   v0.4s, v0.4s,  v0.4s
smmla v0.4s, v1.16b, v2.16b
smmla v0.4s, v1.16b, v2.16b
smmla v0.4s, v0.16b, v1.16b
# LLVM-MCA-END

# ASIMD integer multiply-accumulate. v0 is only the accumulator for the first
# two mla and is additionally a multiplicand in the last one. Expected
# timeline in this file: the accumulator-chained mla issues 1 cycle after its
# predecessor, the multiplicand-dependent one 4 cycles after.
# LLVM-MCA-BEGIN mla
mul v0.4s, v0.4s, v0.4s
mla v0.4s, v1.4s, v2.4s
mla v0.4s, v1.4s, v2.4s
mla v0.4s, v0.4s, v1.4s
# LLVM-MCA-END

# ASIMD widening multiply-accumulate (upper halves). v0 is only the
# accumulator for the first two smlal2 and is additionally a multiplicand in
# the last one. Expected timeline in this file: 1 cycle between the
# accumulator-chained smlal2, 4 cycles before the multiplicand-dependent one.
# LLVM-MCA-BEGIN smlal2
mul    v0.4s, v0.4s, v0.4s
smlal2 v0.4s, v1.8h, v2.8h
smlal2 v0.4s, v1.8h, v2.8h
smlal2 v0.4s, v0.8h, v1.8h
# LLVM-MCA-END

# ASIMD shift-right-and-accumulate. The first two ssra depend on v0 only
# through the accumulating destination; the last one also shifts v0 itself.
# Expected timeline in this file: 1 cycle between the accumulator-chained
# ssra, 4 cycles before the source-dependent one.
# LLVM-MCA-BEGIN ssra
mul  v0.4s, v0.4s, v0.4s
ssra v0.2d, v1.2d, #1
ssra v0.2d, v1.2d, #1
ssra v0.2d, v0.2d, #1
# LLVM-MCA-END

# ASIMD FP complex multiply-accumulate fed by an fmul. v0 is only the
# accumulator for the first two fcmla and is additionally the first source in
# the last one. Expected timeline in this file: the first fcmla issues
# 3 cycles after the fmul, the accumulator-chained fcmla 2 cycles after its
# predecessor, and the source-dependent one 4 cycles after.
# LLVM-MCA-BEGIN fcmla
fmul  v0.4s, v0.4s, v0.4s
fcmla v0.2d, v1.2d, v2.2d, #90
fcmla v0.2d, v1.2d, v2.2d, #90
fcmla v0.2d, v0.2d, v1.2d, #90
# LLVM-MCA-END

# ASIMD FP multiply-accumulate fed by two different producers (fmul and fadd).
# v0 is only the accumulator for the first three fmla and is additionally a
# multiplicand in the final one. In the expected timeline recorded in this
# file, fmla issues 1 cycle after the fmul, 2 cycles after the fadd, 2 cycles
# after an accumulator-chained fmla, and 4 cycles after its predecessor when
# v0 is a multiplicand.
# LLVM-MCA-BEGIN fmla
fmul v0.2d, v0.2d, v0.2d
fmla v0.2d, v1.2d, v2.2d
fadd v0.2d, v0.2d, v0.2d
fmla v0.2d, v1.2d, v2.2d
fmla v0.2d, v1.2d, v2.2d
fmla v0.2d, v0.2d, v1.2d
# LLVM-MCA-END

# ASIMD widening FP multiply-accumulate with the same shape as the fmla
# region. In the expected timeline recorded in this file, fmlal issues
# 3 cycles after the fmul, 2 cycles after the fadd, 2 cycles after an
# accumulator-chained fmlal, and 5 cycles after its predecessor when v0 is a
# multiplicand.
# LLVM-MCA-BEGIN fmlal
fmul  v0.2d, v0.2d, v0.2d
fmlal v0.4s, v1.4h, v2.4h
fadd  v0.2d, v0.2d, v0.2d
fmlal v0.4s, v1.4h, v2.4h
fmlal v0.4s, v1.4h, v2.4h
fmlal v0.4s, v0.4h, v1.4h
# LLVM-MCA-END

# ASIMD BFloat16 dot product fed by an fmul. v0 is only the accumulator for
# the first two bfdot and is additionally the first source in the last one.
# Expected timeline in this file: the first bfdot issues 3 cycles after the
# fmul, the accumulator-chained bfdot 2 cycles after its predecessor, and the
# source-dependent one 4 cycles after.
# LLVM-MCA-BEGIN bfdot
fmul  v0.2d, v0.2d, v0.2d
bfdot v0.4s, v1.8h, v2.8h
bfdot v0.4s, v1.8h, v2.8h
bfdot v0.4s, v0.8h, v1.8h
# LLVM-MCA-END

# ASIMD BFloat16 matrix multiply-accumulate fed by an fmul. v0 is only the
# accumulator for the first two bfmmla and is additionally the first source in
# the last one. Expected timeline in this file: the first bfmmla issues
# 3 cycles after the fmul, the accumulator-chained bfmmla 3 cycles after its
# predecessor, and the source-dependent one 5 cycles after.
# LLVM-MCA-BEGIN bfmmla
fmul   v0.2d, v0.2d, v0.2d
bfmmla v0.4s, v1.8h, v2.8h
bfmmla v0.4s, v1.8h, v2.8h
bfmmla v0.4s, v0.8h, v1.8h
# LLVM-MCA-END

# ASIMD BFloat16 widening multiply-accumulate (bottom elements) fed by an
# fmul. v0 is only the accumulator for the first two bfmlalb and is
# additionally the first source in the last one.
# LLVM-MCA-BEGIN bfmlalb
fmul    v0.2d, v0.2d, v0.2d
bfmlalb v0.4s, v1.8h, v2.8h
bfmlalb v0.4s, v1.8h, v2.8h
bfmlalb v0.4s, v0.8h, v1.8h
# LLVM-MCA-END

# CRC32C byte update fed by a mul. Every crc32cb reads w0 as its first source
# and writes it back, forming a chain through that operand; the last one also
# passes w0 as the second (data) source.
# LLVM-MCA-BEGIN crc32cb
mul    w0, w0, w0
crc32cb w0, w0, w1
crc32cb w0, w0, w1
crc32cb w0, w0, w0
# LLVM-MCA-END

# SVE signed dot product accumulating into 32-bit elements, fed by a
# predicated SVE mul. z0 is only the accumulator for the first two sdot and is
# additionally the first source vector in the last one.
# LLVM-MCA-BEGIN Z sdot.s
mul z0.d, p0/m, z0.d, z0.d
sdot z0.s, z1.b, z2.b
sdot z0.s, z1.b, z2.b
sdot z0.s, z0.b, z1.b
# LLVM-MCA-END

# SVE indexed dot product fed by a predicated SVE mul. z0 is only the
# accumulator for the first two instructions and is additionally the first
# source vector in the last one.
# NOTE(review): the region is named sudot but the instructions are the indexed
# form of sdot. Renaming the region or switching the mnemonic changes the tool
# output, so the expected output must be regenerated with
# utils/update_mca_test_checks.py if this is corrected.
# LLVM-MCA-BEGIN Z sudot
mul z0.d, p0/m, z0.d, z0.d
sdot z0.s, z1.b, z2.b[1]
sdot z0.s, z1.b, z2.b[1]
sdot z0.s, z0.b, z1.b[1]
# LLVM-MCA-END

# SVE signed dot product accumulating into 64-bit elements (halfword sources),
# fed by a predicated SVE mul. z0 is only the accumulator for the first two
# sdot and is additionally the first source vector in the last one.
# LLVM-MCA-BEGIN Z sdot.d
mul z0.d, p0/m, z0.d, z0.d
sdot z0.d, z1.h, z2.h
sdot z0.d, z1.h, z2.h
sdot z0.d, z0.h, z1.h
# LLVM-MCA-END

# SVE signed 8-bit matrix multiply-accumulate fed by a predicated SVE mul.
# z0 is only the accumulator for the first two smmla and is additionally the
# first source vector in the last one.
# LLVM-MCA-BEGIN Z smmla
mul z0.d, p0/m, z0.d, z0.d
smmla z0.s, z1.b, z2.b
smmla z0.s, z1.b, z2.b
smmla z0.s, z0.b, z1.b
# LLVM-MCA-END

# SVE predicated integer multiply-accumulate fed by a predicated SVE mul.
# z0 is only the accumulating destination for the first two mla and is
# additionally the first multiplicand in the last one.
# LLVM-MCA-BEGIN Z mla.d
mul z0.d, p0/m, z0.d, z0.d
mla z0.d, p0/m, z1.d, z2.d
mla z0.d, p0/m, z1.d, z2.d
mla z0.d, p0/m, z0.d, z1.d
# LLVM-MCA-END

# SVE predicated multiply-add (mad) fed by a predicated SVE mul. Each mad
# reads and overwrites z0 as its destructive first operand; the last one also
# passes z0 as the second source.
# NOTE(review): in the SVE ISA the destructive operand of mad is a
# multiplicand and the final operand is the addend, unlike mla - confirm the
# intended dependency when interpreting the expected timeline.
# LLVM-MCA-BEGIN Z mad.d
mul z0.d, p0/m, z0.d, z0.d
mad z0.d, p0/m, z1.d, z2.d
mad z0.d, p0/m, z1.d, z2.d
mad z0.d, p0/m, z0.d, z1.d
# LLVM-MCA-END

# SVE predicated multiply-subtract (msb) fed by a predicated SVE mul. Each msb
# reads and overwrites z0 as its destructive first operand; the last one also
# passes z0 as the second source.
# NOTE(review): in the SVE ISA the destructive operand of msb is a
# multiplicand and the final operand is the minuend, unlike mls - confirm the
# intended dependency when interpreting the expected timeline.
# LLVM-MCA-BEGIN Z msb.d
mul z0.d, p0/m, z0.d, z0.d
msb z0.d, p0/m, z1.d, z2.d
msb z0.d, p0/m, z1.d, z2.d
msb z0.d, p0/m, z0.d, z1.d
# LLVM-MCA-END

# SVE predicated (vector x vector) FP complex multiply-accumulate fed by an
# unpredicated SVE fmul. z0 is only the accumulator for the first two fcmla
# and is additionally the first source vector in the last one.
# LLVM-MCA-BEGIN Z fcmla ZPmZZ
fmul  z0.d, z0.d, z0.d
fcmla z0.d, p0/m, z1.d, z2.d, 90
fcmla z0.d, p0/m, z1.d, z2.d, 90
fcmla z0.d, p0/m, z0.d, z1.d, 90
# LLVM-MCA-END

# SVE indexed (vector x element) FP complex multiply-accumulate fed by an
# unpredicated SVE fmul. z0 is only the accumulator for the first two fcmla
# and is additionally the first source vector in the last one.
# LLVM-MCA-BEGIN Z fcmla ZZZI
fmul  z0.d, z0.d, z0.d
fcmla z0.s, z1.s, z2.s[1], 90
fcmla z0.s, z1.s, z2.s[1], 90
fcmla z0.s, z0.s, z1.s[1], 90
# LLVM-MCA-END

# SVE predicated (vector x vector) FP multiply-accumulate fed by an
# unpredicated SVE fmul. z0 is only the accumulator for the first two fmla and
# is additionally the first multiplicand in the last one.
# LLVM-MCA-BEGIN Z fmla ZPmZZ
fmul z0.d, z0.d, z0.d
fmla z0.d, p0/m, z1.d, z2.d
fmla z0.d, p0/m, z1.d, z2.d
fmla z0.d, p0/m, z0.d, z1.d
# LLVM-MCA-END

# SVE indexed (vector x element) FP multiply-accumulate fed by an
# unpredicated SVE fmul. z0 is only the accumulator for the first two fmla and
# is additionally the first multiplicand in the last one.
# LLVM-MCA-BEGIN Z fmla ZZZI
fmul z0.d, z0.d, z0.d
fmla z0.d, z1.d, z2.d[1]
fmla z0.d, z1.d, z2.d[1]
fmla z0.d, z0.d, z1.d[1]
# LLVM-MCA-END

# SVE BFloat16 dot product fed by an unpredicated SVE fmul. z0 is only the
# accumulator for the first two bfdot and is additionally the first source
# vector in the last one.
# LLVM-MCA-BEGIN Z bfdot
fmul  z0.d, z0.d, z0.d
bfdot z0.s, z1.h, z2.h
bfdot z0.s, z1.h, z2.h
bfdot z0.s, z0.h, z1.h
# LLVM-MCA-END

# SVE BFloat16 matrix multiply-accumulate fed by an unpredicated SVE fmul.
# z0 is only the accumulator for the first two bfmmla and is additionally the
# first source vector in the last one.
# LLVM-MCA-BEGIN Z bfmmla
fmul   z0.d, z0.d, z0.d
bfmmla z0.s, z1.h, z2.h
bfmmla z0.s, z1.h, z2.h
bfmmla z0.s, z0.h, z1.h
# LLVM-MCA-END

# SVE BFloat16 widening multiply-accumulate (bottom elements) fed by an
# unpredicated SVE fmul. z0 is only the accumulator for the first two bfmlalb
# and is additionally the first source vector in the last one.
# NOTE(review): this region reuses the name of the earlier ASIMD bfmlalb
# region and lacks the Z prefix carried by the other SVE regions. Renaming it
# changes the tool output, so the expected output must be regenerated with
# utils/update_mca_test_checks.py if this is corrected.
# LLVM-MCA-BEGIN bfmlalb
fmul    z0.d, z0.d, z0.d
bfmlalb z0.s, z1.h, z2.h
bfmlalb z0.s, z1.h, z2.h
bfmlalb z0.s, z0.h, z1.h
# LLVM-MCA-END

# CHECK:      [0] Code Region - madd

# CHECK:      Iterations:        100
# CHECK-NEXT: Instructions:      400
# CHECK-NEXT: Total Cycles:      703
# CHECK-NEXT: Total uOps:        400

# CHECK:      Dispatch Width:    15
# CHECK-NEXT: uOps Per Cycle:    0.57
# CHECK-NEXT: IPC:               0.57
# CHECK-NEXT: Block RThroughput: 3.0

# CHECK:      Timeline view:
# CHECK-NEXT:                     0123456
# CHECK-NEXT: Index     0123456789

# CHECK:      [0,0]     DeeER.    .    ..   mul	x0, x0, x0
# CHECK-NEXT: [0,1]     D==eeER   .    ..   madd	x0, x1, x2, x0
# CHECK-NEXT: [0,2]     D===eeER  .    ..   madd	x0, x1, x2, x0
# CHECK-NEXT: [0,3]     D=====eeER.    ..   madd	x0, x0, x0, x0
# CHECK-NEXT: [1,0]     D=======eeER   ..   mul	x0, x0, x0
# CHECK-NEXT: [1,1]     D=========eeER ..   madd	x0, x1, x2, x0
# CHECK-NEXT: [1,2]     D==========eeER..   madd	x0, x1, x2, x0
# CHECK-NEXT: [1,3]     D============eeER   madd	x0, x0, x0, x0

# CHECK:      Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage

# CHECK:            [0]    [1]    [2]    [3]
# CHECK-NEXT: 0.     2     4.5    0.5    0.0       mul	x0, x0, x0
# CHECK-NEXT: 1.     2     6.5    0.0    0.0       madd	x0, x1, x2, x0
# CHECK-NEXT: 2.     2     7.5    0.0    0.0       madd	x0, x1, x2, x0
# CHECK-NEXT: 3.     2     9.5    0.0    0.0       madd	x0, x0, x0, x0
# CHECK-NEXT:        2     7.0    0.1    0.0       <total>

# CHECK:      [1] Code Region - smaddl

# CHECK:      Iterations:        100
# CHECK-NEXT: Instructions:      400
# CHECK-NEXT: Total Cycles:      703
# CHECK-NEXT: Total uOps:        400

# CHECK:      Dispatch Width:    15
# CHECK-NEXT: uOps Per Cycle:    0.57
# CHECK-NEXT: IPC:               0.57
# CHECK-NEXT: Block RThroughput: 3.0

# CHECK:      Timeline view:
# CHECK-NEXT:                     0123456
# CHECK-NEXT: Index     0123456789

# CHECK:      [0,0]     DeeER.    .    ..   mul	x0, x0, x0
# CHECK-NEXT: [0,1]     D==eeER   .    ..   smaddl	x0, w1, w2, x0
# CHECK-NEXT: [0,2]     D===eeER  .    ..   smaddl	x0, w1, w2, x0
# CHECK-NEXT: [0,3]     D=====eeER.    ..   smaddl	x0, w0, w0, x0
# CHECK-NEXT: [1,0]     D=======eeER   ..   mul	x0, x0, x0
# CHECK-NEXT: [1,1]     D=========eeER ..   smaddl	x0, w1, w2, x0
# CHECK-NEXT: [1,2]     D==========eeER..   smaddl	x0, w1, w2, x0
# CHECK-NEXT: [1,3]     D============eeER   smaddl	x0, w0, w0, x0

# CHECK:      Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage

# CHECK:            [0]    [1]    [2]    [3]
# CHECK-NEXT: 0.     2     4.5    0.5    0.0       mul	x0, x0, x0
# CHECK-NEXT: 1.     2     6.5    0.0    0.0       smaddl	x0, w1, w2, x0
# CHECK-NEXT: 2.     2     7.5    0.0    0.0       smaddl	x0, w1, w2, x0
# CHECK-NEXT: 3.     2     9.5    0.0    0.0       smaddl	x0, w0, w0, x0
# CHECK-NEXT:        2     7.0    0.1    0.0       <total>

# CHECK:      [2] Code Region - fmadd

# CHECK:      Iterations:        100
# CHECK-NEXT: Instructions:      600
# CHECK-NEXT: Total Cycles:      1703
# CHECK-NEXT: Total uOps:        600

# CHECK:      Dispatch Width:    15
# CHECK-NEXT: uOps Per Cycle:    0.35
# CHECK-NEXT: IPC:               0.35
# CHECK-NEXT: Block RThroughput: 1.5

# CHECK:      Timeline view:
# CHECK-NEXT:                     0123456789          0123456
# CHECK-NEXT: Index     0123456789          0123456789

# CHECK:      [0,0]     DeeER.    .    .    .    .    .    ..   fadd	d0, d0, d0
# CHECK-NEXT: [0,1]     D==eeeeER .    .    .    .    .    ..   fmadd	d0, d1, d2, d0
# CHECK-NEXT: [0,2]     D======eeeER   .    .    .    .    ..   fmul	d0, d0, d0
# CHECK-NEXT: [0,3]     D=======eeeeER .    .    .    .    ..   fmadd	d0, d1, d2, d0
# CHECK-NEXT: [0,4]     D=========eeeeER    .    .    .    ..   fmadd	d0, d1, d2, d0
# CHECK-NEXT: [0,5]     D=============eeeeER.    .    .    ..   fmadd	d0, d0, d1, d2
# CHECK-NEXT: [1,0]     D=================eeER   .    .    ..   fadd	d0, d0, d0
# CHECK-NEXT: [1,1]     D===================eeeeER    .    ..   fmadd	d0, d1, d2, d0
# CHECK-NEXT: [1,2]     D=======================eeeER .    ..   fmul	d0, d0, d0
# CHECK-NEXT: [1,3]     D========================eeeeER    ..   fmadd	d0, d1, d2, d0
# CHECK-NEXT: [1,4]     D==========================eeeeER  ..   fmadd	d0, d1, d2, d0
# CHECK-NEXT: [1,5]     D==============================eeeeER   fmadd	d0, d0, d1, d2

# CHECK:      Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage

# CHECK:            [0]    [1]    [2]    [3]
# CHECK-NEXT: 0.     2     9.5    0.5    0.0       fadd	d0, d0, d0
# CHECK-NEXT: 1.     2     11.5   0.0    0.0       fmadd	d0, d1, d2, d0
# CHECK-NEXT: 2.     2     15.5   0.0    0.0       fmul	d0, d0, d0
# CHECK-NEXT: 3.     2     16.5   0.0    0.0       fmadd	d0, d1, d2, d0
# CHECK-NEXT: 4.     2     18.5   0.0    0.0       fmadd	d0, d1, d2, d0
# CHECK-NEXT: 5.     2     22.5   0.0    0.0       fmadd	d0, d0, d1, d2
# CHECK-NEXT:        2     15.7   0.1    0.0       <total>

# CHECK:      [3] Code Region - saba

# CHECK:      Iterations:        100
# CHECK-NEXT: Instructions:      400
# CHECK-NEXT: Total Cycles:      1303
# CHECK-NEXT: Total uOps:        400

# CHECK:      Dispatch Width:    15
# CHECK-NEXT: uOps Per Cycle:    0.31
# CHECK-NEXT: IPC:               0.31
# CHECK-NEXT: Block RThroughput: 1.5

# CHECK:      Timeline view:
# CHECK-NEXT:                     0123456789
# CHECK-NEXT: Index     0123456789          012345678

# CHECK:      [0,0]     DeeeeER   .    .    .    .  .   mul	v0.4s, v0.4s, v0.4s
# CHECK-NEXT: [0,1]     D====eeeeER    .    .    .  .   saba	v0.4s, v1.4s, v2.4s
# CHECK-NEXT: [0,2]     D=====eeeeER   .    .    .  .   saba	v0.4s, v1.4s, v2.4s
# CHECK-NEXT: [0,3]     D=========eeeeER    .    .  .   saba	v0.4s, v0.4s, v1.4s
# CHECK-NEXT: [1,0]     D=============eeeeER.    .  .   mul	v0.4s, v0.4s, v0.4s
# CHECK-NEXT: [1,1]     D=================eeeeER .  .   saba	v0.4s, v1.4s, v2.4s
# CHECK-NEXT: [1,2]     D==================eeeeER.  .   saba	v0.4s, v1.4s, v2.4s
# CHECK-NEXT: [1,3]     D======================eeeeER   saba	v0.4s, v0.4s, v1.4s

# CHECK:      Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage

# CHECK:            [0]    [1]    [2]    [3]
# CHECK-NEXT: 0.     2     7.5    0.5    0.0       mul	v0.4s, v0.4s, v0.4s
# CHECK-NEXT: 1.     2     11.5   0.0    0.0       saba	v0.4s, v1.4s, v2.4s
# CHECK-NEXT: 2.     2     12.5   0.0    0.0       saba	v0.4s, v1.4s, v2.4s
# CHECK-NEXT: 3.     2     16.5   0.0    0.0       saba	v0.4s, v0.4s, v1.4s
# CHECK-NEXT:        2     12.0   0.1    0.0       <total>

# CHECK:      [4] Code Region - sadalp

# CHECK:      Iterations:        100
# CHECK-NEXT: Instructions:      400
# CHECK-NEXT: Total Cycles:      1303
# CHECK-NEXT: Total uOps:        400

# CHECK:      Dispatch Width:    15
# CHECK-NEXT: uOps Per Cycle:    0.31
# CHECK-NEXT: IPC:               0.31
# CHECK-NEXT: Block RThroughput: 1.5

# CHECK:      Timeline view:
# CHECK-NEXT:                     0123456789
# CHECK-NEXT: Index     0123456789          012345678

# CHECK:      [0,0]     DeeeeER   .    .    .    .  .   mul	v0.4s, v0.4s, v0.4s
# CHECK-NEXT: [0,1]     D====eeeeER    .    .    .  .   sadalp	v0.2d, v1.4s
# CHECK-NEXT: [0,2]     D=====eeeeER   .    .    .  .   sadalp	v0.2d, v1.4s
# CHECK-NEXT: [0,3]     D=========eeeeER    .    .  .   sadalp	v0.2d, v0.4s
# CHECK-NEXT: [1,0]     D=============eeeeER.    .  .   mul	v0.4s, v0.4s, v0.4s
# CHECK-NEXT: [1,1]     D=================eeeeER .  .   sadalp	v0.2d, v1.4s
# CHECK-NEXT: [1,2]     D==================eeeeER.  .   sadalp	v0.2d, v1.4s
# CHECK-NEXT: [1,3]     D======================eeeeER   sadalp	v0.2d, v0.4s

# CHECK:      Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage

# CHECK:            [0]    [1]    [2]    [3]
# CHECK-NEXT: 0.     2     7.5    0.5    0.0       mul	v0.4s, v0.4s, v0.4s
# CHECK-NEXT: 1.     2     11.5   0.0    0.0       sadalp	v0.2d, v1.4s
# CHECK-NEXT: 2.     2     12.5   0.0    0.0       sadalp	v0.2d, v1.4s
# CHECK-NEXT: 3.     2     16.5   0.0    0.0       sadalp	v0.2d, v0.4s
# CHECK-NEXT:        2     12.0   0.1    0.0       <total>

# CHECK:      [5] Code Region - sdot

# CHECK:      Iterations:        100
# CHECK-NEXT: Instructions:      400
# CHECK-NEXT: Total Cycles:      1103
# CHECK-NEXT: Total uOps:        400

# CHECK:      Dispatch Width:    15
# CHECK-NEXT: uOps Per Cycle:    0.36
# CHECK-NEXT: IPC:               0.36
# CHECK-NEXT: Block RThroughput: 0.8

# CHECK:      Timeline view:
# CHECK-NEXT:                     0123456789
# CHECK-NEXT: Index     0123456789          01234

# CHECK:      [0,0]     DeeeeER   .    .    .   .   mul	v0.4s, v0.4s, v0.4s
# CHECK-NEXT: [0,1]     D====eeeER.    .    .   .   sdot	v0.4s, v1.16b, v2.16b
# CHECK-NEXT: [0,2]     D=====eeeER    .    .   .   sdot	v0.4s, v1.16b, v2.16b
# CHECK-NEXT: [0,3]     D========eeeER .    .   .   sdot	v0.4s, v0.16b, v1.16b
# CHECK-NEXT: [1,0]     D===========eeeeER  .   .   mul	v0.4s, v0.4s, v0.4s
# CHECK-NEXT: [1,1]     D===============eeeER   .   sdot	v0.4s, v1.16b, v2.16b
# CHECK-NEXT: [1,2]     D================eeeER  .   sdot	v0.4s, v1.16b, v2.16b
# CHECK-NEXT: [1,3]     D===================eeeER   sdot	v0.4s, v0.16b, v1.16b

# CHECK:      Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage

# CHECK:            [0]    [1]    [2]    [3]
# CHECK-NEXT: 0.     2     6.5    0.5    0.0       mul	v0.4s, v0.4s, v0.4s
# CHECK-NEXT: 1.     2     10.5   0.0    0.0       sdot	v0.4s, v1.16b, v2.16b
# CHECK-NEXT: 2.     2     11.5   0.0    0.0       sdot	v0.4s, v1.16b, v2.16b
# CHECK-NEXT: 3.     2     14.5   0.0    0.0       sdot	v0.4s, v0.16b, v1.16b
# CHECK-NEXT:        2     10.8   0.1    0.0       <total>

# CHECK:      [6] Code Region - smmla

# CHECK:      Iterations:        100
# CHECK-NEXT: Instructions:      400
# CHECK-NEXT: Total Cycles:      1103
# CHECK-NEXT: Total uOps:        400

# CHECK:      Dispatch Width:    15
# CHECK-NEXT: uOps Per Cycle:    0.36
# CHECK-NEXT: IPC:               0.36
# CHECK-NEXT: Block RThroughput: 0.8

# CHECK:      Timeline view:
# CHECK-NEXT:                     0123456789
# CHECK-NEXT: Index     0123456789          01234

# CHECK:      [0,0]     DeeeeER   .    .    .   .   mul	v0.4s, v0.4s, v0.4s
# CHECK-NEXT: [0,1]     D====eeeER.    .    .   .   smmla	v0.4s, v1.16b, v2.16b
# CHECK-NEXT: [0,2]     D=====eeeER    .    .   .   smmla	v0.4s, v1.16b, v2.16b
# CHECK-NEXT: [0,3]     D========eeeER .    .   .   smmla	v0.4s, v0.16b, v1.16b
# CHECK-NEXT: [1,0]     D===========eeeeER  .   .   mul	v0.4s, v0.4s, v0.4s
# CHECK-NEXT: [1,1]     D===============eeeER   .   smmla	v0.4s, v1.16b, v2.16b
# CHECK-NEXT: [1,2]     D================eeeER  .   smmla	v0.4s, v1.16b, v2.16b
# CHECK-NEXT: [1,3]     D===================eeeER   smmla	v0.4s, v0.16b, v1.16b

# CHECK:      Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage

# CHECK:            [0]    [1]    [2]    [3]
# CHECK-NEXT: 0.     2     6.5    0.5    0.0       mul	v0.4s, v0.4s, v0.4s
# CHECK-NEXT: 1.     2     10.5   0.0    0.0       smmla	v0.4s, v1.16b, v2.16b
# CHECK-NEXT: 2.     2     11.5   0.0    0.0       smmla	v0.4s, v1.16b, v2.16b
# CHECK-NEXT: 3.     2     14.5   0.0    0.0       smmla	v0.4s, v0.16b, v1.16b
# CHECK-NEXT:        2     10.8   0.1    0.0       <total>

# CHECK:      [7] Code Region - mla

# CHECK:      Iterations:        100
# CHECK-NEXT: Instructions:      400
# CHECK-NEXT: Total Cycles:      1303
# CHECK-NEXT: Total uOps:        400

# CHECK:      Dispatch Width:    15
# CHECK-NEXT: uOps Per Cycle:    0.31
# CHECK-NEXT: IPC:               0.31
# CHECK-NEXT: Block RThroughput: 2.0

# CHECK:      Timeline view:
# CHECK-NEXT:                     0123456789
# CHECK-NEXT: Index     0123456789          012345678

# CHECK:      [0,0]     DeeeeER   .    .    .    .  .   mul	v0.4s, v0.4s, v0.4s
# CHECK-NEXT: [0,1]     D====eeeeER    .    .    .  .   mla	v0.4s, v1.4s, v2.4s
# CHECK-NEXT: [0,2]     D=====eeeeER   .    .    .  .   mla	v0.4s, v1.4s, v2.4s
# CHECK-NEXT: [0,3]     D=========eeeeER    .    .  .   mla	v0.4s, v0.4s, v1.4s
# CHECK-NEXT: [1,0]     D=============eeeeER.    .  .   mul	v0.4s, v0.4s, v0.4s
# CHECK-NEXT: [1,1]     D=================eeeeER .  .   mla	v0.4s, v1.4s, v2.4s
# CHECK-NEXT: [1,2]     D==================eeeeER.  .   mla	v0.4s, v1.4s, v2.4s
# CHECK-NEXT: [1,3]     D======================eeeeER   mla	v0.4s, v0.4s, v1.4s

# CHECK:      Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage

# CHECK:            [0]    [1]    [2]    [3]
# CHECK-NEXT: 0.     2     7.5    0.5    0.0       mul	v0.4s, v0.4s, v0.4s
# CHECK-NEXT: 1.     2     11.5   0.0    0.0       mla	v0.4s, v1.4s, v2.4s
# CHECK-NEXT: 2.     2     12.5   0.0    0.0       mla	v0.4s, v1.4s, v2.4s
# CHECK-NEXT: 3.     2     16.5   0.0    0.0       mla	v0.4s, v0.4s, v1.4s
# CHECK-NEXT:        2     12.0   0.1    0.0       <total>

# CHECK:      [8] Code Region - smlal2

# CHECK:      Iterations:        100
# CHECK-NEXT: Instructions:      400
# CHECK-NEXT: Total Cycles:      1303
# CHECK-NEXT: Total uOps:        400

# CHECK:      Dispatch Width:    15
# CHECK-NEXT: uOps Per Cycle:    0.31
# CHECK-NEXT: IPC:               0.31
# CHECK-NEXT: Block RThroughput: 2.0

# CHECK:      Timeline view:
# CHECK-NEXT:                     0123456789
# CHECK-NEXT: Index     0123456789          012345678

# CHECK:      [0,0]     DeeeeER   .    .    .    .  .   mul	v0.4s, v0.4s, v0.4s
# CHECK-NEXT: [0,1]     D====eeeeER    .    .    .  .   smlal2	v0.4s, v1.8h, v2.8h
# CHECK-NEXT: [0,2]     D=====eeeeER   .    .    .  .   smlal2	v0.4s, v1.8h, v2.8h
# CHECK-NEXT: [0,3]     D=========eeeeER    .    .  .   smlal2	v0.4s, v0.8h, v1.8h
# CHECK-NEXT: [1,0]     D=============eeeeER.    .  .   mul	v0.4s, v0.4s, v0.4s
# CHECK-NEXT: [1,1]     D=================eeeeER .  .   smlal2	v0.4s, v1.8h, v2.8h
# CHECK-NEXT: [1,2]     D==================eeeeER.  .   smlal2	v0.4s, v1.8h, v2.8h
# CHECK-NEXT: [1,3]     D======================eeeeER   smlal2	v0.4s, v0.8h, v1.8h

# CHECK:      Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage

# CHECK:            [0]    [1]    [2]    [3]
# CHECK-NEXT: 0.     2     7.5    0.5    0.0       mul	v0.4s, v0.4s, v0.4s
# CHECK-NEXT: 1.     2     11.5   0.0    0.0       smlal2	v0.4s, v1.8h, v2.8h
# CHECK-NEXT: 2.     2     12.5   0.0    0.0       smlal2	v0.4s, v1.8h, v2.8h
# CHECK-NEXT: 3.     2     16.5   0.0    0.0       smlal2	v0.4s, v0.8h, v1.8h
# CHECK-NEXT:        2     12.0   0.1    0.0       <total>

# CHECK:      [9] Code Region - ssra

# CHECK:      Iterations:        100
# CHECK-NEXT: Instructions:      400
# CHECK-NEXT: Total Cycles:      1303
# CHECK-NEXT: Total uOps:        400

# CHECK:      Dispatch Width:    15
# CHECK-NEXT: uOps Per Cycle:    0.31
# CHECK-NEXT: IPC:               0.31
# CHECK-NEXT: Block RThroughput: 1.5

# CHECK:      Timeline view:
# CHECK-NEXT:                     0123456789
# CHECK-NEXT: Index     0123456789          012345678

# CHECK:      [0,0]     DeeeeER   .    .    .    .  .   mul	v0.4s, v0.4s, v0.4s
# CHECK-NEXT: [0,1]     D====eeeeER    .    .    .  .   ssra	v0.2d, v1.2d, #1
# CHECK-NEXT: [0,2]     D=====eeeeER   .    .    .  .   ssra	v0.2d, v1.2d, #1
# CHECK-NEXT: [0,3]     D=========eeeeER    .    .  .   ssra	v0.2d, v0.2d, #1
# CHECK-NEXT: [1,0]     D=============eeeeER.    .  .   mul	v0.4s, v0.4s, v0.4s
# CHECK-NEXT: [1,1]     D=================eeeeER .  .   ssra	v0.2d, v1.2d, #1
# CHECK-NEXT: [1,2]     D==================eeeeER.  .   ssra	v0.2d, v1.2d, #1
# CHECK-NEXT: [1,3]     D======================eeeeER   ssra	v0.2d, v0.2d, #1

# CHECK:      Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage

# CHECK:            [0]    [1]    [2]    [3]
# CHECK-NEXT: 0.     2     7.5    0.5    0.0       mul	v0.4s, v0.4s, v0.4s
# CHECK-NEXT: 1.     2     11.5   0.0    0.0       ssra	v0.2d, v1.2d, #1
# CHECK-NEXT: 2.     2     12.5   0.0    0.0       ssra	v0.2d, v1.2d, #1
# CHECK-NEXT: 3.     2     16.5   0.0    0.0       ssra	v0.2d, v0.2d, #1
# CHECK-NEXT:        2     12.0   0.1    0.0       <total>

# CHECK:      [10] Code Region - fcmla

# CHECK:      Iterations:        100
# CHECK-NEXT: Instructions:      400
# CHECK-NEXT: Total Cycles:      1303
# CHECK-NEXT: Total uOps:        400

# CHECK:      Dispatch Width:    15
# CHECK-NEXT: uOps Per Cycle:    0.31
# CHECK-NEXT: IPC:               0.31
# CHECK-NEXT: Block RThroughput: 1.0

# CHECK:      Timeline view:
# CHECK-NEXT:                     0123456789
# CHECK-NEXT: Index     0123456789          012345678

# CHECK:      [0,0]     DeeeER    .    .    .    .  .   fmul	v0.4s, v0.4s, v0.4s
# CHECK-NEXT: [0,1]     D===eeeeER.    .    .    .  .   fcmla	v0.2d, v1.2d, v2.2d, #90
# CHECK-NEXT: [0,2]     D=====eeeeER   .    .    .  .   fcmla	v0.2d, v1.2d, v2.2d, #90
# CHECK-NEXT: [0,3]     D=========eeeeER    .    .  .   fcmla	v0.2d, v0.2d, v1.2d, #90
# CHECK-NEXT: [1,0]     D=============eeeER .    .  .   fmul	v0.4s, v0.4s, v0.4s
# CHECK-NEXT: [1,1]     D================eeeeER  .  .   fcmla	v0.2d, v1.2d, v2.2d, #90
# CHECK-NEXT: [1,2]     D==================eeeeER.  .   fcmla	v0.2d, v1.2d, v2.2d, #90
# CHECK-NEXT: [1,3]     D======================eeeeER   fcmla	v0.2d, v0.2d, v1.2d, #90

# CHECK:      Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage

# CHECK:            [0]    [1]    [2]    [3]
# CHECK-NEXT: 0.     2     7.5    0.5    0.0       fmul	v0.4s, v0.4s, v0.4s
# CHECK-NEXT: 1.     2     10.5   0.0    0.0       fcmla	v0.2d, v1.2d, v2.2d, #90
# CHECK-NEXT: 2.     2     12.5   0.0    0.0       fcmla	v0.2d, v1.2d, v2.2d, #90
# CHECK-NEXT: 3.     2     16.5   0.0    0.0       fcmla	v0.2d, v0.2d, v1.2d, #90
# CHECK-NEXT:        2     11.8   0.1    0.0       <total>

# CHECK:      [11] Code Region - fmla

# CHECK:      Iterations:        100
# CHECK-NEXT: Instructions:      600
# CHECK-NEXT: Total Cycles:      1703
# CHECK-NEXT: Total uOps:        600

# CHECK:      Dispatch Width:    15
# CHECK-NEXT: uOps Per Cycle:    0.35
# CHECK-NEXT: IPC:               0.35
# CHECK-NEXT: Block RThroughput: 1.5

# CHECK:      Timeline view:
# CHECK-NEXT:                     0123456789          0123456
# CHECK-NEXT: Index     0123456789          0123456789

# CHECK:      [0,0]     DeeeER    .    .    .    .    .    ..   fmul	v0.2d, v0.2d, v0.2d
# CHECK-NEXT: [0,1]     D=eeeeER  .    .    .    .    .    ..   fmla	v0.2d, v1.2d, v2.2d
# CHECK-NEXT: [0,2]     D=====eeER.    .    .    .    .    ..   fadd	v0.2d, v0.2d, v0.2d
# CHECK-NEXT: [0,3]     D=======eeeeER .    .    .    .    ..   fmla	v0.2d, v1.2d, v2.2d
# CHECK-NEXT: [0,4]     D=========eeeeER    .    .    .    ..   fmla	v0.2d, v1.2d, v2.2d
# CHECK-NEXT: [0,5]     D=============eeeeER.    .    .    ..   fmla	v0.2d, v0.2d, v1.2d
# CHECK-NEXT: [1,0]     D=================eeeER  .    .    ..   fmul	v0.2d, v0.2d, v0.2d
# CHECK-NEXT: [1,1]     D==================eeeeER.    .    ..   fmla	v0.2d, v1.2d, v2.2d
# CHECK-NEXT: [1,2]     D======================eeER   .    ..   fadd	v0.2d, v0.2d, v0.2d
# CHECK-NEXT: [1,3]     D========================eeeeER    ..   fmla	v0.2d, v1.2d, v2.2d
# CHECK-NEXT: [1,4]     D==========================eeeeER  ..   fmla	v0.2d, v1.2d, v2.2d
# CHECK-NEXT: [1,5]     D==============================eeeeER   fmla	v0.2d, v0.2d, v1.2d

# CHECK:      Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage

# CHECK:            [0]    [1]    [2]    [3]
# CHECK-NEXT: 0.     2     9.5    0.5    0.0       fmul	v0.2d, v0.2d, v0.2d
# CHECK-NEXT: 1.     2     10.5   0.0    0.0       fmla	v0.2d, v1.2d, v2.2d
# CHECK-NEXT: 2.     2     14.5   0.0    0.0       fadd	v0.2d, v0.2d, v0.2d
# CHECK-NEXT: 3.     2     16.5   0.0    0.0       fmla	v0.2d, v1.2d, v2.2d
# CHECK-NEXT: 4.     2     18.5   0.0    0.0       fmla	v0.2d, v1.2d, v2.2d
# CHECK-NEXT: 5.     2     22.5   0.0    0.0       fmla	v0.2d, v0.2d, v1.2d
# CHECK-NEXT:        2     15.3   0.1    0.0       <total>

# CHECK:      [12] Code Region - fmlal

# CHECK:      Iterations:        100
# CHECK-NEXT: Instructions:      600
# CHECK-NEXT: Total Cycles:      2203
# CHECK-NEXT: Total uOps:        600

# CHECK:      Dispatch Width:    15
# CHECK-NEXT: uOps Per Cycle:    0.27
# CHECK-NEXT: IPC:               0.27
# CHECK-NEXT: Block RThroughput: 1.5

# CHECK:      Timeline view:
# CHECK-NEXT:                     0123456789          0123456789
# CHECK-NEXT: Index     0123456789          0123456789          0123456

# CHECK:      [0,0]     DeeeER    .    .    .    .    .    .    .    ..   fmul	v0.2d, v0.2d, v0.2d
# CHECK-NEXT: [0,1]     D===eeeeeER    .    .    .    .    .    .    ..   fmlal	v0.4s, v1.4h, v2.4h
# CHECK-NEXT: [0,2]     D========eeER  .    .    .    .    .    .    ..   fadd	v0.2d, v0.2d, v0.2d
# CHECK-NEXT: [0,3]     D==========eeeeeER  .    .    .    .    .    ..   fmlal	v0.4s, v1.4h, v2.4h
# CHECK-NEXT: [0,4]     D============eeeeeER.    .    .    .    .    ..   fmlal	v0.4s, v1.4h, v2.4h
# CHECK-NEXT: [0,5]     D=================eeeeeER.    .    .    .    ..   fmlal	v0.4s, v0.4h, v1.4h
# CHECK-NEXT: [1,0]     D======================eeeER  .    .    .    ..   fmul	v0.2d, v0.2d, v0.2d
# CHECK-NEXT: [1,1]     D=========================eeeeeER  .    .    ..   fmlal	v0.4s, v1.4h, v2.4h
# CHECK-NEXT: [1,2]     D==============================eeER.    .    ..   fadd	v0.2d, v0.2d, v0.2d
# CHECK-NEXT: [1,3]     D================================eeeeeER.    ..   fmlal	v0.4s, v1.4h, v2.4h
# CHECK-NEXT: [1,4]     D==================================eeeeeER   ..   fmlal	v0.4s, v1.4h, v2.4h
# CHECK-NEXT: [1,5]     D=======================================eeeeeER   fmlal	v0.4s, v0.4h, v1.4h

# CHECK:      Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage

# CHECK:            [0]    [1]    [2]    [3]
# CHECK-NEXT: 0.     2     12.0   0.5    0.0       fmul	v0.2d, v0.2d, v0.2d
# CHECK-NEXT: 1.     2     15.0   0.0    0.0       fmlal	v0.4s, v1.4h, v2.4h
# CHECK-NEXT: 2.     2     20.0   0.0    0.0       fadd	v0.2d, v0.2d, v0.2d
# CHECK-NEXT: 3.     2     22.0   0.0    0.0       fmlal	v0.4s, v1.4h, v2.4h
# CHECK-NEXT: 4.     2     24.0   0.0    0.0       fmlal	v0.4s, v1.4h, v2.4h
# CHECK-NEXT: 5.     2     29.0   0.0    0.0       fmlal	v0.4s, v0.4h, v1.4h
# CHECK-NEXT:        2     20.3   0.1    0.0       <total>

# CHECK:      [13] Code Region - bfdot

# CHECK:      Iterations:        100
# CHECK-NEXT: Instructions:      400
# CHECK-NEXT: Total Cycles:      1303
# CHECK-NEXT: Total uOps:        400

# CHECK:      Dispatch Width:    15
# CHECK-NEXT: uOps Per Cycle:    0.31
# CHECK-NEXT: IPC:               0.31
# CHECK-NEXT: Block RThroughput: 1.0

# CHECK:      Timeline view:
# CHECK-NEXT:                     0123456789
# CHECK-NEXT: Index     0123456789          012345678

# CHECK:      [0,0]     DeeeER    .    .    .    .  .   fmul	v0.2d, v0.2d, v0.2d
# CHECK-NEXT: [0,1]     D===eeeeER.    .    .    .  .   bfdot	v0.4s, v1.8h, v2.8h
# CHECK-NEXT: [0,2]     D=====eeeeER   .    .    .  .   bfdot	v0.4s, v1.8h, v2.8h
# CHECK-NEXT: [0,3]     D=========eeeeER    .    .  .   bfdot	v0.4s, v0.8h, v1.8h
# CHECK-NEXT: [1,0]     D=============eeeER .    .  .   fmul	v0.2d, v0.2d, v0.2d
# CHECK-NEXT: [1,1]     D================eeeeER  .  .   bfdot	v0.4s, v1.8h, v2.8h
# CHECK-NEXT: [1,2]     D==================eeeeER.  .   bfdot	v0.4s, v1.8h, v2.8h
# CHECK-NEXT: [1,3]     D======================eeeeER   bfdot	v0.4s, v0.8h, v1.8h

# CHECK:      Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage

# CHECK:            [0]    [1]    [2]    [3]
# CHECK-NEXT: 0.     2     7.5    0.5    0.0       fmul	v0.2d, v0.2d, v0.2d
# CHECK-NEXT: 1.     2     10.5   0.0    0.0       bfdot	v0.4s, v1.8h, v2.8h
# CHECK-NEXT: 2.     2     12.5   0.0    0.0       bfdot	v0.4s, v1.8h, v2.8h
# CHECK-NEXT: 3.     2     16.5   0.0    0.0       bfdot	v0.4s, v0.8h, v1.8h
# CHECK-NEXT:        2     11.8   0.1    0.0       <total>

# CHECK:      [14] Code Region - bfmmla

# CHECK:      Iterations:        100
# CHECK-NEXT: Instructions:      400
# CHECK-NEXT: Total Cycles:      1603
# CHECK-NEXT: Total uOps:        400

# CHECK:      Dispatch Width:    15
# CHECK-NEXT: uOps Per Cycle:    0.25
# CHECK-NEXT: IPC:               0.25
# CHECK-NEXT: Block RThroughput: 1.0

# CHECK:      Timeline view:
# CHECK-NEXT:                     0123456789          01234
# CHECK-NEXT: Index     0123456789          0123456789

# CHECK:      [0,0]     DeeeER    .    .    .    .    .   .   fmul	v0.2d, v0.2d, v0.2d
# CHECK-NEXT: [0,1]     D===eeeeeER    .    .    .    .   .   bfmmla	v0.4s, v1.8h, v2.8h
# CHECK-NEXT: [0,2]     D======eeeeeER .    .    .    .   .   bfmmla	v0.4s, v1.8h, v2.8h
# CHECK-NEXT: [0,3]     D===========eeeeeER .    .    .   .   bfmmla	v0.4s, v0.8h, v1.8h
# CHECK-NEXT: [1,0]     D================eeeER   .    .   .   fmul	v0.2d, v0.2d, v0.2d
# CHECK-NEXT: [1,1]     D===================eeeeeER   .   .   bfmmla	v0.4s, v1.8h, v2.8h
# CHECK-NEXT: [1,2]     D======================eeeeeER.   .   bfmmla	v0.4s, v1.8h, v2.8h
# CHECK-NEXT: [1,3]     D===========================eeeeeER   bfmmla	v0.4s, v0.8h, v1.8h

# CHECK:      Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage

# CHECK:            [0]    [1]    [2]    [3]
# CHECK-NEXT: 0.     2     9.0    0.5    0.0       fmul	v0.2d, v0.2d, v0.2d
# CHECK-NEXT: 1.     2     12.0   0.0    0.0       bfmmla	v0.4s, v1.8h, v2.8h
# CHECK-NEXT: 2.     2     15.0   0.0    0.0       bfmmla	v0.4s, v1.8h, v2.8h
# CHECK-NEXT: 3.     2     20.0   0.0    0.0       bfmmla	v0.4s, v0.8h, v1.8h
# CHECK-NEXT:        2     14.0   0.1    0.0       <total>

# CHECK:      [15] Code Region - bfmlalb

# CHECK:      Iterations:        100
# CHECK-NEXT: Instructions:      400
# CHECK-NEXT: Total Cycles:      1303
# CHECK-NEXT: Total uOps:        400

# CHECK:      Dispatch Width:    15
# CHECK-NEXT: uOps Per Cycle:    0.31
# CHECK-NEXT: IPC:               0.31
# CHECK-NEXT: Block RThroughput: 1.0

# CHECK:      Timeline view:
# CHECK-NEXT:                     0123456789
# CHECK-NEXT: Index     0123456789          012345678

# CHECK:      [0,0]     DeeeER    .    .    .    .  .   fmul	v0.2d, v0.2d, v0.2d
# CHECK-NEXT: [0,1]     D===eeeeER.    .    .    .  .   bfmlalb	v0.4s, v1.8h, v2.8h
# CHECK-NEXT: [0,2]     D=====eeeeER   .    .    .  .   bfmlalb	v0.4s, v1.8h, v2.8h
# CHECK-NEXT: [0,3]     D=========eeeeER    .    .  .   bfmlalb	v0.4s, v0.8h, v1.8h
# CHECK-NEXT: [1,0]     D=============eeeER .    .  .   fmul	v0.2d, v0.2d, v0.2d
# CHECK-NEXT: [1,1]     D================eeeeER  .  .   bfmlalb	v0.4s, v1.8h, v2.8h
# CHECK-NEXT: [1,2]     D==================eeeeER.  .   bfmlalb	v0.4s, v1.8h, v2.8h
# CHECK-NEXT: [1,3]     D======================eeeeER   bfmlalb	v0.4s, v0.8h, v1.8h

# CHECK:      Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage

# CHECK:            [0]    [1]    [2]    [3]
# CHECK-NEXT: 0.     2     7.5    0.5    0.0       fmul	v0.2d, v0.2d, v0.2d
# CHECK-NEXT: 1.     2     10.5   0.0    0.0       bfmlalb	v0.4s, v1.8h, v2.8h
# CHECK-NEXT: 2.     2     12.5   0.0    0.0       bfmlalb	v0.4s, v1.8h, v2.8h
# CHECK-NEXT: 3.     2     16.5   0.0    0.0       bfmlalb	v0.4s, v0.8h, v1.8h
# CHECK-NEXT:        2     11.8   0.1    0.0       <total>

# CHECK:      [16] Code Region - crc32cb

# CHECK:      Iterations:        100
# CHECK-NEXT: Instructions:      400
# CHECK-NEXT: Total Cycles:      703
# CHECK-NEXT: Total uOps:        400

# CHECK:      Dispatch Width:    15
# CHECK-NEXT: uOps Per Cycle:    0.57
# CHECK-NEXT: IPC:               0.57
# CHECK-NEXT: Block RThroughput: 3.0

# CHECK:      Timeline view:
# CHECK-NEXT:                     0123456
# CHECK-NEXT: Index     0123456789

# CHECK:      [0,0]     DeeER.    .    ..   mul	w0, w0, w0
# CHECK-NEXT: [0,1]     D==eeER   .    ..   crc32cb	w0, w0, w1
# CHECK-NEXT: [0,2]     D===eeER  .    ..   crc32cb	w0, w0, w1
# CHECK-NEXT: [0,3]     D=====eeER.    ..   crc32cb	w0, w0, w0
# CHECK-NEXT: [1,0]     D=======eeER   ..   mul	w0, w0, w0
# CHECK-NEXT: [1,1]     D=========eeER ..   crc32cb	w0, w0, w1
# CHECK-NEXT: [1,2]     D==========eeER..   crc32cb	w0, w0, w1
# CHECK-NEXT: [1,3]     D============eeER   crc32cb	w0, w0, w0

# CHECK:      Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage

# CHECK:            [0]    [1]    [2]    [3]
# CHECK-NEXT: 0.     2     4.5    0.5    0.0       mul	w0, w0, w0
# CHECK-NEXT: 1.     2     6.5    0.0    0.0       crc32cb	w0, w0, w1
# CHECK-NEXT: 2.     2     7.5    0.0    0.0       crc32cb	w0, w0, w1
# CHECK-NEXT: 3.     2     9.5    0.0    0.0       crc32cb	w0, w0, w0
# CHECK-NEXT:        2     7.0    0.1    0.0       <total>

# CHECK:      [17] Code Region - Z sdot.s

# CHECK:      Iterations:        100
# CHECK-NEXT: Instructions:      400
# CHECK-NEXT: Total Cycles:      1203
# CHECK-NEXT: Total uOps:        500

# CHECK:      Dispatch Width:    15
# CHECK-NEXT: uOps Per Cycle:    0.42
# CHECK-NEXT: IPC:               0.33
# CHECK-NEXT: Block RThroughput: 2.0

# CHECK:      Timeline view:
# CHECK-NEXT:                     0123456789
# CHECK-NEXT: Index     0123456789          0123456

# CHECK:      [0,0]     DeeeeeER  .    .    .    ..   mul	z0.d, p0/m, z0.d, z0.d
# CHECK-NEXT: [0,1]     D=====eeeER    .    .    ..   sdot	z0.s, z1.b, z2.b
# CHECK-NEXT: [0,2]     D======eeeER   .    .    ..   sdot	z0.s, z1.b, z2.b
# CHECK-NEXT: [0,3]     D=========eeeER.    .    ..   sdot	z0.s, z0.b, z1.b
# CHECK-NEXT: [1,0]     D============eeeeeER.    ..   mul	z0.d, p0/m, z0.d, z0.d
# CHECK-NEXT: [1,1]     D=================eeeER  ..   sdot	z0.s, z1.b, z2.b
# CHECK-NEXT: [1,2]     D==================eeeER ..   sdot	z0.s, z1.b, z2.b
# CHECK-NEXT: [1,3]     D=====================eeeER   sdot	z0.s, z0.b, z1.b

# CHECK:      Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage

# CHECK:            [0]    [1]    [2]    [3]
# CHECK-NEXT: 0.     2     7.0    0.5    0.0       mul	z0.d, p0/m, z0.d, z0.d
# CHECK-NEXT: 1.     2     12.0   0.0    0.0       sdot	z0.s, z1.b, z2.b
# CHECK-NEXT: 2.     2     13.0   0.0    0.0       sdot	z0.s, z1.b, z2.b
# CHECK-NEXT: 3.     2     16.0   0.0    0.0       sdot	z0.s, z0.b, z1.b
# CHECK-NEXT:        2     12.0   0.1    0.0       <total>

# CHECK:      [18] Code Region - Z sudot

# CHECK:      Iterations:        100
# CHECK-NEXT: Instructions:      400
# CHECK-NEXT: Total Cycles:      1203
# CHECK-NEXT: Total uOps:        500

# CHECK:      Dispatch Width:    15
# CHECK-NEXT: uOps Per Cycle:    0.42
# CHECK-NEXT: IPC:               0.33
# CHECK-NEXT: Block RThroughput: 2.0

# CHECK:      Timeline view:
# CHECK-NEXT:                     0123456789
# CHECK-NEXT: Index     0123456789          0123456

# CHECK:      [0,0]     DeeeeeER  .    .    .    ..   mul	z0.d, p0/m, z0.d, z0.d
# CHECK-NEXT: [0,1]     D=====eeeER    .    .    ..   sdot	z0.s, z1.b, z2.b[1]
# CHECK-NEXT: [0,2]     D======eeeER   .    .    ..   sdot	z0.s, z1.b, z2.b[1]
# CHECK-NEXT: [0,3]     D=========eeeER.    .    ..   sdot	z0.s, z0.b, z1.b[1]
# CHECK-NEXT: [1,0]     D============eeeeeER.    ..   mul	z0.d, p0/m, z0.d, z0.d
# CHECK-NEXT: [1,1]     D=================eeeER  ..   sdot	z0.s, z1.b, z2.b[1]
# CHECK-NEXT: [1,2]     D==================eeeER ..   sdot	z0.s, z1.b, z2.b[1]
# CHECK-NEXT: [1,3]     D=====================eeeER   sdot	z0.s, z0.b, z1.b[1]

# CHECK:      Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage

# CHECK:            [0]    [1]    [2]    [3]
# CHECK-NEXT: 0.     2     7.0    0.5    0.0       mul	z0.d, p0/m, z0.d, z0.d
# CHECK-NEXT: 1.     2     12.0   0.0    0.0       sdot	z0.s, z1.b, z2.b[1]
# CHECK-NEXT: 2.     2     13.0   0.0    0.0       sdot	z0.s, z1.b, z2.b[1]
# CHECK-NEXT: 3.     2     16.0   0.0    0.0       sdot	z0.s, z0.b, z1.b[1]
# CHECK-NEXT:        2     12.0   0.1    0.0       <total>

# CHECK:      [19] Code Region - Z sdot.d

# CHECK:      Iterations:        100
# CHECK-NEXT: Instructions:      400
# CHECK-NEXT: Total Cycles:      1403
# CHECK-NEXT: Total uOps:        500

# CHECK:      Dispatch Width:    15
# CHECK-NEXT: uOps Per Cycle:    0.36
# CHECK-NEXT: IPC:               0.29
# CHECK-NEXT: Block RThroughput: 5.0

# CHECK:      Timeline view:
# CHECK-NEXT:                     0123456789          0
# CHECK-NEXT: Index     0123456789          0123456789

# CHECK:      [0,0]     DeeeeeER  .    .    .    .    .   mul	z0.d, p0/m, z0.d, z0.d
# CHECK-NEXT: [0,1]     D=====eeeeER   .    .    .    .   sdot	z0.d, z1.h, z2.h
# CHECK-NEXT: [0,2]     D======eeeeER  .    .    .    .   sdot	z0.d, z1.h, z2.h
# CHECK-NEXT: [0,3]     D==========eeeeER   .    .    .   sdot	z0.d, z0.h, z1.h
# CHECK-NEXT: [1,0]     D==============eeeeeER   .    .   mul	z0.d, p0/m, z0.d, z0.d
# CHECK-NEXT: [1,1]     D===================eeeeER    .   sdot	z0.d, z1.h, z2.h
# CHECK-NEXT: [1,2]     D====================eeeeER   .   sdot	z0.d, z1.h, z2.h
# CHECK-NEXT: [1,3]     D========================eeeeER   sdot	z0.d, z0.h, z1.h

# CHECK:      Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage

# CHECK:            [0]    [1]    [2]    [3]
# CHECK-NEXT: 0.     2     8.0    0.5    0.0       mul	z0.d, p0/m, z0.d, z0.d
# CHECK-NEXT: 1.     2     13.0   0.0    0.0       sdot	z0.d, z1.h, z2.h
# CHECK-NEXT: 2.     2     14.0   0.0    0.0       sdot	z0.d, z1.h, z2.h
# CHECK-NEXT: 3.     2     18.0   0.0    0.0       sdot	z0.d, z0.h, z1.h
# CHECK-NEXT:        2     13.3   0.1    0.0       <total>

# CHECK:      [20] Code Region - Z smmla

# CHECK:      Iterations:        100
# CHECK-NEXT: Instructions:      400
# CHECK-NEXT: Total Cycles:      1203
# CHECK-NEXT: Total uOps:        500

# CHECK:      Dispatch Width:    15
# CHECK-NEXT: uOps Per Cycle:    0.42
# CHECK-NEXT: IPC:               0.33
# CHECK-NEXT: Block RThroughput: 2.0

# CHECK:      Timeline view:
# CHECK-NEXT:                     0123456789
# CHECK-NEXT: Index     0123456789          0123456

# CHECK:      [0,0]     DeeeeeER  .    .    .    ..   mul	z0.d, p0/m, z0.d, z0.d
# CHECK-NEXT: [0,1]     D=====eeeER    .    .    ..   smmla	z0.s, z1.b, z2.b
# CHECK-NEXT: [0,2]     D======eeeER   .    .    ..   smmla	z0.s, z1.b, z2.b
# CHECK-NEXT: [0,3]     D=========eeeER.    .    ..   smmla	z0.s, z0.b, z1.b
# CHECK-NEXT: [1,0]     D============eeeeeER.    ..   mul	z0.d, p0/m, z0.d, z0.d
# CHECK-NEXT: [1,1]     D=================eeeER  ..   smmla	z0.s, z1.b, z2.b
# CHECK-NEXT: [1,2]     D==================eeeER ..   smmla	z0.s, z1.b, z2.b
# CHECK-NEXT: [1,3]     D=====================eeeER   smmla	z0.s, z0.b, z1.b

# CHECK:      Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage

# CHECK:            [0]    [1]    [2]    [3]
# CHECK-NEXT: 0.     2     7.0    0.5    0.0       mul	z0.d, p0/m, z0.d, z0.d
# CHECK-NEXT: 1.     2     12.0   0.0    0.0       smmla	z0.s, z1.b, z2.b
# CHECK-NEXT: 2.     2     13.0   0.0    0.0       smmla	z0.s, z1.b, z2.b
# CHECK-NEXT: 3.     2     16.0   0.0    0.0       smmla	z0.s, z0.b, z1.b
# CHECK-NEXT:        2     12.0   0.1    0.0       <total>

# CHECK:      [21] Code Region - Z mla.d

# CHECK:      Iterations:        100
# CHECK-NEXT: Instructions:      400
# CHECK-NEXT: Total Cycles:      1703
# CHECK-NEXT: Total uOps:        800

# CHECK:      Dispatch Width:    15
# CHECK-NEXT: uOps Per Cycle:    0.47
# CHECK-NEXT: IPC:               0.23
# CHECK-NEXT: Block RThroughput: 8.0

# CHECK:      Timeline view:
# CHECK-NEXT:                     0123456789          0123456
# CHECK-NEXT: Index     0123456789          0123456789

# CHECK:      [0,0]     DeeeeeER  .    .    .    .    .    ..   mul	z0.d, p0/m, z0.d, z0.d
# CHECK-NEXT: [0,1]     D=====eeeeeER  .    .    .    .    ..   mla	z0.d, p0/m, z1.d, z2.d
# CHECK-NEXT: [0,2]     D=======eeeeeER.    .    .    .    ..   mla	z0.d, p0/m, z1.d, z2.d
# CHECK-NEXT: [0,3]     D============eeeeeER.    .    .    ..   mla	z0.d, p0/m, z0.d, z1.d
# CHECK-NEXT: [1,0]     D=================eeeeeER.    .    ..   mul	z0.d, p0/m, z0.d, z0.d
# CHECK-NEXT: [1,1]     D======================eeeeeER.    ..   mla	z0.d, p0/m, z1.d, z2.d
# CHECK-NEXT: [1,2]     D========================eeeeeER   ..   mla	z0.d, p0/m, z1.d, z2.d
# CHECK-NEXT: [1,3]     .D============================eeeeeER   mla	z0.d, p0/m, z0.d, z1.d

# CHECK:      Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage

# CHECK:            [0]    [1]    [2]    [3]
# CHECK-NEXT: 0.     2     9.5    0.5    0.0       mul	z0.d, p0/m, z0.d, z0.d
# CHECK-NEXT: 1.     2     14.5   0.0    0.0       mla	z0.d, p0/m, z1.d, z2.d
# CHECK-NEXT: 2.     2     16.5   0.0    0.0       mla	z0.d, p0/m, z1.d, z2.d
# CHECK-NEXT: 3.     2     21.0   0.0    0.0       mla	z0.d, p0/m, z0.d, z1.d
# CHECK-NEXT:        2     15.4   0.1    0.0       <total>

# CHECK:      [22] Code Region - Z mad.d

# CHECK:      Iterations:        100
# CHECK-NEXT: Instructions:      400
# CHECK-NEXT: Total Cycles:      1703
# CHECK-NEXT: Total uOps:        800

# CHECK:      Dispatch Width:    15
# CHECK-NEXT: uOps Per Cycle:    0.47
# CHECK-NEXT: IPC:               0.23
# CHECK-NEXT: Block RThroughput: 8.0

# CHECK:      Timeline view:
# CHECK-NEXT:                     0123456789          0123456
# CHECK-NEXT: Index     0123456789          0123456789

# CHECK:      [0,0]     DeeeeeER  .    .    .    .    .    ..   mul	z0.d, p0/m, z0.d, z0.d
# CHECK-NEXT: [0,1]     D=====eeeeeER  .    .    .    .    ..   mad	z0.d, p0/m, z1.d, z2.d
# CHECK-NEXT: [0,2]     D=======eeeeeER.    .    .    .    ..   mad	z0.d, p0/m, z1.d, z2.d
# CHECK-NEXT: [0,3]     D============eeeeeER.    .    .    ..   mad	z0.d, p0/m, z0.d, z1.d
# CHECK-NEXT: [1,0]     D=================eeeeeER.    .    ..   mul	z0.d, p0/m, z0.d, z0.d
# CHECK-NEXT: [1,1]     D======================eeeeeER.    ..   mad	z0.d, p0/m, z1.d, z2.d
# CHECK-NEXT: [1,2]     D========================eeeeeER   ..   mad	z0.d, p0/m, z1.d, z2.d
# CHECK-NEXT: [1,3]     .D============================eeeeeER   mad	z0.d, p0/m, z0.d, z1.d

# CHECK:      Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage

# CHECK:            [0]    [1]    [2]    [3]
# CHECK-NEXT: 0.     2     9.5    0.5    0.0       mul	z0.d, p0/m, z0.d, z0.d
# CHECK-NEXT: 1.     2     14.5   0.0    0.0       mad	z0.d, p0/m, z1.d, z2.d
# CHECK-NEXT: 2.     2     16.5   0.0    0.0       mad	z0.d, p0/m, z1.d, z2.d
# CHECK-NEXT: 3.     2     21.0   0.0    0.0       mad	z0.d, p0/m, z0.d, z1.d
# CHECK-NEXT:        2     15.4   0.1    0.0       <total>

# CHECK:      [23] Code Region - Z msb.d

# CHECK:      Iterations:        100
# CHECK-NEXT: Instructions:      400
# CHECK-NEXT: Total Cycles:      1703
# CHECK-NEXT: Total uOps:        800

# CHECK:      Dispatch Width:    15
# CHECK-NEXT: uOps Per Cycle:    0.47
# CHECK-NEXT: IPC:               0.23
# CHECK-NEXT: Block RThroughput: 8.0

# CHECK:      Timeline view:
# CHECK-NEXT:                     0123456789          0123456
# CHECK-NEXT: Index     0123456789          0123456789

# CHECK:      [0,0]     DeeeeeER  .    .    .    .    .    ..   mul	z0.d, p0/m, z0.d, z0.d
# CHECK-NEXT: [0,1]     D=====eeeeeER  .    .    .    .    ..   msb	z0.d, p0/m, z1.d, z2.d
# CHECK-NEXT: [0,2]     D=======eeeeeER.    .    .    .    ..   msb	z0.d, p0/m, z1.d, z2.d
# CHECK-NEXT: [0,3]     D============eeeeeER.    .    .    ..   msb	z0.d, p0/m, z0.d, z1.d
# CHECK-NEXT: [1,0]     D=================eeeeeER.    .    ..   mul	z0.d, p0/m, z0.d, z0.d
# CHECK-NEXT: [1,1]     D======================eeeeeER.    ..   msb	z0.d, p0/m, z1.d, z2.d
# CHECK-NEXT: [1,2]     D========================eeeeeER   ..   msb	z0.d, p0/m, z1.d, z2.d
# CHECK-NEXT: [1,3]     .D============================eeeeeER   msb	z0.d, p0/m, z0.d, z1.d

# CHECK:      Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage

# CHECK:            [0]    [1]    [2]    [3]
# CHECK-NEXT: 0.     2     9.5    0.5    0.0       mul	z0.d, p0/m, z0.d, z0.d
# CHECK-NEXT: 1.     2     14.5   0.0    0.0       msb	z0.d, p0/m, z1.d, z2.d
# CHECK-NEXT: 2.     2     16.5   0.0    0.0       msb	z0.d, p0/m, z1.d, z2.d
# CHECK-NEXT: 3.     2     21.0   0.0    0.0       msb	z0.d, p0/m, z0.d, z1.d
# CHECK-NEXT:        2     15.4   0.1    0.0       <total>

# CHECK:      [24] Code Region - Z fcmla ZPmZZ

# CHECK:      Iterations:        100
# CHECK-NEXT: Instructions:      400
# CHECK-NEXT: Total Cycles:      1503
# CHECK-NEXT: Total uOps:        400

# CHECK:      Dispatch Width:    15
# CHECK-NEXT: uOps Per Cycle:    0.27
# CHECK-NEXT: IPC:               0.27
# CHECK-NEXT: Block RThroughput: 2.0

# CHECK:      Timeline view:
# CHECK-NEXT:                     0123456789          012
# CHECK-NEXT: Index     0123456789          0123456789

# CHECK:      [0,0]     DeeeER    .    .    .    .    . .   fmul	z0.d, z0.d, z0.d
# CHECK-NEXT: [0,1]     D===eeeeeER    .    .    .    . .   fcmla	z0.d, p0/m, z1.d, z2.d, #90
# CHECK-NEXT: [0,2]     D=====eeeeeER  .    .    .    . .   fcmla	z0.d, p0/m, z1.d, z2.d, #90
# CHECK-NEXT: [0,3]     D==========eeeeeER  .    .    . .   fcmla	z0.d, p0/m, z0.d, z1.d, #90
# CHECK-NEXT: [1,0]     D===============eeeER    .    . .   fmul	z0.d, z0.d, z0.d
# CHECK-NEXT: [1,1]     D==================eeeeeER    . .   fcmla	z0.d, p0/m, z1.d, z2.d, #90
# CHECK-NEXT: [1,2]     D====================eeeeeER  . .   fcmla	z0.d, p0/m, z1.d, z2.d, #90
# CHECK-NEXT: [1,3]     D=========================eeeeeER   fcmla	z0.d, p0/m, z0.d, z1.d, #90

# CHECK:      Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage

# CHECK:            [0]    [1]    [2]    [3]
# CHECK-NEXT: 0.     2     8.5    0.5    0.0       fmul	z0.d, z0.d, z0.d
# CHECK-NEXT: 1.     2     11.5   0.0    0.0       fcmla	z0.d, p0/m, z1.d, z2.d, #90
# CHECK-NEXT: 2.     2     13.5   0.0    0.0       fcmla	z0.d, p0/m, z1.d, z2.d, #90
# CHECK-NEXT: 3.     2     18.5   0.0    0.0       fcmla	z0.d, p0/m, z0.d, z1.d, #90
# CHECK-NEXT:        2     13.0   0.1    0.0       <total>

# CHECK:      [25] Code Region - Z fcmla ZZZI

# CHECK:      Iterations:        100
# CHECK-NEXT: Instructions:      400
# CHECK-NEXT: Total Cycles:      1503
# CHECK-NEXT: Total uOps:        400

# CHECK:      Dispatch Width:    15
# CHECK-NEXT: uOps Per Cycle:    0.27
# CHECK-NEXT: IPC:               0.27
# CHECK-NEXT: Block RThroughput: 2.0

# CHECK:      Timeline view:
# CHECK-NEXT:                     0123456789          012
# CHECK-NEXT: Index     0123456789          0123456789

# CHECK:      [0,0]     DeeeER    .    .    .    .    . .   fmul	z0.d, z0.d, z0.d
# CHECK-NEXT: [0,1]     D===eeeeeER    .    .    .    . .   fcmla	z0.s, z1.s, z2.s[1], #90
# CHECK-NEXT: [0,2]     D=====eeeeeER  .    .    .    . .   fcmla	z0.s, z1.s, z2.s[1], #90
# CHECK-NEXT: [0,3]     D==========eeeeeER  .    .    . .   fcmla	z0.s, z0.s, z1.s[1], #90
# CHECK-NEXT: [1,0]     D===============eeeER    .    . .   fmul	z0.d, z0.d, z0.d
# CHECK-NEXT: [1,1]     D==================eeeeeER    . .   fcmla	z0.s, z1.s, z2.s[1], #90
# CHECK-NEXT: [1,2]     D====================eeeeeER  . .   fcmla	z0.s, z1.s, z2.s[1], #90
# CHECK-NEXT: [1,3]     D=========================eeeeeER   fcmla	z0.s, z0.s, z1.s[1], #90

# CHECK:      Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage

# CHECK:            [0]    [1]    [2]    [3]
# CHECK-NEXT: 0.     2     8.5    0.5    0.0       fmul	z0.d, z0.d, z0.d
# CHECK-NEXT: 1.     2     11.5   0.0    0.0       fcmla	z0.s, z1.s, z2.s[1], #90
# CHECK-NEXT: 2.     2     13.5   0.0    0.0       fcmla	z0.s, z1.s, z2.s[1], #90
# CHECK-NEXT: 3.     2     18.5   0.0    0.0       fcmla	z0.s, z0.s, z1.s[1], #90
# CHECK-NEXT:        2     13.0   0.1    0.0       <total>

# CHECK:      [26] Code Region - Z fmla ZPmZZ

# CHECK:      Iterations:        100
# CHECK-NEXT: Instructions:      400
# CHECK-NEXT: Total Cycles:      1303
# CHECK-NEXT: Total uOps:        400

# CHECK:      Dispatch Width:    15
# CHECK-NEXT: uOps Per Cycle:    0.31
# CHECK-NEXT: IPC:               0.31
# CHECK-NEXT: Block RThroughput: 2.0

# CHECK:      Timeline view:
# CHECK-NEXT:                     0123456789
# CHECK-NEXT: Index     0123456789          012345678

# CHECK:      [0,0]     DeeeER    .    .    .    .  .   fmul	z0.d, z0.d, z0.d
# CHECK-NEXT: [0,1]     D===eeeeER.    .    .    .  .   fmla	z0.d, p0/m, z1.d, z2.d
# CHECK-NEXT: [0,2]     D=====eeeeER   .    .    .  .   fmla	z0.d, p0/m, z1.d, z2.d
# CHECK-NEXT: [0,3]     D=========eeeeER    .    .  .   fmla	z0.d, p0/m, z0.d, z1.d
# CHECK-NEXT: [1,0]     D=============eeeER .    .  .   fmul	z0.d, z0.d, z0.d
# CHECK-NEXT: [1,1]     D================eeeeER  .  .   fmla	z0.d, p0/m, z1.d, z2.d
# CHECK-NEXT: [1,2]     D==================eeeeER.  .   fmla	z0.d, p0/m, z1.d, z2.d
# CHECK-NEXT: [1,3]     D======================eeeeER   fmla	z0.d, p0/m, z0.d, z1.d

# CHECK:      Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage

# CHECK:            [0]    [1]    [2]    [3]
# CHECK-NEXT: 0.     2     7.5    0.5    0.0       fmul	z0.d, z0.d, z0.d
# CHECK-NEXT: 1.     2     10.5   0.0    0.0       fmla	z0.d, p0/m, z1.d, z2.d
# CHECK-NEXT: 2.     2     12.5   0.0    0.0       fmla	z0.d, p0/m, z1.d, z2.d
# CHECK-NEXT: 3.     2     16.5   0.0    0.0       fmla	z0.d, p0/m, z0.d, z1.d
# CHECK-NEXT:        2     11.8   0.1    0.0       <total>

# CHECK:      [27] Code Region - Z fmla ZZZI

# CHECK:      Iterations:        100
# CHECK-NEXT: Instructions:      400
# CHECK-NEXT: Total Cycles:      1303
# CHECK-NEXT: Total uOps:        400

# CHECK:      Dispatch Width:    15
# CHECK-NEXT: uOps Per Cycle:    0.31
# CHECK-NEXT: IPC:               0.31
# CHECK-NEXT: Block RThroughput: 2.0

# CHECK:      Timeline view:
# CHECK-NEXT:                     0123456789
# CHECK-NEXT: Index     0123456789          012345678

# CHECK:      [0,0]     DeeeER    .    .    .    .  .   fmul	z0.d, z0.d, z0.d
# CHECK-NEXT: [0,1]     D===eeeeER.    .    .    .  .   fmla	z0.d, z1.d, z2.d[1]
# CHECK-NEXT: [0,2]     D=====eeeeER   .    .    .  .   fmla	z0.d, z1.d, z2.d[1]
# CHECK-NEXT: [0,3]     D=========eeeeER    .    .  .   fmla	z0.d, z0.d, z1.d[1]
# CHECK-NEXT: [1,0]     D=============eeeER .    .  .   fmul	z0.d, z0.d, z0.d
# CHECK-NEXT: [1,1]     D================eeeeER  .  .   fmla	z0.d, z1.d, z2.d[1]
# CHECK-NEXT: [1,2]     D==================eeeeER.  .   fmla	z0.d, z1.d, z2.d[1]
# CHECK-NEXT: [1,3]     D======================eeeeER   fmla	z0.d, z0.d, z1.d[1]

# CHECK:      Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage

# CHECK:            [0]    [1]    [2]    [3]
# CHECK-NEXT: 0.     2     7.5    0.5    0.0       fmul	z0.d, z0.d, z0.d
# CHECK-NEXT: 1.     2     10.5   0.0    0.0       fmla	z0.d, z1.d, z2.d[1]
# CHECK-NEXT: 2.     2     12.5   0.0    0.0       fmla	z0.d, z1.d, z2.d[1]
# CHECK-NEXT: 3.     2     16.5   0.0    0.0       fmla	z0.d, z0.d, z1.d[1]
# CHECK-NEXT:        2     11.8   0.1    0.0       <total>

# CHECK:      [28] Code Region - Z bfdot

# CHECK:      Iterations:        100
# CHECK-NEXT: Instructions:      400
# CHECK-NEXT: Total Cycles:      1303
# CHECK-NEXT: Total uOps:        400

# CHECK:      Dispatch Width:    15
# CHECK-NEXT: uOps Per Cycle:    0.31
# CHECK-NEXT: IPC:               0.31
# CHECK-NEXT: Block RThroughput: 2.0

# CHECK:      Timeline view:
# CHECK-NEXT:                     0123456789
# CHECK-NEXT: Index     0123456789          012345678

# CHECK:      [0,0]     DeeeER    .    .    .    .  .   fmul	z0.d, z0.d, z0.d
# CHECK-NEXT: [0,1]     D===eeeeER.    .    .    .  .   bfdot	z0.s, z1.h, z2.h
# CHECK-NEXT: [0,2]     D=====eeeeER   .    .    .  .   bfdot	z0.s, z1.h, z2.h
# CHECK-NEXT: [0,3]     D=========eeeeER    .    .  .   bfdot	z0.s, z0.h, z1.h
# CHECK-NEXT: [1,0]     D=============eeeER .    .  .   fmul	z0.d, z0.d, z0.d
# CHECK-NEXT: [1,1]     D================eeeeER  .  .   bfdot	z0.s, z1.h, z2.h
# CHECK-NEXT: [1,2]     D==================eeeeER.  .   bfdot	z0.s, z1.h, z2.h
# CHECK-NEXT: [1,3]     D======================eeeeER   bfdot	z0.s, z0.h, z1.h

# CHECK:      Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage

# CHECK:            [0]    [1]    [2]    [3]
# CHECK-NEXT: 0.     2     7.5    0.5    0.0       fmul	z0.d, z0.d, z0.d
# CHECK-NEXT: 1.     2     10.5   0.0    0.0       bfdot	z0.s, z1.h, z2.h
# CHECK-NEXT: 2.     2     12.5   0.0    0.0       bfdot	z0.s, z1.h, z2.h
# CHECK-NEXT: 3.     2     16.5   0.0    0.0       bfdot	z0.s, z0.h, z1.h
# CHECK-NEXT:        2     11.8   0.1    0.0       <total>

# CHECK:      [29] Code Region - Z bfmmla

# CHECK:      Iterations:        100
# CHECK-NEXT: Instructions:      400
# CHECK-NEXT: Total Cycles:      1603
# CHECK-NEXT: Total uOps:        400

# CHECK:      Dispatch Width:    15
# CHECK-NEXT: uOps Per Cycle:    0.25
# CHECK-NEXT: IPC:               0.25
# CHECK-NEXT: Block RThroughput: 2.0

# CHECK:      Timeline view:
# CHECK-NEXT:                     0123456789          01234
# CHECK-NEXT: Index     0123456789          0123456789

# CHECK:      [0,0]     DeeeER    .    .    .    .    .   .   fmul	z0.d, z0.d, z0.d
# CHECK-NEXT: [0,1]     D===eeeeeER    .    .    .    .   .   bfmmla	z0.s, z1.h, z2.h
# CHECK-NEXT: [0,2]     D======eeeeeER .    .    .    .   .   bfmmla	z0.s, z1.h, z2.h
# CHECK-NEXT: [0,3]     D===========eeeeeER .    .    .   .   bfmmla	z0.s, z0.h, z1.h
# CHECK-NEXT: [1,0]     D================eeeER   .    .   .   fmul	z0.d, z0.d, z0.d
# CHECK-NEXT: [1,1]     D===================eeeeeER   .   .   bfmmla	z0.s, z1.h, z2.h
# CHECK-NEXT: [1,2]     D======================eeeeeER.   .   bfmmla	z0.s, z1.h, z2.h
# CHECK-NEXT: [1,3]     D===========================eeeeeER   bfmmla	z0.s, z0.h, z1.h

# CHECK:      Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage

# CHECK:            [0]    [1]    [2]    [3]
# CHECK-NEXT: 0.     2     9.0    0.5    0.0       fmul	z0.d, z0.d, z0.d
# CHECK-NEXT: 1.     2     12.0   0.0    0.0       bfmmla	z0.s, z1.h, z2.h
# CHECK-NEXT: 2.     2     15.0   0.0    0.0       bfmmla	z0.s, z1.h, z2.h
# CHECK-NEXT: 3.     2     20.0   0.0    0.0       bfmmla	z0.s, z0.h, z1.h
# CHECK-NEXT:        2     14.0   0.1    0.0       <total>

# CHECK:      [30] Code Region - bfmlalb

# CHECK:      Iterations:        100
# CHECK-NEXT: Instructions:      400
# CHECK-NEXT: Total Cycles:      1503
# CHECK-NEXT: Total uOps:        400

# CHECK:      Dispatch Width:    15
# CHECK-NEXT: uOps Per Cycle:    0.27
# CHECK-NEXT: IPC:               0.27
# CHECK-NEXT: Block RThroughput: 2.0

# CHECK:      Timeline view:
# CHECK-NEXT:                     0123456789          012
# CHECK-NEXT: Index     0123456789          0123456789

# CHECK:      [0,0]     DeeeER    .    .    .    .    . .   fmul	z0.d, z0.d, z0.d
# CHECK-NEXT: [0,1]     D===eeeeeER    .    .    .    . .   bfmlalb	z0.s, z1.h, z2.h
# CHECK-NEXT: [0,2]     D=====eeeeeER  .    .    .    . .   bfmlalb	z0.s, z1.h, z2.h
# CHECK-NEXT: [0,3]     D==========eeeeeER  .    .    . .   bfmlalb	z0.s, z0.h, z1.h
# CHECK-NEXT: [1,0]     D===============eeeER    .    . .   fmul	z0.d, z0.d, z0.d
# CHECK-NEXT: [1,1]     D==================eeeeeER    . .   bfmlalb	z0.s, z1.h, z2.h
# CHECK-NEXT: [1,2]     D====================eeeeeER  . .   bfmlalb	z0.s, z1.h, z2.h
# CHECK-NEXT: [1,3]     D=========================eeeeeER   bfmlalb	z0.s, z0.h, z1.h

# CHECK:      Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage

# CHECK:            [0]    [1]    [2]    [3]
# CHECK-NEXT: 0.     2     8.5    0.5    0.0       fmul	z0.d, z0.d, z0.d
# CHECK-NEXT: 1.     2     11.5   0.0    0.0       bfmlalb	z0.s, z1.h, z2.h
# CHECK-NEXT: 2.     2     13.5   0.0    0.0       bfmlalb	z0.s, z1.h, z2.h
# CHECK-NEXT: 3.     2     18.5   0.0    0.0       bfmlalb	z0.s, z0.h, z1.h
# CHECK-NEXT:        2     13.0   0.1    0.0       <total>