# llvm/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-forwarding.s

# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
# RUN: llvm-mca -mtriple=aarch64 -mcpu=neoverse-v2 -mattr=+sve2-aes,+sve2-sha3,+sve2-sm4 --instruction-info=0 --resource-pressure=0 --timeline --timeline-max-iterations=2 < %s | FileCheck %s

# Scalar integer multiply-accumulate chain on x0: a mul producer followed by
# three madd. The first two madd read x0 only as the accumulator; the last one
# also uses x0 for both multiplicands.
# Expected timeline: the first madd waits for the mul's full 2 cycles, the
# second madd issues 1 cycle after the first, and the last madd waits the full
# 2 cycles because x0 is needed as a multiplicand.
# LLVM-MCA-BEGIN madd
mul  x0, x0, x0
madd x0, x1, x2, x0
madd x0, x1, x2, x0
madd x0, x0, x0, x0
# LLVM-MCA-END

# Widening signed multiply-accumulate chain on x0, same shape as the madd
# region: mul producer, two smaddl that read x0 only as the accumulator, and a
# final smaddl that also reads w0 for both multiplicands.
# Expected timeline: the second smaddl issues 1 cycle after the first; the
# last one waits the full 2 cycles.
# LLVM-MCA-BEGIN smaddl
mul    x0, x0, x0
smaddl x0, w1, w2, x0
smaddl x0, w1, w2, x0
smaddl x0, w0, w0, x0
# LLVM-MCA-END

# Scalar FP fused multiply-add chain on d0, fed first by an fadd and then by an
# fmul so both kinds of producer are covered. The last fmadd uses d0 as a
# multiplicand (its accumulator is d2) instead of as the accumulator.
# Expected timeline: fmadd after fadd waits the fadd's full 2 cycles; fmadd
# after fmul issues 1 cycle after the fmul; consecutive accumulating fmadd
# issue 2 cycles apart; the final fmadd waits the full 4 cycles.
# LLVM-MCA-BEGIN fmadd
fadd  d0, d0, d0
fmadd d0, d1, d2, d0
fmul  d0, d0, d0
fmadd d0, d1, d2, d0
fmadd d0, d1, d2, d0
fmadd d0, d0, d1, d2
# LLVM-MCA-END

# NEON absolute-difference-accumulate chain on v0: vector mul producer, two
# saba that only accumulate into v0, and a final saba that also reads v0 as a
# source operand.
# Expected timeline: the first saba waits the mul's full 4 cycles, the second
# saba issues 1 cycle after the first, and the last waits the full 4 cycles.
# LLVM-MCA-BEGIN saba
mul  v0.4s, v0.4s, v0.4s
saba v0.4s, v1.4s, v2.4s
saba v0.4s, v1.4s, v2.4s
saba v0.4s, v0.4s, v1.4s
# LLVM-MCA-END

# NEON signed dot-product chain on v0: vector mul producer, two sdot that only
# accumulate into v0, and a final sdot that also reads v0 as a source operand.
# Expected timeline: the first sdot waits the mul's full 4 cycles, the second
# sdot issues 1 cycle after the first, and the last waits the full 3 cycles.
# LLVM-MCA-BEGIN sdot
mul  v0.4s, v0.4s,  v0.4s
sdot v0.4s, v1.16b, v2.16b
sdot v0.4s, v1.16b, v2.16b
sdot v0.4s, v0.16b, v1.16b
# LLVM-MCA-END

# NEON signed matrix multiply-accumulate chain on v0, same shape as the sdot
# region: mul producer, two accumulate-only smmla, and a final smmla that also
# reads v0 as a source operand.
# Expected timeline: the second smmla issues 1 cycle after the first; the last
# one waits the full 3 cycles.
# LLVM-MCA-BEGIN smmla
mul   v0.4s, v0.4s,  v0.4s
smmla v0.4s, v1.16b, v2.16b
smmla v0.4s, v1.16b, v2.16b
smmla v0.4s, v0.16b, v1.16b
# LLVM-MCA-END

# NEON integer multiply-accumulate chain on v0: vector mul producer, two mla
# that only accumulate into v0, and a final mla that also reads v0 as a
# multiplicand.
# Expected timeline: the first mla waits the mul's full 4 cycles, the second
# mla issues 1 cycle after the first, and the last waits the full 4 cycles.
# LLVM-MCA-BEGIN mla
mul v0.4s, v0.4s, v0.4s
mla v0.4s, v1.4s, v2.4s
mla v0.4s, v1.4s, v2.4s
mla v0.4s, v0.4s, v1.4s
# LLVM-MCA-END

# NEON saturating rounding doubling multiply-accumulate chain on v0: vector mul
# producer, two accumulate-only sqrdmlah, and a final sqrdmlah that also reads
# v0 as a multiplicand.
# Expected timeline: the second sqrdmlah issues 2 cycles after the first (not
# 1 as in the mla region); the last one waits the full 4 cycles.
# LLVM-MCA-BEGIN sqrdmlah
mul      v0.4s, v0.4s, v0.4s
sqrdmlah v0.4s, v1.4s, v2.4s
sqrdmlah v0.4s, v1.4s, v2.4s
sqrdmlah v0.4s, v0.4s, v1.4s
# LLVM-MCA-END

# NEON widening multiply-accumulate chain on v0: vector mul producer, two
# accumulate-only smlal2, and a final smlal2 that also reads v0 as a
# multiplicand.
# Expected timeline: the second smlal2 issues 1 cycle after the first; the
# last one waits the full 4 cycles.
# LLVM-MCA-BEGIN smlal2
mul    v0.4s, v0.4s, v0.4s
smlal2 v0.4s, v1.8h, v2.8h
smlal2 v0.4s, v1.8h, v2.8h
smlal2 v0.4s, v0.8h, v1.8h
# LLVM-MCA-END

# NEON pairwise add-accumulate chain on v0: vector mul producer, two sadalp
# that accumulate v1 into v0, and a final sadalp whose source is v0 itself.
# Expected timeline: the second sadalp issues 1 cycle after the first; the
# last one waits the full 4 cycles.
# LLVM-MCA-BEGIN sadalp
mul    v0.4s, v0.4s, v0.4s
sadalp v0.2d, v1.4s
sadalp v0.2d, v1.4s
sadalp v0.2d, v0.4s
# LLVM-MCA-END

# NEON shift-right-accumulate chain on v0: vector mul producer, two ssra that
# accumulate a shifted v1 into v0, and a final ssra whose shifted source is v0
# itself.
# Expected timeline: the second ssra issues 1 cycle after the first; the last
# one waits the full 4 cycles.
# LLVM-MCA-BEGIN ssra
mul  v0.4s, v0.4s, v0.4s
ssra v0.2d, v1.2d, #1
ssra v0.2d, v1.2d, #1
ssra v0.2d, v0.2d, #1
# LLVM-MCA-END

# NEON FP complex multiply-accumulate chain on v0: fmul producer, two
# accumulate-only fcmla, and a final fcmla that also reads v0 as a multiplicand.
# Expected timeline: the first fcmla waits the fmul's full 3 cycles, the second
# fcmla issues 2 cycles after the first, and the last waits the full 4 cycles.
# LLVM-MCA-BEGIN fcmla
fmul  v0.4s, v0.4s, v0.4s
fcmla v0.2d, v1.2d, v2.2d, #90
fcmla v0.2d, v1.2d, v2.2d, #90
fcmla v0.2d, v0.2d, v1.2d, #90
# LLVM-MCA-END

# NEON FP multiply-accumulate chain on v0, fed first by an fmul and then by an
# fadd so both kinds of producer are covered. The last fmla also reads v0 as a
# multiplicand.
# Expected timeline: fmla after fmul issues 1 cycle after the fmul; fmla after
# fadd waits the fadd's full 2 cycles; consecutive accumulating fmla issue
# 2 cycles apart; the final fmla waits the full 4 cycles.
# LLVM-MCA-BEGIN fmla
fmul v0.2d, v0.2d, v0.2d
fmla v0.2d, v1.2d, v2.2d
fadd v0.2d, v0.2d, v0.2d
fmla v0.2d, v1.2d, v2.2d
fmla v0.2d, v1.2d, v2.2d
fmla v0.2d, v0.2d, v1.2d
# LLVM-MCA-END

# NEON widening FP multiply-accumulate chain on v0, fed first by an fmul and
# then by an fadd. The last fmlal also reads v0 as a multiplicand.
# Expected timeline: unlike the fmla region, fmlal after fmul waits the fmul's
# full 3 cycles; fmlal after fadd waits the fadd's full 2 cycles; consecutive
# accumulating fmlal issue 2 cycles apart; the final fmlal waits the full
# 4 cycles.
# LLVM-MCA-BEGIN fmlal
fmul  v0.2d, v0.2d, v0.2d
fmlal v0.4s, v1.4h, v2.4h
fadd  v0.2d, v0.2d, v0.2d
fmlal v0.4s, v1.4h, v2.4h
fmlal v0.4s, v1.4h, v2.4h
fmlal v0.4s, v0.4h, v1.4h
# LLVM-MCA-END

# NEON BFloat16 dot-product chain on v0: fmul producer, two bfdot that read v0
# only as the accumulator, and a final bfdot that also reads v0 as a source
# operand.
# LLVM-MCA-BEGIN bfdot
fmul  v0.2d, v0.2d, v0.2d
bfdot v0.4s, v1.8h, v2.8h
bfdot v0.4s, v1.8h, v2.8h
bfdot v0.4s, v0.8h, v1.8h
# LLVM-MCA-END

# NEON BFloat16 matrix multiply-accumulate chain on v0: fmul producer, two
# bfmmla that read v0 only as the accumulator, and a final bfmmla that also
# reads v0 as a source operand.
# LLVM-MCA-BEGIN bfmmla
fmul   v0.2d, v0.2d, v0.2d
bfmmla v0.4s, v1.8h, v2.8h
bfmmla v0.4s, v1.8h, v2.8h
bfmmla v0.4s, v0.8h, v1.8h
# LLVM-MCA-END

# NEON BFloat16 widening multiply-accumulate chain on v0: fmul producer, two
# bfmlalb that read v0 only as the accumulator, and a final bfmlalb that also
# reads v0 as a source operand.
# LLVM-MCA-BEGIN bfmlalb
fmul    v0.2d, v0.2d, v0.2d
bfmlalb v0.4s, v1.8h, v2.8h
bfmlalb v0.4s, v1.8h, v2.8h
bfmlalb v0.4s, v0.8h, v1.8h
# LLVM-MCA-END

# CRC32 chain on w0: mul producer, two crc32b that feed w0 back as the first
# source with w1 as the second, and a final crc32b that uses w0 for both
# sources.
# LLVM-MCA-BEGIN crc32b
mul    w0, w0, w0
crc32b w0, w0, w1
crc32b w0, w0, w1
crc32b w0, w0, w0
# LLVM-MCA-END

# SVE (z-register) counterpart of the NEON saba region: mul producer, two saba
# that only accumulate into z0, and a final saba that also reads z0 as a source
# operand.
# LLVM-MCA-BEGIN Z saba
mul  z0.d, z0.d, z0.d
saba z0.d, z1.d, z2.d
saba z0.d, z1.d, z2.d
saba z0.d, z0.d, z1.d
# LLVM-MCA-END

# SVE predicated (p0/m) pairwise add-accumulate chain on z0: mul producer, two
# sadalp that accumulate z1 into z0, and a final sadalp whose source is z0
# itself.
# LLVM-MCA-BEGIN Z sadalp
mul    z0.d, z0.d, z0.d
sadalp z0.d, p0/m, z1.s
sadalp z0.d, p0/m, z1.s
sadalp z0.d, p0/m, z0.s
# LLVM-MCA-END

# SVE shift-right-accumulate chain on z0: mul producer, two ssra that
# accumulate a shifted z1 into z0, and a final ssra whose shifted source is z0
# itself.
# LLVM-MCA-BEGIN Z ssra
mul  z0.d, z0.d, z0.d
ssra z0.d, z1.d, #1
ssra z0.d, z1.d, #1
ssra z0.d, z0.d, #1
# LLVM-MCA-END

# SVE complex dot-product chain with .s accumulator elements: mul producer, two
# cdot that read z0 only as the accumulator, and a final cdot that also reads
# z0 as a source operand.
# LLVM-MCA-BEGIN Z cdot.s
mul  z0.d, z0.d, z0.d
cdot z0.s, z1.b, z2.b, #90
cdot z0.s, z1.b, z2.b, #90
cdot z0.s, z0.b, z1.b, #90
# LLVM-MCA-END

# Same chain as the "Z cdot.s" region but with .d accumulator elements and .h
# sources; the final cdot also reads z0 as a source operand.
# LLVM-MCA-BEGIN Z cdot.d
mul  z0.d, z0.d, z0.d
cdot z0.d, z1.h, z2.h, #90
cdot z0.d, z1.h, z2.h, #90
cdot z0.d, z0.h, z1.h, #90
# LLVM-MCA-END

# SVE complex integer multiply-accumulate chain with .b elements: mul producer,
# two cmla that read z0 only as the accumulator, and a final cmla that also
# reads z0 as a multiplicand.
# LLVM-MCA-BEGIN Z cmla.b
mul  z0.d, z0.d, z0.d
cmla z0.b, z1.b, z2.b, #90
cmla z0.b, z1.b, z2.b, #90
cmla z0.b, z0.b, z1.b, #90
# LLVM-MCA-END

# Same chain as the "Z cmla.b" region but with .d elements; the final cmla also
# reads z0 as a multiplicand.
# LLVM-MCA-BEGIN Z cmla.d
mul  z0.d, z0.d, z0.d
cmla z0.d, z1.d, z2.d, #90
cmla z0.d, z1.d, z2.d, #90
cmla z0.d, z0.d, z1.d, #90
# LLVM-MCA-END

# SVE signed dot-product chain with .s accumulator elements: mul producer, two
# sdot that read z0 only as the accumulator, and a final sdot that also reads
# z0 as a source operand.
# LLVM-MCA-BEGIN Z sdot.s
mul  z0.d, z0.d, z0.d
sdot z0.s, z1.b, z2.b
sdot z0.s, z1.b, z2.b
sdot z0.s, z0.b, z1.b
# LLVM-MCA-END

# SVE indexed-element dot-product chain on z0: mul producer, two accumulate-only
# dot products, and a final one that also reads z0 as a source operand.
# NOTE(review): the region is named "sudot" but the instructions are indexed
# sdot, so sudot itself does not appear to be exercised here. Confirm the
# intent; changing either the name or the mnemonics requires regenerating the
# autogenerated assertions.
# LLVM-MCA-BEGIN Z sudot
mul  z0.d, z0.d, z0.d
sdot z0.s, z1.b, z2.b[1]
sdot z0.s, z1.b, z2.b[1]
sdot z0.s, z0.b, z1.b[1]
# LLVM-MCA-END

# Same chain as the "Z sdot.s" region but with .d accumulator elements and .h
# sources; the final sdot also reads z0 as a source operand.
# LLVM-MCA-BEGIN Z sdot.d
mul  z0.d, z0.d, z0.d
sdot z0.d, z1.h, z2.h
sdot z0.d, z1.h, z2.h
sdot z0.d, z0.h, z1.h
# LLVM-MCA-END

# SVE signed matrix multiply-accumulate chain on z0: mul producer, two smmla
# that read z0 only as the accumulator, and a final smmla that also reads z0 as
# a source operand.
# LLVM-MCA-BEGIN Z smmla
mul   z0.s, z0.s, z0.s
smmla z0.s, z1.b, z2.b
smmla z0.s, z1.b, z2.b
smmla z0.s, z0.b, z1.b
# LLVM-MCA-END

# SVE predicated (p0/m) multiply-accumulate chain with .b elements: mul
# producer, two mla that read z0 only as the accumulator, and a final mla that
# also reads z0 as a multiplicand.
# LLVM-MCA-BEGIN Z mla.b
mul z0.d, z0.d, z0.d
mla z0.b, p0/m, z1.b, z2.b
mla z0.b, p0/m, z1.b, z2.b
mla z0.b, p0/m, z0.b, z1.b
# LLVM-MCA-END

# Same chain as the "Z mla.b" region but with .d elements; the final mla also
# reads z0 as a multiplicand.
# LLVM-MCA-BEGIN Z mla.d
mul z0.d, z0.d, z0.d
mla z0.d, p0/m, z1.d, z2.d
mla z0.d, p0/m, z1.d, z2.d
mla z0.d, p0/m, z0.d, z1.d
# LLVM-MCA-END

# SVE widening multiply-accumulate chain on z0: mul producer, two smlalb that
# read z0 only as the accumulator, and a final smlalb that also reads z0 as a
# multiplicand.
# LLVM-MCA-BEGIN Z smlalb
mul    z0.d, z0.d, z0.d
smlalb z0.d, z1.s, z2.s
smlalb z0.d, z1.s, z2.s
smlalb z0.d, z0.s, z1.s
# LLVM-MCA-END

# SVE saturating doubling widening multiply-accumulate chain on z0: mul
# producer, two sqdmlalb that read z0 only as the accumulator, and a final
# sqdmlalb that also reads z0 as a multiplicand.
# LLVM-MCA-BEGIN Z sqdmlalb
mul      z0.d, z0.d, z0.d
sqdmlalb z0.d, z1.s, z2.s
sqdmlalb z0.d, z1.s, z2.s
sqdmlalb z0.d, z0.s, z1.s
# LLVM-MCA-END

# SVE saturating rounding doubling multiply-accumulate chain with .b elements:
# mul producer, two sqrdmlah that read z0 only as the accumulator, and a final
# sqrdmlah that also reads z0 as a multiplicand.
# LLVM-MCA-BEGIN Z sqrdmlah.b
mul      z0.d, z0.d, z0.d
sqrdmlah z0.b, z1.b, z2.b
sqrdmlah z0.b, z1.b, z2.b
sqrdmlah z0.b, z0.b, z1.b
# LLVM-MCA-END

# Same chain as the "Z sqrdmlah.b" region but with .d elements; the final
# sqrdmlah also reads z0 as a multiplicand.
# LLVM-MCA-BEGIN Z sqrdmlah.d
mul      z0.d, z0.d, z0.d
sqrdmlah z0.d, z1.d, z2.d
sqrdmlah z0.d, z1.d, z2.d
sqrdmlah z0.d, z0.d, z1.d
# LLVM-MCA-END

# SVE predicated (p0/m) FP complex multiply-accumulate chain on z0: fmul
# producer, two fcmla that read z0 only as the accumulator, and a final fcmla
# that also reads z0 as a multiplicand.
# LLVM-MCA-BEGIN Z fcmla ZPmZZ
fmul  z0.d, z0.d, z0.d
fcmla z0.d, p0/m, z1.d, z2.d, 90
fcmla z0.d, p0/m, z1.d, z2.d, 90
fcmla z0.d, p0/m, z0.d, z1.d, 90
# LLVM-MCA-END

# SVE indexed-element FP complex multiply-accumulate chain on z0: fmul
# producer, two fcmla that read z0 only as the accumulator, and a final fcmla
# that also reads z0 as a multiplicand.
# LLVM-MCA-BEGIN Z fcmla ZZZI
fmul  z0.d, z0.d, z0.d
fcmla z0.s, z1.s, z2.s[1], 90
fcmla z0.s, z1.s, z2.s[1], 90
fcmla z0.s, z0.s, z1.s[1], 90
# LLVM-MCA-END

# SVE predicated (p0/m) FP multiply-accumulate chain on z0: fmul producer, two
# fmla that read z0 only as the accumulator, and a final fmla that also reads
# z0 as a multiplicand.
# LLVM-MCA-BEGIN Z fmla ZPmZZ
fmul z0.d, z0.d, z0.d
fmla z0.d, p0/m, z1.d, z2.d
fmla z0.d, p0/m, z1.d, z2.d
fmla z0.d, p0/m, z0.d, z1.d
# LLVM-MCA-END

# SVE indexed-element FP multiply-accumulate chain on z0: fmul producer, two
# fmla that read z0 only as the accumulator, and a final fmla that also reads
# z0 as a multiplicand.
# LLVM-MCA-BEGIN Z fmla ZZZI
fmul z0.d, z0.d, z0.d
fmla z0.d, z1.d, z2.d[1]
fmla z0.d, z1.d, z2.d[1]
fmla z0.d, z0.d, z1.d[1]
# LLVM-MCA-END

# SVE widening FP multiply-accumulate chain on z0 (unpredicated vector form):
# fmul producer, two fmlalb that read z0 only as the accumulator, and a final
# fmlalb that also reads z0 as a multiplicand.
# LLVM-MCA-BEGIN Z fmlalb ZZZ
fmul   z0.d, z0.d, z0.d
fmlalb z0.s, z1.h, z2.h
fmlalb z0.s, z1.h, z2.h
fmlalb z0.s, z0.h, z1.h
# LLVM-MCA-END

# SVE BFloat16 dot-product chain on z0: fmul producer, two bfdot that read z0
# only as the accumulator, and a final bfdot that also reads z0 as a source
# operand.
# LLVM-MCA-BEGIN Z bfdot
fmul  z0.d, z0.d, z0.d
bfdot z0.s, z1.h, z2.h
bfdot z0.s, z1.h, z2.h
bfdot z0.s, z0.h, z1.h
# LLVM-MCA-END

# SVE BFloat16 matrix multiply-accumulate chain on z0: fmul producer, two
# bfmmla that read z0 only as the accumulator, and a final bfmmla that also
# reads z0 as a source operand.
# LLVM-MCA-BEGIN Z bfmmla
fmul   z0.d, z0.d, z0.d
bfmmla z0.s, z1.h, z2.h
bfmmla z0.s, z1.h, z2.h
bfmmla z0.s, z0.h, z1.h
# LLVM-MCA-END

# SVE BFloat16 widening multiply-accumulate chain on z0: fmul producer, two
# bfmlalb that read z0 only as the accumulator, and a final bfmlalb that also
# reads z0 as a source operand.
# NOTE(review): this region reuses the name of the earlier NEON bfmlalb region
# and lacks the "Z " prefix carried by the other z-register regions, so the two
# reports share a title. Renaming it requires regenerating the autogenerated
# assertions.
# LLVM-MCA-BEGIN bfmlalb
fmul    z0.d, z0.d, z0.d
bfmlalb z0.s, z1.h, z2.h
bfmlalb z0.s, z1.h, z2.h
bfmlalb z0.s, z0.h, z1.h
# LLVM-MCA-END

# CHECK:      [0] Code Region - madd

# CHECK:      Iterations:        100
# CHECK-NEXT: Instructions:      400
# CHECK-NEXT: Total Cycles:      703
# CHECK-NEXT: Total uOps:        400

# CHECK:      Dispatch Width:    16
# CHECK-NEXT: uOps Per Cycle:    0.57
# CHECK-NEXT: IPC:               0.57
# CHECK-NEXT: Block RThroughput: 3.0

# CHECK:      Timeline view:
# CHECK-NEXT:                     0123456
# CHECK-NEXT: Index     0123456789

# CHECK:      [0,0]     DeeER.    .    ..   mul	x0, x0, x0
# CHECK-NEXT: [0,1]     D==eeER   .    ..   madd	x0, x1, x2, x0
# CHECK-NEXT: [0,2]     D===eeER  .    ..   madd	x0, x1, x2, x0
# CHECK-NEXT: [0,3]     D=====eeER.    ..   madd	x0, x0, x0, x0
# CHECK-NEXT: [1,0]     D=======eeER   ..   mul	x0, x0, x0
# CHECK-NEXT: [1,1]     D=========eeER ..   madd	x0, x1, x2, x0
# CHECK-NEXT: [1,2]     D==========eeER..   madd	x0, x1, x2, x0
# CHECK-NEXT: [1,3]     D============eeER   madd	x0, x0, x0, x0

# CHECK:      Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage

# CHECK:            [0]    [1]    [2]    [3]
# CHECK-NEXT: 0.     2     4.5    0.5    0.0       mul	x0, x0, x0
# CHECK-NEXT: 1.     2     6.5    0.0    0.0       madd	x0, x1, x2, x0
# CHECK-NEXT: 2.     2     7.5    0.0    0.0       madd	x0, x1, x2, x0
# CHECK-NEXT: 3.     2     9.5    0.0    0.0       madd	x0, x0, x0, x0
# CHECK-NEXT:        2     7.0    0.1    0.0       <total>

# CHECK:      [1] Code Region - smaddl

# CHECK:      Iterations:        100
# CHECK-NEXT: Instructions:      400
# CHECK-NEXT: Total Cycles:      703
# CHECK-NEXT: Total uOps:        400

# CHECK:      Dispatch Width:    16
# CHECK-NEXT: uOps Per Cycle:    0.57
# CHECK-NEXT: IPC:               0.57
# CHECK-NEXT: Block RThroughput: 3.0

# CHECK:      Timeline view:
# CHECK-NEXT:                     0123456
# CHECK-NEXT: Index     0123456789

# CHECK:      [0,0]     DeeER.    .    ..   mul	x0, x0, x0
# CHECK-NEXT: [0,1]     D==eeER   .    ..   smaddl	x0, w1, w2, x0
# CHECK-NEXT: [0,2]     D===eeER  .    ..   smaddl	x0, w1, w2, x0
# CHECK-NEXT: [0,3]     D=====eeER.    ..   smaddl	x0, w0, w0, x0
# CHECK-NEXT: [1,0]     D=======eeER   ..   mul	x0, x0, x0
# CHECK-NEXT: [1,1]     D=========eeER ..   smaddl	x0, w1, w2, x0
# CHECK-NEXT: [1,2]     D==========eeER..   smaddl	x0, w1, w2, x0
# CHECK-NEXT: [1,3]     D============eeER   smaddl	x0, w0, w0, x0

# CHECK:      Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage

# CHECK:            [0]    [1]    [2]    [3]
# CHECK-NEXT: 0.     2     4.5    0.5    0.0       mul	x0, x0, x0
# CHECK-NEXT: 1.     2     6.5    0.0    0.0       smaddl	x0, w1, w2, x0
# CHECK-NEXT: 2.     2     7.5    0.0    0.0       smaddl	x0, w1, w2, x0
# CHECK-NEXT: 3.     2     9.5    0.0    0.0       smaddl	x0, w0, w0, x0
# CHECK-NEXT:        2     7.0    0.1    0.0       <total>

# CHECK:      [2] Code Region - fmadd

# CHECK:      Iterations:        100
# CHECK-NEXT: Instructions:      600
# CHECK-NEXT: Total Cycles:      1703
# CHECK-NEXT: Total uOps:        600

# CHECK:      Dispatch Width:    16
# CHECK-NEXT: uOps Per Cycle:    0.35
# CHECK-NEXT: IPC:               0.35
# CHECK-NEXT: Block RThroughput: 1.5

# CHECK:      Timeline view:
# CHECK-NEXT:                     0123456789          0123456
# CHECK-NEXT: Index     0123456789          0123456789

# CHECK:      [0,0]     DeeER.    .    .    .    .    .    ..   fadd	d0, d0, d0
# CHECK-NEXT: [0,1]     D==eeeeER .    .    .    .    .    ..   fmadd	d0, d1, d2, d0
# CHECK-NEXT: [0,2]     D======eeeER   .    .    .    .    ..   fmul	d0, d0, d0
# CHECK-NEXT: [0,3]     D=======eeeeER .    .    .    .    ..   fmadd	d0, d1, d2, d0
# CHECK-NEXT: [0,4]     D=========eeeeER    .    .    .    ..   fmadd	d0, d1, d2, d0
# CHECK-NEXT: [0,5]     D=============eeeeER.    .    .    ..   fmadd	d0, d0, d1, d2
# CHECK-NEXT: [1,0]     D=================eeER   .    .    ..   fadd	d0, d0, d0
# CHECK-NEXT: [1,1]     D===================eeeeER    .    ..   fmadd	d0, d1, d2, d0
# CHECK-NEXT: [1,2]     D=======================eeeER .    ..   fmul	d0, d0, d0
# CHECK-NEXT: [1,3]     D========================eeeeER    ..   fmadd	d0, d1, d2, d0
# CHECK-NEXT: [1,4]     D==========================eeeeER  ..   fmadd	d0, d1, d2, d0
# CHECK-NEXT: [1,5]     D==============================eeeeER   fmadd	d0, d0, d1, d2

# CHECK:      Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage

# CHECK:            [0]    [1]    [2]    [3]
# CHECK-NEXT: 0.     2     9.5    0.5    0.0       fadd	d0, d0, d0
# CHECK-NEXT: 1.     2     11.5   0.0    0.0       fmadd	d0, d1, d2, d0
# CHECK-NEXT: 2.     2     15.5   0.0    0.0       fmul	d0, d0, d0
# CHECK-NEXT: 3.     2     16.5   0.0    0.0       fmadd	d0, d1, d2, d0
# CHECK-NEXT: 4.     2     18.5   0.0    0.0       fmadd	d0, d1, d2, d0
# CHECK-NEXT: 5.     2     22.5   0.0    0.0       fmadd	d0, d0, d1, d2
# CHECK-NEXT:        2     15.7   0.1    0.0       <total>

# CHECK:      [3] Code Region - saba

# CHECK:      Iterations:        100
# CHECK-NEXT: Instructions:      400
# CHECK-NEXT: Total Cycles:      1303
# CHECK-NEXT: Total uOps:        400

# CHECK:      Dispatch Width:    16
# CHECK-NEXT: uOps Per Cycle:    0.31
# CHECK-NEXT: IPC:               0.31
# CHECK-NEXT: Block RThroughput: 1.5

# CHECK:      Timeline view:
# CHECK-NEXT:                     0123456789
# CHECK-NEXT: Index     0123456789          012345678

# CHECK:      [0,0]     DeeeeER   .    .    .    .  .   mul	v0.4s, v0.4s, v0.4s
# CHECK-NEXT: [0,1]     D====eeeeER    .    .    .  .   saba	v0.4s, v1.4s, v2.4s
# CHECK-NEXT: [0,2]     D=====eeeeER   .    .    .  .   saba	v0.4s, v1.4s, v2.4s
# CHECK-NEXT: [0,3]     D=========eeeeER    .    .  .   saba	v0.4s, v0.4s, v1.4s
# CHECK-NEXT: [1,0]     D=============eeeeER.    .  .   mul	v0.4s, v0.4s, v0.4s
# CHECK-NEXT: [1,1]     D=================eeeeER .  .   saba	v0.4s, v1.4s, v2.4s
# CHECK-NEXT: [1,2]     D==================eeeeER.  .   saba	v0.4s, v1.4s, v2.4s
# CHECK-NEXT: [1,3]     D======================eeeeER   saba	v0.4s, v0.4s, v1.4s

# CHECK:      Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage

# CHECK:            [0]    [1]    [2]    [3]
# CHECK-NEXT: 0.     2     7.5    0.5    0.0       mul	v0.4s, v0.4s, v0.4s
# CHECK-NEXT: 1.     2     11.5   0.0    0.0       saba	v0.4s, v1.4s, v2.4s
# CHECK-NEXT: 2.     2     12.5   0.0    0.0       saba	v0.4s, v1.4s, v2.4s
# CHECK-NEXT: 3.     2     16.5   0.0    0.0       saba	v0.4s, v0.4s, v1.4s
# CHECK-NEXT:        2     12.0   0.1    0.0       <total>

# CHECK:      [4] Code Region - sdot

# CHECK:      Iterations:        100
# CHECK-NEXT: Instructions:      400
# CHECK-NEXT: Total Cycles:      1103
# CHECK-NEXT: Total uOps:        400

# CHECK:      Dispatch Width:    16
# CHECK-NEXT: uOps Per Cycle:    0.36
# CHECK-NEXT: IPC:               0.36
# CHECK-NEXT: Block RThroughput: 0.8

# CHECK:      Timeline view:
# CHECK-NEXT:                     0123456789
# CHECK-NEXT: Index     0123456789          01234

# CHECK:      [0,0]     DeeeeER   .    .    .   .   mul	v0.4s, v0.4s, v0.4s
# CHECK-NEXT: [0,1]     D====eeeER.    .    .   .   sdot	v0.4s, v1.16b, v2.16b
# CHECK-NEXT: [0,2]     D=====eeeER    .    .   .   sdot	v0.4s, v1.16b, v2.16b
# CHECK-NEXT: [0,3]     D========eeeER .    .   .   sdot	v0.4s, v0.16b, v1.16b
# CHECK-NEXT: [1,0]     D===========eeeeER  .   .   mul	v0.4s, v0.4s, v0.4s
# CHECK-NEXT: [1,1]     D===============eeeER   .   sdot	v0.4s, v1.16b, v2.16b
# CHECK-NEXT: [1,2]     D================eeeER  .   sdot	v0.4s, v1.16b, v2.16b
# CHECK-NEXT: [1,3]     D===================eeeER   sdot	v0.4s, v0.16b, v1.16b

# CHECK:      Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage

# CHECK:            [0]    [1]    [2]    [3]
# CHECK-NEXT: 0.     2     6.5    0.5    0.0       mul	v0.4s, v0.4s, v0.4s
# CHECK-NEXT: 1.     2     10.5   0.0    0.0       sdot	v0.4s, v1.16b, v2.16b
# CHECK-NEXT: 2.     2     11.5   0.0    0.0       sdot	v0.4s, v1.16b, v2.16b
# CHECK-NEXT: 3.     2     14.5   0.0    0.0       sdot	v0.4s, v0.16b, v1.16b
# CHECK-NEXT:        2     10.8   0.1    0.0       <total>

# CHECK:      [5] Code Region - smmla

# CHECK:      Iterations:        100
# CHECK-NEXT: Instructions:      400
# CHECK-NEXT: Total Cycles:      1103
# CHECK-NEXT: Total uOps:        400

# CHECK:      Dispatch Width:    16
# CHECK-NEXT: uOps Per Cycle:    0.36
# CHECK-NEXT: IPC:               0.36
# CHECK-NEXT: Block RThroughput: 0.8

# CHECK:      Timeline view:
# CHECK-NEXT:                     0123456789
# CHECK-NEXT: Index     0123456789          01234

# CHECK:      [0,0]     DeeeeER   .    .    .   .   mul	v0.4s, v0.4s, v0.4s
# CHECK-NEXT: [0,1]     D====eeeER.    .    .   .   smmla	v0.4s, v1.16b, v2.16b
# CHECK-NEXT: [0,2]     D=====eeeER    .    .   .   smmla	v0.4s, v1.16b, v2.16b
# CHECK-NEXT: [0,3]     D========eeeER .    .   .   smmla	v0.4s, v0.16b, v1.16b
# CHECK-NEXT: [1,0]     D===========eeeeER  .   .   mul	v0.4s, v0.4s, v0.4s
# CHECK-NEXT: [1,1]     D===============eeeER   .   smmla	v0.4s, v1.16b, v2.16b
# CHECK-NEXT: [1,2]     D================eeeER  .   smmla	v0.4s, v1.16b, v2.16b
# CHECK-NEXT: [1,3]     D===================eeeER   smmla	v0.4s, v0.16b, v1.16b

# CHECK:      Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage

# CHECK:            [0]    [1]    [2]    [3]
# CHECK-NEXT: 0.     2     6.5    0.5    0.0       mul	v0.4s, v0.4s, v0.4s
# CHECK-NEXT: 1.     2     10.5   0.0    0.0       smmla	v0.4s, v1.16b, v2.16b
# CHECK-NEXT: 2.     2     11.5   0.0    0.0       smmla	v0.4s, v1.16b, v2.16b
# CHECK-NEXT: 3.     2     14.5   0.0    0.0       smmla	v0.4s, v0.16b, v1.16b
# CHECK-NEXT:        2     10.8   0.1    0.0       <total>

# CHECK:      [6] Code Region - mla

# CHECK:      Iterations:        100
# CHECK-NEXT: Instructions:      400
# CHECK-NEXT: Total Cycles:      1303
# CHECK-NEXT: Total uOps:        400

# CHECK:      Dispatch Width:    16
# CHECK-NEXT: uOps Per Cycle:    0.31
# CHECK-NEXT: IPC:               0.31
# CHECK-NEXT: Block RThroughput: 2.0

# CHECK:      Timeline view:
# CHECK-NEXT:                     0123456789
# CHECK-NEXT: Index     0123456789          012345678

# CHECK:      [0,0]     DeeeeER   .    .    .    .  .   mul	v0.4s, v0.4s, v0.4s
# CHECK-NEXT: [0,1]     D====eeeeER    .    .    .  .   mla	v0.4s, v1.4s, v2.4s
# CHECK-NEXT: [0,2]     D=====eeeeER   .    .    .  .   mla	v0.4s, v1.4s, v2.4s
# CHECK-NEXT: [0,3]     D=========eeeeER    .    .  .   mla	v0.4s, v0.4s, v1.4s
# CHECK-NEXT: [1,0]     D=============eeeeER.    .  .   mul	v0.4s, v0.4s, v0.4s
# CHECK-NEXT: [1,1]     D=================eeeeER .  .   mla	v0.4s, v1.4s, v2.4s
# CHECK-NEXT: [1,2]     D==================eeeeER.  .   mla	v0.4s, v1.4s, v2.4s
# CHECK-NEXT: [1,3]     D======================eeeeER   mla	v0.4s, v0.4s, v1.4s

# CHECK:      Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage

# CHECK:            [0]    [1]    [2]    [3]
# CHECK-NEXT: 0.     2     7.5    0.5    0.0       mul	v0.4s, v0.4s, v0.4s
# CHECK-NEXT: 1.     2     11.5   0.0    0.0       mla	v0.4s, v1.4s, v2.4s
# CHECK-NEXT: 2.     2     12.5   0.0    0.0       mla	v0.4s, v1.4s, v2.4s
# CHECK-NEXT: 3.     2     16.5   0.0    0.0       mla	v0.4s, v0.4s, v1.4s
# CHECK-NEXT:        2     12.0   0.1    0.0       <total>

# CHECK:      [7] Code Region - sqrdmlah

# CHECK:      Iterations:        100
# CHECK-NEXT: Instructions:      400
# CHECK-NEXT: Total Cycles:      1403
# CHECK-NEXT: Total uOps:        400

# CHECK:      Dispatch Width:    16
# CHECK-NEXT: uOps Per Cycle:    0.29
# CHECK-NEXT: IPC:               0.29
# CHECK-NEXT: Block RThroughput: 3.5

# CHECK:      Timeline view:
# CHECK-NEXT:                     0123456789          0
# CHECK-NEXT: Index     0123456789          0123456789

# CHECK:      [0,0]     DeeeeER   .    .    .    .    .   mul	v0.4s, v0.4s, v0.4s
# CHECK-NEXT: [0,1]     D====eeeeER    .    .    .    .   sqrdmlah	v0.4s, v1.4s, v2.4s
# CHECK-NEXT: [0,2]     D======eeeeER  .    .    .    .   sqrdmlah	v0.4s, v1.4s, v2.4s
# CHECK-NEXT: [0,3]     D==========eeeeER   .    .    .   sqrdmlah	v0.4s, v0.4s, v1.4s
# CHECK-NEXT: [1,0]     D==============eeeeER    .    .   mul	v0.4s, v0.4s, v0.4s
# CHECK-NEXT: [1,1]     D==================eeeeER.    .   sqrdmlah	v0.4s, v1.4s, v2.4s
# CHECK-NEXT: [1,2]     D====================eeeeER   .   sqrdmlah	v0.4s, v1.4s, v2.4s
# CHECK-NEXT: [1,3]     D========================eeeeER   sqrdmlah	v0.4s, v0.4s, v1.4s

# CHECK:      Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage

# CHECK:            [0]    [1]    [2]    [3]
# CHECK-NEXT: 0.     2     8.0    0.5    0.0       mul	v0.4s, v0.4s, v0.4s
# CHECK-NEXT: 1.     2     12.0   0.0    0.0       sqrdmlah	v0.4s, v1.4s, v2.4s
# CHECK-NEXT: 2.     2     14.0   0.0    0.0       sqrdmlah	v0.4s, v1.4s, v2.4s
# CHECK-NEXT: 3.     2     18.0   0.0    0.0       sqrdmlah	v0.4s, v0.4s, v1.4s
# CHECK-NEXT:        2     13.0   0.1    0.0       <total>

# CHECK:      [8] Code Region - smlal2

# CHECK:      Iterations:        100
# CHECK-NEXT: Instructions:      400
# CHECK-NEXT: Total Cycles:      1303
# CHECK-NEXT: Total uOps:        400

# CHECK:      Dispatch Width:    16
# CHECK-NEXT: uOps Per Cycle:    0.31
# CHECK-NEXT: IPC:               0.31
# CHECK-NEXT: Block RThroughput: 2.0

# CHECK:      Timeline view:
# CHECK-NEXT:                     0123456789
# CHECK-NEXT: Index     0123456789          012345678

# CHECK:      [0,0]     DeeeeER   .    .    .    .  .   mul	v0.4s, v0.4s, v0.4s
# CHECK-NEXT: [0,1]     D====eeeeER    .    .    .  .   smlal2	v0.4s, v1.8h, v2.8h
# CHECK-NEXT: [0,2]     D=====eeeeER   .    .    .  .   smlal2	v0.4s, v1.8h, v2.8h
# CHECK-NEXT: [0,3]     D=========eeeeER    .    .  .   smlal2	v0.4s, v0.8h, v1.8h
# CHECK-NEXT: [1,0]     D=============eeeeER.    .  .   mul	v0.4s, v0.4s, v0.4s
# CHECK-NEXT: [1,1]     D=================eeeeER .  .   smlal2	v0.4s, v1.8h, v2.8h
# CHECK-NEXT: [1,2]     D==================eeeeER.  .   smlal2	v0.4s, v1.8h, v2.8h
# CHECK-NEXT: [1,3]     D======================eeeeER   smlal2	v0.4s, v0.8h, v1.8h

# CHECK:      Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage

# CHECK:            [0]    [1]    [2]    [3]
# CHECK-NEXT: 0.     2     7.5    0.5    0.0       mul	v0.4s, v0.4s, v0.4s
# CHECK-NEXT: 1.     2     11.5   0.0    0.0       smlal2	v0.4s, v1.8h, v2.8h
# CHECK-NEXT: 2.     2     12.5   0.0    0.0       smlal2	v0.4s, v1.8h, v2.8h
# CHECK-NEXT: 3.     2     16.5   0.0    0.0       smlal2	v0.4s, v0.8h, v1.8h
# CHECK-NEXT:        2     12.0   0.1    0.0       <total>

# CHECK:      [9] Code Region - sadalp

# CHECK:      Iterations:        100
# CHECK-NEXT: Instructions:      400
# CHECK-NEXT: Total Cycles:      1303
# CHECK-NEXT: Total uOps:        400

# CHECK:      Dispatch Width:    16
# CHECK-NEXT: uOps Per Cycle:    0.31
# CHECK-NEXT: IPC:               0.31
# CHECK-NEXT: Block RThroughput: 1.5

# CHECK:      Timeline view:
# CHECK-NEXT:                     0123456789
# CHECK-NEXT: Index     0123456789          012345678

# CHECK:      [0,0]     DeeeeER   .    .    .    .  .   mul	v0.4s, v0.4s, v0.4s
# CHECK-NEXT: [0,1]     D====eeeeER    .    .    .  .   sadalp	v0.2d, v1.4s
# CHECK-NEXT: [0,2]     D=====eeeeER   .    .    .  .   sadalp	v0.2d, v1.4s
# CHECK-NEXT: [0,3]     D=========eeeeER    .    .  .   sadalp	v0.2d, v0.4s
# CHECK-NEXT: [1,0]     D=============eeeeER.    .  .   mul	v0.4s, v0.4s, v0.4s
# CHECK-NEXT: [1,1]     D=================eeeeER .  .   sadalp	v0.2d, v1.4s
# CHECK-NEXT: [1,2]     D==================eeeeER.  .   sadalp	v0.2d, v1.4s
# CHECK-NEXT: [1,3]     D======================eeeeER   sadalp	v0.2d, v0.4s

# CHECK:      Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage

# CHECK:            [0]    [1]    [2]    [3]
# CHECK-NEXT: 0.     2     7.5    0.5    0.0       mul	v0.4s, v0.4s, v0.4s
# CHECK-NEXT: 1.     2     11.5   0.0    0.0       sadalp	v0.2d, v1.4s
# CHECK-NEXT: 2.     2     12.5   0.0    0.0       sadalp	v0.2d, v1.4s
# CHECK-NEXT: 3.     2     16.5   0.0    0.0       sadalp	v0.2d, v0.4s
# CHECK-NEXT:        2     12.0   0.1    0.0       <total>

# CHECK:      [10] Code Region - ssra

# CHECK:      Iterations:        100
# CHECK-NEXT: Instructions:      400
# CHECK-NEXT: Total Cycles:      1303
# CHECK-NEXT: Total uOps:        400

# CHECK:      Dispatch Width:    16
# CHECK-NEXT: uOps Per Cycle:    0.31
# CHECK-NEXT: IPC:               0.31
# CHECK-NEXT: Block RThroughput: 1.5

# CHECK:      Timeline view:
# CHECK-NEXT:                     0123456789
# CHECK-NEXT: Index     0123456789          012345678

# CHECK:      [0,0]     DeeeeER   .    .    .    .  .   mul	v0.4s, v0.4s, v0.4s
# CHECK-NEXT: [0,1]     D====eeeeER    .    .    .  .   ssra	v0.2d, v1.2d, #1
# CHECK-NEXT: [0,2]     D=====eeeeER   .    .    .  .   ssra	v0.2d, v1.2d, #1
# CHECK-NEXT: [0,3]     D=========eeeeER    .    .  .   ssra	v0.2d, v0.2d, #1
# CHECK-NEXT: [1,0]     D=============eeeeER.    .  .   mul	v0.4s, v0.4s, v0.4s
# CHECK-NEXT: [1,1]     D=================eeeeER .  .   ssra	v0.2d, v1.2d, #1
# CHECK-NEXT: [1,2]     D==================eeeeER.  .   ssra	v0.2d, v1.2d, #1
# CHECK-NEXT: [1,3]     D======================eeeeER   ssra	v0.2d, v0.2d, #1

# CHECK:      Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage

# CHECK:            [0]    [1]    [2]    [3]
# CHECK-NEXT: 0.     2     7.5    0.5    0.0       mul	v0.4s, v0.4s, v0.4s
# CHECK-NEXT: 1.     2     11.5   0.0    0.0       ssra	v0.2d, v1.2d, #1
# CHECK-NEXT: 2.     2     12.5   0.0    0.0       ssra	v0.2d, v1.2d, #1
# CHECK-NEXT: 3.     2     16.5   0.0    0.0       ssra	v0.2d, v0.2d, #1
# CHECK-NEXT:        2     12.0   0.1    0.0       <total>

# CHECK:      [11] Code Region - fcmla

# CHECK:      Iterations:        100
# CHECK-NEXT: Instructions:      400
# CHECK-NEXT: Total Cycles:      1303
# CHECK-NEXT: Total uOps:        400

# CHECK:      Dispatch Width:    16
# CHECK-NEXT: uOps Per Cycle:    0.31
# CHECK-NEXT: IPC:               0.31
# CHECK-NEXT: Block RThroughput: 1.0

# CHECK:      Timeline view:
# CHECK-NEXT:                     0123456789
# CHECK-NEXT: Index     0123456789          012345678

# CHECK:      [0,0]     DeeeER    .    .    .    .  .   fmul	v0.4s, v0.4s, v0.4s
# CHECK-NEXT: [0,1]     D===eeeeER.    .    .    .  .   fcmla	v0.2d, v1.2d, v2.2d, #90
# CHECK-NEXT: [0,2]     D=====eeeeER   .    .    .  .   fcmla	v0.2d, v1.2d, v2.2d, #90
# CHECK-NEXT: [0,3]     D=========eeeeER    .    .  .   fcmla	v0.2d, v0.2d, v1.2d, #90
# CHECK-NEXT: [1,0]     D=============eeeER .    .  .   fmul	v0.4s, v0.4s, v0.4s
# CHECK-NEXT: [1,1]     D================eeeeER  .  .   fcmla	v0.2d, v1.2d, v2.2d, #90
# CHECK-NEXT: [1,2]     D==================eeeeER.  .   fcmla	v0.2d, v1.2d, v2.2d, #90
# CHECK-NEXT: [1,3]     D======================eeeeER   fcmla	v0.2d, v0.2d, v1.2d, #90

# CHECK:      Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage

# CHECK:            [0]    [1]    [2]    [3]
# CHECK-NEXT: 0.     2     7.5    0.5    0.0       fmul	v0.4s, v0.4s, v0.4s
# CHECK-NEXT: 1.     2     10.5   0.0    0.0       fcmla	v0.2d, v1.2d, v2.2d, #90
# CHECK-NEXT: 2.     2     12.5   0.0    0.0       fcmla	v0.2d, v1.2d, v2.2d, #90
# CHECK-NEXT: 3.     2     16.5   0.0    0.0       fcmla	v0.2d, v0.2d, v1.2d, #90
# CHECK-NEXT:        2     11.8   0.1    0.0       <total>

# CHECK:      [12] Code Region - fmla

# CHECK:      Iterations:        100
# CHECK-NEXT: Instructions:      600
# CHECK-NEXT: Total Cycles:      1703
# CHECK-NEXT: Total uOps:        600

# CHECK:      Dispatch Width:    16
# CHECK-NEXT: uOps Per Cycle:    0.35
# CHECK-NEXT: IPC:               0.35
# CHECK-NEXT: Block RThroughput: 1.5

# CHECK:      Timeline view:
# CHECK-NEXT:                     0123456789          0123456
# CHECK-NEXT: Index     0123456789          0123456789

# CHECK:      [0,0]     DeeeER    .    .    .    .    .    ..   fmul	v0.2d, v0.2d, v0.2d
# CHECK-NEXT: [0,1]     D=eeeeER  .    .    .    .    .    ..   fmla	v0.2d, v1.2d, v2.2d
# CHECK-NEXT: [0,2]     D=====eeER.    .    .    .    .    ..   fadd	v0.2d, v0.2d, v0.2d
# CHECK-NEXT: [0,3]     D=======eeeeER .    .    .    .    ..   fmla	v0.2d, v1.2d, v2.2d
# CHECK-NEXT: [0,4]     D=========eeeeER    .    .    .    ..   fmla	v0.2d, v1.2d, v2.2d
# CHECK-NEXT: [0,5]     D=============eeeeER.    .    .    ..   fmla	v0.2d, v0.2d, v1.2d
# CHECK-NEXT: [1,0]     D=================eeeER  .    .    ..   fmul	v0.2d, v0.2d, v0.2d
# CHECK-NEXT: [1,1]     D==================eeeeER.    .    ..   fmla	v0.2d, v1.2d, v2.2d
# CHECK-NEXT: [1,2]     D======================eeER   .    ..   fadd	v0.2d, v0.2d, v0.2d
# CHECK-NEXT: [1,3]     D========================eeeeER    ..   fmla	v0.2d, v1.2d, v2.2d
# CHECK-NEXT: [1,4]     D==========================eeeeER  ..   fmla	v0.2d, v1.2d, v2.2d
# CHECK-NEXT: [1,5]     D==============================eeeeER   fmla	v0.2d, v0.2d, v1.2d

# CHECK:      Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage

# CHECK:            [0]    [1]    [2]    [3]
# CHECK-NEXT: 0.     2     9.5    0.5    0.0       fmul	v0.2d, v0.2d, v0.2d
# CHECK-NEXT: 1.     2     10.5   0.0    0.0       fmla	v0.2d, v1.2d, v2.2d
# CHECK-NEXT: 2.     2     14.5   0.0    0.0       fadd	v0.2d, v0.2d, v0.2d
# CHECK-NEXT: 3.     2     16.5   0.0    0.0       fmla	v0.2d, v1.2d, v2.2d
# CHECK-NEXT: 4.     2     18.5   0.0    0.0       fmla	v0.2d, v1.2d, v2.2d
# CHECK-NEXT: 5.     2     22.5   0.0    0.0       fmla	v0.2d, v0.2d, v1.2d
# CHECK-NEXT:        2     15.3   0.1    0.0       <total>

# CHECK:      [13] Code Region - fmlal

# CHECK:      Iterations:        100
# CHECK-NEXT: Instructions:      600
# CHECK-NEXT: Total Cycles:      1903
# CHECK-NEXT: Total uOps:        600

# CHECK:      Dispatch Width:    16
# CHECK-NEXT: uOps Per Cycle:    0.32
# CHECK-NEXT: IPC:               0.32
# CHECK-NEXT: Block RThroughput: 1.5

# CHECK:      Timeline view:
# CHECK-NEXT:                     0123456789          0123456789
# CHECK-NEXT: Index     0123456789          0123456789          0

# CHECK:      [0,0]     DeeeER    .    .    .    .    .    .    .   fmul	v0.2d, v0.2d, v0.2d
# CHECK-NEXT: [0,1]     D===eeeeER.    .    .    .    .    .    .   fmlal	v0.4s, v1.4h, v2.4h
# CHECK-NEXT: [0,2]     D=======eeER   .    .    .    .    .    .   fadd	v0.2d, v0.2d, v0.2d
# CHECK-NEXT: [0,3]     D=========eeeeER    .    .    .    .    .   fmlal	v0.4s, v1.4h, v2.4h
# CHECK-NEXT: [0,4]     D===========eeeeER  .    .    .    .    .   fmlal	v0.4s, v1.4h, v2.4h
# CHECK-NEXT: [0,5]     D===============eeeeER   .    .    .    .   fmlal	v0.4s, v0.4h, v1.4h
# CHECK-NEXT: [1,0]     D===================eeeER.    .    .    .   fmul	v0.2d, v0.2d, v0.2d
# CHECK-NEXT: [1,1]     D======================eeeeER .    .    .   fmlal	v0.4s, v1.4h, v2.4h
# CHECK-NEXT: [1,2]     D==========================eeER    .    .   fadd	v0.2d, v0.2d, v0.2d
# CHECK-NEXT: [1,3]     D============================eeeeER.    .   fmlal	v0.4s, v1.4h, v2.4h
# CHECK-NEXT: [1,4]     D==============================eeeeER   .   fmlal	v0.4s, v1.4h, v2.4h
# CHECK-NEXT: [1,5]     D==================================eeeeER   fmlal	v0.4s, v0.4h, v1.4h

# CHECK:      Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage

# CHECK:            [0]    [1]    [2]    [3]
# CHECK-NEXT: 0.     2     10.5   0.5    0.0       fmul	v0.2d, v0.2d, v0.2d
# CHECK-NEXT: 1.     2     13.5   0.0    0.0       fmlal	v0.4s, v1.4h, v2.4h
# CHECK-NEXT: 2.     2     17.5   0.0    0.0       fadd	v0.2d, v0.2d, v0.2d
# CHECK-NEXT: 3.     2     19.5   0.0    0.0       fmlal	v0.4s, v1.4h, v2.4h
# CHECK-NEXT: 4.     2     21.5   0.0    0.0       fmlal	v0.4s, v1.4h, v2.4h
# CHECK-NEXT: 5.     2     25.5   0.0    0.0       fmlal	v0.4s, v0.4h, v1.4h
# CHECK-NEXT:        2     18.0   0.1    0.0       <total>

# CHECK:      [14] Code Region - bfdot

# CHECK:      Iterations:        100
# CHECK-NEXT: Instructions:      400
# CHECK-NEXT: Total Cycles:      1603
# CHECK-NEXT: Total uOps:        400

# CHECK:      Dispatch Width:    16
# CHECK-NEXT: uOps Per Cycle:    0.25
# CHECK-NEXT: IPC:               0.25
# CHECK-NEXT: Block RThroughput: 1.0

# CHECK:      Timeline view:
# CHECK-NEXT:                     0123456789          01234
# CHECK-NEXT: Index     0123456789          0123456789

# CHECK:      [0,0]     DeeeER    .    .    .    .    .   .   fmul	v0.2d, v0.2d, v0.2d
# CHECK-NEXT: [0,1]     D===eeeeeER    .    .    .    .   .   bfdot	v0.4s, v1.8h, v2.8h
# CHECK-NEXT: [0,2]     D======eeeeeER .    .    .    .   .   bfdot	v0.4s, v1.8h, v2.8h
# CHECK-NEXT: [0,3]     D===========eeeeeER .    .    .   .   bfdot	v0.4s, v0.8h, v1.8h
# CHECK-NEXT: [1,0]     D================eeeER   .    .   .   fmul	v0.2d, v0.2d, v0.2d
# CHECK-NEXT: [1,1]     D===================eeeeeER   .   .   bfdot	v0.4s, v1.8h, v2.8h
# CHECK-NEXT: [1,2]     D======================eeeeeER.   .   bfdot	v0.4s, v1.8h, v2.8h
# CHECK-NEXT: [1,3]     D===========================eeeeeER   bfdot	v0.4s, v0.8h, v1.8h

# CHECK:      Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage

# CHECK:            [0]    [1]    [2]    [3]
# CHECK-NEXT: 0.     2     9.0    0.5    0.0       fmul	v0.2d, v0.2d, v0.2d
# CHECK-NEXT: 1.     2     12.0   0.0    0.0       bfdot	v0.4s, v1.8h, v2.8h
# CHECK-NEXT: 2.     2     15.0   0.0    0.0       bfdot	v0.4s, v1.8h, v2.8h
# CHECK-NEXT: 3.     2     20.0   0.0    0.0       bfdot	v0.4s, v0.8h, v1.8h
# CHECK-NEXT:        2     14.0   0.1    0.0       <total>

# CHECK:      [15] Code Region - bfmmla

# CHECK:      Iterations:        100
# CHECK-NEXT: Instructions:      400
# CHECK-NEXT: Total Cycles:      1903
# CHECK-NEXT: Total uOps:        400

# CHECK:      Dispatch Width:    16
# CHECK-NEXT: uOps Per Cycle:    0.21
# CHECK-NEXT: IPC:               0.21
# CHECK-NEXT: Block RThroughput: 1.0

# CHECK:      Timeline view:
# CHECK-NEXT:                     0123456789          0123456789
# CHECK-NEXT: Index     0123456789          0123456789          0

# CHECK:      [0,0]     DeeeER    .    .    .    .    .    .    .   fmul	v0.2d, v0.2d, v0.2d
# CHECK-NEXT: [0,1]     D===eeeeeeER   .    .    .    .    .    .   bfmmla	v0.4s, v1.8h, v2.8h
# CHECK-NEXT: [0,2]     D=======eeeeeeER    .    .    .    .    .   bfmmla	v0.4s, v1.8h, v2.8h
# CHECK-NEXT: [0,3]     D=============eeeeeeER   .    .    .    .   bfmmla	v0.4s, v0.8h, v1.8h
# CHECK-NEXT: [1,0]     D===================eeeER.    .    .    .   fmul	v0.2d, v0.2d, v0.2d
# CHECK-NEXT: [1,1]     D======================eeeeeeER    .    .   bfmmla	v0.4s, v1.8h, v2.8h
# CHECK-NEXT: [1,2]     D==========================eeeeeeER.    .   bfmmla	v0.4s, v1.8h, v2.8h
# CHECK-NEXT: [1,3]     D================================eeeeeeER   bfmmla	v0.4s, v0.8h, v1.8h

# CHECK:      Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage

# CHECK:            [0]    [1]    [2]    [3]
# CHECK-NEXT: 0.     2     10.5   0.5    0.0       fmul	v0.2d, v0.2d, v0.2d
# CHECK-NEXT: 1.     2     13.5   0.0    0.0       bfmmla	v0.4s, v1.8h, v2.8h
# CHECK-NEXT: 2.     2     17.5   0.0    0.0       bfmmla	v0.4s, v1.8h, v2.8h
# CHECK-NEXT: 3.     2     23.5   0.0    0.0       bfmmla	v0.4s, v0.8h, v1.8h
# CHECK-NEXT:        2     16.3   0.1    0.0       <total>

# CHECK:      [16] Code Region - bfmlalb

# CHECK:      Iterations:        100
# CHECK-NEXT: Instructions:      400
# CHECK-NEXT: Total Cycles:      1503
# CHECK-NEXT: Total uOps:        400

# CHECK:      Dispatch Width:    16
# CHECK-NEXT: uOps Per Cycle:    0.27
# CHECK-NEXT: IPC:               0.27
# CHECK-NEXT: Block RThroughput: 1.0

# CHECK:      Timeline view:
# CHECK-NEXT:                     0123456789          012
# CHECK-NEXT: Index     0123456789          0123456789

# CHECK:      [0,0]     DeeeER    .    .    .    .    . .   fmul	v0.2d, v0.2d, v0.2d
# CHECK-NEXT: [0,1]     D===eeeeeER    .    .    .    . .   bfmlalb	v0.4s, v1.8h, v2.8h
# CHECK-NEXT: [0,2]     D=====eeeeeER  .    .    .    . .   bfmlalb	v0.4s, v1.8h, v2.8h
# CHECK-NEXT: [0,3]     D==========eeeeeER  .    .    . .   bfmlalb	v0.4s, v0.8h, v1.8h
# CHECK-NEXT: [1,0]     D===============eeeER    .    . .   fmul	v0.2d, v0.2d, v0.2d
# CHECK-NEXT: [1,1]     D==================eeeeeER    . .   bfmlalb	v0.4s, v1.8h, v2.8h
# CHECK-NEXT: [1,2]     D====================eeeeeER  . .   bfmlalb	v0.4s, v1.8h, v2.8h
# CHECK-NEXT: [1,3]     D=========================eeeeeER   bfmlalb	v0.4s, v0.8h, v1.8h

# CHECK:      Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage

# CHECK:            [0]    [1]    [2]    [3]
# CHECK-NEXT: 0.     2     8.5    0.5    0.0       fmul	v0.2d, v0.2d, v0.2d
# CHECK-NEXT: 1.     2     11.5   0.0    0.0       bfmlalb	v0.4s, v1.8h, v2.8h
# CHECK-NEXT: 2.     2     13.5   0.0    0.0       bfmlalb	v0.4s, v1.8h, v2.8h
# CHECK-NEXT: 3.     2     18.5   0.0    0.0       bfmlalb	v0.4s, v0.8h, v1.8h
# CHECK-NEXT:        2     13.0   0.1    0.0       <total>

# CHECK:      [17] Code Region - crc32b

# CHECK:      Iterations:        100
# CHECK-NEXT: Instructions:      400
# CHECK-NEXT: Total Cycles:      703
# CHECK-NEXT: Total uOps:        400

# CHECK:      Dispatch Width:    16
# CHECK-NEXT: uOps Per Cycle:    0.57
# CHECK-NEXT: IPC:               0.57
# CHECK-NEXT: Block RThroughput: 3.0

# CHECK:      Timeline view:
# CHECK-NEXT:                     0123456
# CHECK-NEXT: Index     0123456789

# CHECK:      [0,0]     DeeER.    .    ..   mul	w0, w0, w0
# CHECK-NEXT: [0,1]     D==eeER   .    ..   crc32b	w0, w0, w1
# CHECK-NEXT: [0,2]     D===eeER  .    ..   crc32b	w0, w0, w1
# CHECK-NEXT: [0,3]     D=====eeER.    ..   crc32b	w0, w0, w0
# CHECK-NEXT: [1,0]     D=======eeER   ..   mul	w0, w0, w0
# CHECK-NEXT: [1,1]     D=========eeER ..   crc32b	w0, w0, w1
# CHECK-NEXT: [1,2]     D==========eeER..   crc32b	w0, w0, w1
# CHECK-NEXT: [1,3]     D============eeER   crc32b	w0, w0, w0

# CHECK:      Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage

# CHECK:            [0]    [1]    [2]    [3]
# CHECK-NEXT: 0.     2     4.5    0.5    0.0       mul	w0, w0, w0
# CHECK-NEXT: 1.     2     6.5    0.0    0.0       crc32b	w0, w0, w1
# CHECK-NEXT: 2.     2     7.5    0.0    0.0       crc32b	w0, w0, w1
# CHECK-NEXT: 3.     2     9.5    0.0    0.0       crc32b	w0, w0, w0
# CHECK-NEXT:        2     7.0    0.1    0.0       <total>

# CHECK:      [18] Code Region - Z saba

# CHECK:      Iterations:        100
# CHECK-NEXT: Instructions:      400
# CHECK-NEXT: Total Cycles:      1403
# CHECK-NEXT: Total uOps:        500

# CHECK:      Dispatch Width:    16
# CHECK-NEXT: uOps Per Cycle:    0.36
# CHECK-NEXT: IPC:               0.29
# CHECK-NEXT: Block RThroughput: 1.5

# CHECK:      Timeline view:
# CHECK-NEXT:                     0123456789          0
# CHECK-NEXT: Index     0123456789          0123456789

# CHECK:      [0,0]     DeeeeeER  .    .    .    .    .   mul	z0.d, z0.d, z0.d
# CHECK-NEXT: [0,1]     D=====eeeeER   .    .    .    .   saba	z0.d, z1.d, z2.d
# CHECK-NEXT: [0,2]     D======eeeeER  .    .    .    .   saba	z0.d, z1.d, z2.d
# CHECK-NEXT: [0,3]     D==========eeeeER   .    .    .   saba	z0.d, z0.d, z1.d
# CHECK-NEXT: [1,0]     D==============eeeeeER   .    .   mul	z0.d, z0.d, z0.d
# CHECK-NEXT: [1,1]     D===================eeeeER    .   saba	z0.d, z1.d, z2.d
# CHECK-NEXT: [1,2]     D====================eeeeER   .   saba	z0.d, z1.d, z2.d
# CHECK-NEXT: [1,3]     D========================eeeeER   saba	z0.d, z0.d, z1.d

# CHECK:      Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage

# CHECK:            [0]    [1]    [2]    [3]
# CHECK-NEXT: 0.     2     8.0    0.5    0.0       mul	z0.d, z0.d, z0.d
# CHECK-NEXT: 1.     2     13.0   0.0    0.0       saba	z0.d, z1.d, z2.d
# CHECK-NEXT: 2.     2     14.0   0.0    0.0       saba	z0.d, z1.d, z2.d
# CHECK-NEXT: 3.     2     18.0   0.0    0.0       saba	z0.d, z0.d, z1.d
# CHECK-NEXT:        2     13.3   0.1    0.0       <total>

# CHECK:      [19] Code Region - Z sadalp

# CHECK:      Iterations:        100
# CHECK-NEXT: Instructions:      400
# CHECK-NEXT: Total Cycles:      1403
# CHECK-NEXT: Total uOps:        500

# CHECK:      Dispatch Width:    16
# CHECK-NEXT: uOps Per Cycle:    0.36
# CHECK-NEXT: IPC:               0.29
# CHECK-NEXT: Block RThroughput: 1.5

# CHECK:      Timeline view:
# CHECK-NEXT:                     0123456789          0
# CHECK-NEXT: Index     0123456789          0123456789

# CHECK:      [0,0]     DeeeeeER  .    .    .    .    .   mul	z0.d, z0.d, z0.d
# CHECK-NEXT: [0,1]     D=====eeeeER   .    .    .    .   sadalp	z0.d, p0/m, z1.s
# CHECK-NEXT: [0,2]     D======eeeeER  .    .    .    .   sadalp	z0.d, p0/m, z1.s
# CHECK-NEXT: [0,3]     D==========eeeeER   .    .    .   sadalp	z0.d, p0/m, z0.s
# CHECK-NEXT: [1,0]     D==============eeeeeER   .    .   mul	z0.d, z0.d, z0.d
# CHECK-NEXT: [1,1]     D===================eeeeER    .   sadalp	z0.d, p0/m, z1.s
# CHECK-NEXT: [1,2]     D====================eeeeER   .   sadalp	z0.d, p0/m, z1.s
# CHECK-NEXT: [1,3]     D========================eeeeER   sadalp	z0.d, p0/m, z0.s

# CHECK:      Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage

# CHECK:            [0]    [1]    [2]    [3]
# CHECK-NEXT: 0.     2     8.0    0.5    0.0       mul	z0.d, z0.d, z0.d
# CHECK-NEXT: 1.     2     13.0   0.0    0.0       sadalp	z0.d, p0/m, z1.s
# CHECK-NEXT: 2.     2     14.0   0.0    0.0       sadalp	z0.d, p0/m, z1.s
# CHECK-NEXT: 3.     2     18.0   0.0    0.0       sadalp	z0.d, p0/m, z0.s
# CHECK-NEXT:        2     13.3   0.1    0.0       <total>

# CHECK:      [20] Code Region - Z ssra

# CHECK:      Iterations:        100
# CHECK-NEXT: Instructions:      400
# CHECK-NEXT: Total Cycles:      1403
# CHECK-NEXT: Total uOps:        500

# CHECK:      Dispatch Width:    16
# CHECK-NEXT: uOps Per Cycle:    0.36
# CHECK-NEXT: IPC:               0.29
# CHECK-NEXT: Block RThroughput: 1.5

# CHECK:      Timeline view:
# CHECK-NEXT:                     0123456789          0
# CHECK-NEXT: Index     0123456789          0123456789

# CHECK:      [0,0]     DeeeeeER  .    .    .    .    .   mul	z0.d, z0.d, z0.d
# CHECK-NEXT: [0,1]     D=====eeeeER   .    .    .    .   ssra	z0.d, z1.d, #1
# CHECK-NEXT: [0,2]     D======eeeeER  .    .    .    .   ssra	z0.d, z1.d, #1
# CHECK-NEXT: [0,3]     D==========eeeeER   .    .    .   ssra	z0.d, z0.d, #1
# CHECK-NEXT: [1,0]     D==============eeeeeER   .    .   mul	z0.d, z0.d, z0.d
# CHECK-NEXT: [1,1]     D===================eeeeER    .   ssra	z0.d, z1.d, #1
# CHECK-NEXT: [1,2]     D====================eeeeER   .   ssra	z0.d, z1.d, #1
# CHECK-NEXT: [1,3]     D========================eeeeER   ssra	z0.d, z0.d, #1

# CHECK:      Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage

# CHECK:            [0]    [1]    [2]    [3]
# CHECK-NEXT: 0.     2     8.0    0.5    0.0       mul	z0.d, z0.d, z0.d
# CHECK-NEXT: 1.     2     13.0   0.0    0.0       ssra	z0.d, z1.d, #1
# CHECK-NEXT: 2.     2     14.0   0.0    0.0       ssra	z0.d, z1.d, #1
# CHECK-NEXT: 3.     2     18.0   0.0    0.0       ssra	z0.d, z0.d, #1
# CHECK-NEXT:        2     13.3   0.1    0.0       <total>

# CHECK:      [21] Code Region - Z cdot.s

# CHECK:      Iterations:        100
# CHECK-NEXT: Instructions:      400
# CHECK-NEXT: Total Cycles:      1203
# CHECK-NEXT: Total uOps:        500

# CHECK:      Dispatch Width:    16
# CHECK-NEXT: uOps Per Cycle:    0.42
# CHECK-NEXT: IPC:               0.33
# CHECK-NEXT: Block RThroughput: 1.0

# CHECK:      Timeline view:
# CHECK-NEXT:                     0123456789
# CHECK-NEXT: Index     0123456789          0123456

# CHECK:      [0,0]     DeeeeeER  .    .    .    ..   mul	z0.d, z0.d, z0.d
# CHECK-NEXT: [0,1]     D=====eeeER    .    .    ..   cdot	z0.s, z1.b, z2.b, #90
# CHECK-NEXT: [0,2]     D======eeeER   .    .    ..   cdot	z0.s, z1.b, z2.b, #90
# CHECK-NEXT: [0,3]     D=========eeeER.    .    ..   cdot	z0.s, z0.b, z1.b, #90
# CHECK-NEXT: [1,0]     D============eeeeeER.    ..   mul	z0.d, z0.d, z0.d
# CHECK-NEXT: [1,1]     D=================eeeER  ..   cdot	z0.s, z1.b, z2.b, #90
# CHECK-NEXT: [1,2]     D==================eeeER ..   cdot	z0.s, z1.b, z2.b, #90
# CHECK-NEXT: [1,3]     D=====================eeeER   cdot	z0.s, z0.b, z1.b, #90

# CHECK:      Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage

# CHECK:            [0]    [1]    [2]    [3]
# CHECK-NEXT: 0.     2     7.0    0.5    0.0       mul	z0.d, z0.d, z0.d
# CHECK-NEXT: 1.     2     12.0   0.0    0.0       cdot	z0.s, z1.b, z2.b, #90
# CHECK-NEXT: 2.     2     13.0   0.0    0.0       cdot	z0.s, z1.b, z2.b, #90
# CHECK-NEXT: 3.     2     16.0   0.0    0.0       cdot	z0.s, z0.b, z1.b, #90
# CHECK-NEXT:        2     12.0   0.1    0.0       <total>

# CHECK:      [22] Code Region - Z cdot.d

# CHECK:      Iterations:        100
# CHECK-NEXT: Instructions:      400
# CHECK-NEXT: Total Cycles:      1403
# CHECK-NEXT: Total uOps:        500

# CHECK:      Dispatch Width:    16
# CHECK-NEXT: uOps Per Cycle:    0.36
# CHECK-NEXT: IPC:               0.29
# CHECK-NEXT: Block RThroughput: 2.5

# CHECK:      Timeline view:
# CHECK-NEXT:                     0123456789          0
# CHECK-NEXT: Index     0123456789          0123456789

# CHECK:      [0,0]     DeeeeeER  .    .    .    .    .   mul	z0.d, z0.d, z0.d
# CHECK-NEXT: [0,1]     D=====eeeeER   .    .    .    .   cdot	z0.d, z1.h, z2.h, #90
# CHECK-NEXT: [0,2]     D======eeeeER  .    .    .    .   cdot	z0.d, z1.h, z2.h, #90
# CHECK-NEXT: [0,3]     D==========eeeeER   .    .    .   cdot	z0.d, z0.h, z1.h, #90
# CHECK-NEXT: [1,0]     D==============eeeeeER   .    .   mul	z0.d, z0.d, z0.d
# CHECK-NEXT: [1,1]     D===================eeeeER    .   cdot	z0.d, z1.h, z2.h, #90
# CHECK-NEXT: [1,2]     D====================eeeeER   .   cdot	z0.d, z1.h, z2.h, #90
# CHECK-NEXT: [1,3]     D========================eeeeER   cdot	z0.d, z0.h, z1.h, #90

# CHECK:      Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage

# CHECK:            [0]    [1]    [2]    [3]
# CHECK-NEXT: 0.     2     8.0    0.5    0.0       mul	z0.d, z0.d, z0.d
# CHECK-NEXT: 1.     2     13.0   0.0    0.0       cdot	z0.d, z1.h, z2.h, #90
# CHECK-NEXT: 2.     2     14.0   0.0    0.0       cdot	z0.d, z1.h, z2.h, #90
# CHECK-NEXT: 3.     2     18.0   0.0    0.0       cdot	z0.d, z0.h, z1.h, #90
# CHECK-NEXT:        2     13.3   0.1    0.0       <total>

# CHECK:      [23] Code Region - Z cmla.b

# CHECK:      Iterations:        100
# CHECK-NEXT: Instructions:      400
# CHECK-NEXT: Total Cycles:      1403
# CHECK-NEXT: Total uOps:        500

# CHECK:      Dispatch Width:    16
# CHECK-NEXT: uOps Per Cycle:    0.36
# CHECK-NEXT: IPC:               0.29
# CHECK-NEXT: Block RThroughput: 2.5

# CHECK:      Timeline view:
# CHECK-NEXT:                     0123456789          0
# CHECK-NEXT: Index     0123456789          0123456789

# CHECK:      [0,0]     DeeeeeER  .    .    .    .    .   mul	z0.d, z0.d, z0.d
# CHECK-NEXT: [0,1]     D=====eeeeER   .    .    .    .   cmla	z0.b, z1.b, z2.b, #90
# CHECK-NEXT: [0,2]     D======eeeeER  .    .    .    .   cmla	z0.b, z1.b, z2.b, #90
# CHECK-NEXT: [0,3]     D==========eeeeER   .    .    .   cmla	z0.b, z0.b, z1.b, #90
# CHECK-NEXT: [1,0]     D==============eeeeeER   .    .   mul	z0.d, z0.d, z0.d
# CHECK-NEXT: [1,1]     D===================eeeeER    .   cmla	z0.b, z1.b, z2.b, #90
# CHECK-NEXT: [1,2]     D====================eeeeER   .   cmla	z0.b, z1.b, z2.b, #90
# CHECK-NEXT: [1,3]     D========================eeeeER   cmla	z0.b, z0.b, z1.b, #90

# CHECK:      Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage

# CHECK:            [0]    [1]    [2]    [3]
# CHECK-NEXT: 0.     2     8.0    0.5    0.0       mul	z0.d, z0.d, z0.d
# CHECK-NEXT: 1.     2     13.0   0.0    0.0       cmla	z0.b, z1.b, z2.b, #90
# CHECK-NEXT: 2.     2     14.0   0.0    0.0       cmla	z0.b, z1.b, z2.b, #90
# CHECK-NEXT: 3.     2     18.0   0.0    0.0       cmla	z0.b, z0.b, z1.b, #90
# CHECK-NEXT:        2     13.3   0.1    0.0       <total>

# CHECK:      [24] Code Region - Z cmla.d

# CHECK:      Iterations:        100
# CHECK-NEXT: Instructions:      400
# CHECK-NEXT: Total Cycles:      1803
# CHECK-NEXT: Total uOps:        500

# CHECK:      Dispatch Width:    16
# CHECK-NEXT: uOps Per Cycle:    0.28
# CHECK-NEXT: IPC:               0.22
# CHECK-NEXT: Block RThroughput: 4.0

# CHECK:      Timeline view:
# CHECK-NEXT:                     0123456789          012345678
# CHECK-NEXT: Index     0123456789          0123456789

# CHECK:      [0,0]     DeeeeeER  .    .    .    .    .    .  .   mul	z0.d, z0.d, z0.d
# CHECK-NEXT: [0,1]     D=====eeeeeER  .    .    .    .    .  .   cmla	z0.d, z1.d, z2.d, #90
# CHECK-NEXT: [0,2]     D========eeeeeER    .    .    .    .  .   cmla	z0.d, z1.d, z2.d, #90
# CHECK-NEXT: [0,3]     D=============eeeeeER    .    .    .  .   cmla	z0.d, z0.d, z1.d, #90
# CHECK-NEXT: [1,0]     D==================eeeeeER    .    .  .   mul	z0.d, z0.d, z0.d
# CHECK-NEXT: [1,1]     D=======================eeeeeER    .  .   cmla	z0.d, z1.d, z2.d, #90
# CHECK-NEXT: [1,2]     D==========================eeeeeER .  .   cmla	z0.d, z1.d, z2.d, #90
# CHECK-NEXT: [1,3]     D===============================eeeeeER   cmla	z0.d, z0.d, z1.d, #90

# CHECK:      Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage

# CHECK:            [0]    [1]    [2]    [3]
# CHECK-NEXT: 0.     2     10.0   0.5    0.0       mul	z0.d, z0.d, z0.d
# CHECK-NEXT: 1.     2     15.0   0.0    0.0       cmla	z0.d, z1.d, z2.d, #90
# CHECK-NEXT: 2.     2     18.0   0.0    0.0       cmla	z0.d, z1.d, z2.d, #90
# CHECK-NEXT: 3.     2     23.0   0.0    0.0       cmla	z0.d, z0.d, z1.d, #90
# CHECK-NEXT:        2     16.5   0.1    0.0       <total>

# CHECK:      [25] Code Region - Z sdot.s

# CHECK:      Iterations:        100
# CHECK-NEXT: Instructions:      400
# CHECK-NEXT: Total Cycles:      1203
# CHECK-NEXT: Total uOps:        500

# CHECK:      Dispatch Width:    16
# CHECK-NEXT: uOps Per Cycle:    0.42
# CHECK-NEXT: IPC:               0.33
# CHECK-NEXT: Block RThroughput: 1.0

# CHECK:      Timeline view:
# CHECK-NEXT:                     0123456789
# CHECK-NEXT: Index     0123456789          0123456

# CHECK:      [0,0]     DeeeeeER  .    .    .    ..   mul	z0.d, z0.d, z0.d
# CHECK-NEXT: [0,1]     D=====eeeER    .    .    ..   sdot	z0.s, z1.b, z2.b
# CHECK-NEXT: [0,2]     D======eeeER   .    .    ..   sdot	z0.s, z1.b, z2.b
# CHECK-NEXT: [0,3]     D=========eeeER.    .    ..   sdot	z0.s, z0.b, z1.b
# CHECK-NEXT: [1,0]     D============eeeeeER.    ..   mul	z0.d, z0.d, z0.d
# CHECK-NEXT: [1,1]     D=================eeeER  ..   sdot	z0.s, z1.b, z2.b
# CHECK-NEXT: [1,2]     D==================eeeER ..   sdot	z0.s, z1.b, z2.b
# CHECK-NEXT: [1,3]     D=====================eeeER   sdot	z0.s, z0.b, z1.b

# CHECK:      Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage

# CHECK:            [0]    [1]    [2]    [3]
# CHECK-NEXT: 0.     2     7.0    0.5    0.0       mul	z0.d, z0.d, z0.d
# CHECK-NEXT: 1.     2     12.0   0.0    0.0       sdot	z0.s, z1.b, z2.b
# CHECK-NEXT: 2.     2     13.0   0.0    0.0       sdot	z0.s, z1.b, z2.b
# CHECK-NEXT: 3.     2     16.0   0.0    0.0       sdot	z0.s, z0.b, z1.b
# CHECK-NEXT:        2     12.0   0.1    0.0       <total>

# CHECK:      [26] Code Region - Z sudot

# CHECK:      Iterations:        100
# CHECK-NEXT: Instructions:      400
# CHECK-NEXT: Total Cycles:      1203
# CHECK-NEXT: Total uOps:        500

# CHECK:      Dispatch Width:    16
# CHECK-NEXT: uOps Per Cycle:    0.42
# CHECK-NEXT: IPC:               0.33
# CHECK-NEXT: Block RThroughput: 1.0

# CHECK:      Timeline view:
# CHECK-NEXT:                     0123456789
# CHECK-NEXT: Index     0123456789          0123456

# CHECK:      [0,0]     DeeeeeER  .    .    .    ..   mul	z0.d, z0.d, z0.d
# CHECK-NEXT: [0,1]     D=====eeeER    .    .    ..   sdot	z0.s, z1.b, z2.b[1]
# CHECK-NEXT: [0,2]     D======eeeER   .    .    ..   sdot	z0.s, z1.b, z2.b[1]
# CHECK-NEXT: [0,3]     D=========eeeER.    .    ..   sdot	z0.s, z0.b, z1.b[1]
# CHECK-NEXT: [1,0]     D============eeeeeER.    ..   mul	z0.d, z0.d, z0.d
# CHECK-NEXT: [1,1]     D=================eeeER  ..   sdot	z0.s, z1.b, z2.b[1]
# CHECK-NEXT: [1,2]     D==================eeeER ..   sdot	z0.s, z1.b, z2.b[1]
# CHECK-NEXT: [1,3]     D=====================eeeER   sdot	z0.s, z0.b, z1.b[1]

# CHECK:      Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage

# CHECK:            [0]    [1]    [2]    [3]
# CHECK-NEXT: 0.     2     7.0    0.5    0.0       mul	z0.d, z0.d, z0.d
# CHECK-NEXT: 1.     2     12.0   0.0    0.0       sdot	z0.s, z1.b, z2.b[1]
# CHECK-NEXT: 2.     2     13.0   0.0    0.0       sdot	z0.s, z1.b, z2.b[1]
# CHECK-NEXT: 3.     2     16.0   0.0    0.0       sdot	z0.s, z0.b, z1.b[1]
# CHECK-NEXT:        2     12.0   0.1    0.0       <total>

# CHECK:      [27] Code Region - Z sdot.d

# CHECK:      Iterations:        100
# CHECK-NEXT: Instructions:      400
# CHECK-NEXT: Total Cycles:      1403
# CHECK-NEXT: Total uOps:        500

# CHECK:      Dispatch Width:    16
# CHECK-NEXT: uOps Per Cycle:    0.36
# CHECK-NEXT: IPC:               0.29
# CHECK-NEXT: Block RThroughput: 2.5

# CHECK:      Timeline view:
# CHECK-NEXT:                     0123456789          0
# CHECK-NEXT: Index     0123456789          0123456789

# CHECK:      [0,0]     DeeeeeER  .    .    .    .    .   mul	z0.d, z0.d, z0.d
# CHECK-NEXT: [0,1]     D=====eeeeER   .    .    .    .   sdot	z0.d, z1.h, z2.h
# CHECK-NEXT: [0,2]     D======eeeeER  .    .    .    .   sdot	z0.d, z1.h, z2.h
# CHECK-NEXT: [0,3]     D==========eeeeER   .    .    .   sdot	z0.d, z0.h, z1.h
# CHECK-NEXT: [1,0]     D==============eeeeeER   .    .   mul	z0.d, z0.d, z0.d
# CHECK-NEXT: [1,1]     D===================eeeeER    .   sdot	z0.d, z1.h, z2.h
# CHECK-NEXT: [1,2]     D====================eeeeER   .   sdot	z0.d, z1.h, z2.h
# CHECK-NEXT: [1,3]     D========================eeeeER   sdot	z0.d, z0.h, z1.h

# CHECK:      Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage

# CHECK:            [0]    [1]    [2]    [3]
# CHECK-NEXT: 0.     2     8.0    0.5    0.0       mul	z0.d, z0.d, z0.d
# CHECK-NEXT: 1.     2     13.0   0.0    0.0       sdot	z0.d, z1.h, z2.h
# CHECK-NEXT: 2.     2     14.0   0.0    0.0       sdot	z0.d, z1.h, z2.h
# CHECK-NEXT: 3.     2     18.0   0.0    0.0       sdot	z0.d, z0.h, z1.h
# CHECK-NEXT:        2     13.3   0.1    0.0       <total>

# CHECK:      [28] Code Region - Z smmla

# CHECK:      Iterations:        100
# CHECK-NEXT: Instructions:      400
# CHECK-NEXT: Total Cycles:      1103
# CHECK-NEXT: Total uOps:        400

# CHECK:      Dispatch Width:    16
# CHECK-NEXT: uOps Per Cycle:    0.36
# CHECK-NEXT: IPC:               0.36
# CHECK-NEXT: Block RThroughput: 0.8

# CHECK:      Timeline view:
# CHECK-NEXT:                     0123456789
# CHECK-NEXT: Index     0123456789          01234

# CHECK:      [0,0]     DeeeeER   .    .    .   .   mul	z0.s, z0.s, z0.s
# CHECK-NEXT: [0,1]     D====eeeER.    .    .   .   smmla	z0.s, z1.b, z2.b
# CHECK-NEXT: [0,2]     D=====eeeER    .    .   .   smmla	z0.s, z1.b, z2.b
# CHECK-NEXT: [0,3]     D========eeeER .    .   .   smmla	z0.s, z0.b, z1.b
# CHECK-NEXT: [1,0]     D===========eeeeER  .   .   mul	z0.s, z0.s, z0.s
# CHECK-NEXT: [1,1]     D===============eeeER   .   smmla	z0.s, z1.b, z2.b
# CHECK-NEXT: [1,2]     D================eeeER  .   smmla	z0.s, z1.b, z2.b
# CHECK-NEXT: [1,3]     D===================eeeER   smmla	z0.s, z0.b, z1.b

# CHECK:      Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage

# CHECK:            [0]    [1]    [2]    [3]
# CHECK-NEXT: 0.     2     6.5    0.5    0.0       mul	z0.s, z0.s, z0.s
# CHECK-NEXT: 1.     2     10.5   0.0    0.0       smmla	z0.s, z1.b, z2.b
# CHECK-NEXT: 2.     2     11.5   0.0    0.0       smmla	z0.s, z1.b, z2.b
# CHECK-NEXT: 3.     2     14.5   0.0    0.0       smmla	z0.s, z0.b, z1.b
# CHECK-NEXT:        2     10.8   0.1    0.0       <total>

# CHECK:      [29] Code Region - Z mla.b

# CHECK:      Iterations:        100
# CHECK-NEXT: Instructions:      400
# CHECK-NEXT: Total Cycles:      1403
# CHECK-NEXT: Total uOps:        500

# CHECK:      Dispatch Width:    16
# CHECK-NEXT: uOps Per Cycle:    0.36
# CHECK-NEXT: IPC:               0.29
# CHECK-NEXT: Block RThroughput: 4.0

# CHECK:      Timeline view:
# CHECK-NEXT:                     0123456789          0
# CHECK-NEXT: Index     0123456789          0123456789

# CHECK:      [0,0]     DeeeeeER  .    .    .    .    .   mul	z0.d, z0.d, z0.d
# CHECK-NEXT: [0,1]     D=====eeeeER   .    .    .    .   mla	z0.b, p0/m, z1.b, z2.b
# CHECK-NEXT: [0,2]     D======eeeeER  .    .    .    .   mla	z0.b, p0/m, z1.b, z2.b
# CHECK-NEXT: [0,3]     D==========eeeeER   .    .    .   mla	z0.b, p0/m, z0.b, z1.b
# CHECK-NEXT: [1,0]     D==============eeeeeER   .    .   mul	z0.d, z0.d, z0.d
# CHECK-NEXT: [1,1]     D===================eeeeER    .   mla	z0.b, p0/m, z1.b, z2.b
# CHECK-NEXT: [1,2]     D====================eeeeER   .   mla	z0.b, p0/m, z1.b, z2.b
# CHECK-NEXT: [1,3]     D========================eeeeER   mla	z0.b, p0/m, z0.b, z1.b

# CHECK:      Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage

# CHECK:            [0]    [1]    [2]    [3]
# CHECK-NEXT: 0.     2     8.0    0.5    0.0       mul	z0.d, z0.d, z0.d
# CHECK-NEXT: 1.     2     13.0   0.0    0.0       mla	z0.b, p0/m, z1.b, z2.b
# CHECK-NEXT: 2.     2     14.0   0.0    0.0       mla	z0.b, p0/m, z1.b, z2.b
# CHECK-NEXT: 3.     2     18.0   0.0    0.0       mla	z0.b, p0/m, z0.b, z1.b
# CHECK-NEXT:        2     13.3   0.1    0.0       <total>

# CHECK:      [30] Code Region - Z mla.d

# CHECK:      Iterations:        100
# CHECK-NEXT: Instructions:      400
# CHECK-NEXT: Total Cycles:      1803
# CHECK-NEXT: Total uOps:        500

# CHECK:      Dispatch Width:    16
# CHECK-NEXT: uOps Per Cycle:    0.28
# CHECK-NEXT: IPC:               0.22
# CHECK-NEXT: Block RThroughput: 4.0

# CHECK:      Timeline view:
# CHECK-NEXT:                     0123456789          012345678
# CHECK-NEXT: Index     0123456789          0123456789

# CHECK:      [0,0]     DeeeeeER  .    .    .    .    .    .  .   mul	z0.d, z0.d, z0.d
# CHECK-NEXT: [0,1]     D=====eeeeeER  .    .    .    .    .  .   mla	z0.d, p0/m, z1.d, z2.d
# CHECK-NEXT: [0,2]     D========eeeeeER    .    .    .    .  .   mla	z0.d, p0/m, z1.d, z2.d
# CHECK-NEXT: [0,3]     D=============eeeeeER    .    .    .  .   mla	z0.d, p0/m, z0.d, z1.d
# CHECK-NEXT: [1,0]     D==================eeeeeER    .    .  .   mul	z0.d, z0.d, z0.d
# CHECK-NEXT: [1,1]     D=======================eeeeeER    .  .   mla	z0.d, p0/m, z1.d, z2.d
# CHECK-NEXT: [1,2]     D==========================eeeeeER .  .   mla	z0.d, p0/m, z1.d, z2.d
# CHECK-NEXT: [1,3]     D===============================eeeeeER   mla	z0.d, p0/m, z0.d, z1.d

# CHECK:      Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage

# CHECK:            [0]    [1]    [2]    [3]
# CHECK-NEXT: 0.     2     10.0   0.5    0.0       mul	z0.d, z0.d, z0.d
# CHECK-NEXT: 1.     2     15.0   0.0    0.0       mla	z0.d, p0/m, z1.d, z2.d
# CHECK-NEXT: 2.     2     18.0   0.0    0.0       mla	z0.d, p0/m, z1.d, z2.d
# CHECK-NEXT: 3.     2     23.0   0.0    0.0       mla	z0.d, p0/m, z0.d, z1.d
# CHECK-NEXT:        2     16.5   0.1    0.0       <total>

# CHECK:      [31] Code Region - Z smlalb

# CHECK:      Iterations:        100
# CHECK-NEXT: Instructions:      400
# CHECK-NEXT: Total Cycles:      1403
# CHECK-NEXT: Total uOps:        500

# CHECK:      Dispatch Width:    16
# CHECK-NEXT: uOps Per Cycle:    0.36
# CHECK-NEXT: IPC:               0.29
# CHECK-NEXT: Block RThroughput: 2.5

# CHECK:      Timeline view:
# CHECK-NEXT:                     0123456789          0
# CHECK-NEXT: Index     0123456789          0123456789

# CHECK:      [0,0]     DeeeeeER  .    .    .    .    .   mul	z0.d, z0.d, z0.d
# CHECK-NEXT: [0,1]     D=====eeeeER   .    .    .    .   smlalb	z0.d, z1.s, z2.s
# CHECK-NEXT: [0,2]     D======eeeeER  .    .    .    .   smlalb	z0.d, z1.s, z2.s
# CHECK-NEXT: [0,3]     D==========eeeeER   .    .    .   smlalb	z0.d, z0.s, z1.s
# CHECK-NEXT: [1,0]     D==============eeeeeER   .    .   mul	z0.d, z0.d, z0.d
# CHECK-NEXT: [1,1]     D===================eeeeER    .   smlalb	z0.d, z1.s, z2.s
# CHECK-NEXT: [1,2]     D====================eeeeER   .   smlalb	z0.d, z1.s, z2.s
# CHECK-NEXT: [1,3]     D========================eeeeER   smlalb	z0.d, z0.s, z1.s

# CHECK:      Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage

# CHECK:            [0]    [1]    [2]    [3]
# CHECK-NEXT: 0.     2     8.0    0.5    0.0       mul	z0.d, z0.d, z0.d
# CHECK-NEXT: 1.     2     13.0   0.0    0.0       smlalb	z0.d, z1.s, z2.s
# CHECK-NEXT: 2.     2     14.0   0.0    0.0       smlalb	z0.d, z1.s, z2.s
# CHECK-NEXT: 3.     2     18.0   0.0    0.0       smlalb	z0.d, z0.s, z1.s
# CHECK-NEXT:        2     13.3   0.1    0.0       <total>

# CHECK:      [32] Code Region - Z sqdmlalb

# CHECK:      Iterations:        100
# CHECK-NEXT: Instructions:      400
# CHECK-NEXT: Total Cycles:      1503
# CHECK-NEXT: Total uOps:        500

# CHECK:      Dispatch Width:    16
# CHECK-NEXT: uOps Per Cycle:    0.33
# CHECK-NEXT: IPC:               0.27
# CHECK-NEXT: Block RThroughput: 2.5

# CHECK:      Timeline view:
# CHECK-NEXT:                     0123456789          012
# CHECK-NEXT: Index     0123456789          0123456789

# CHECK:      [0,0]     DeeeeeER  .    .    .    .    . .   mul	z0.d, z0.d, z0.d
# CHECK-NEXT: [0,1]     D=====eeeeER   .    .    .    . .   sqdmlalb	z0.d, z1.s, z2.s
# CHECK-NEXT: [0,2]     D=======eeeeER .    .    .    . .   sqdmlalb	z0.d, z1.s, z2.s
# CHECK-NEXT: [0,3]     D===========eeeeER  .    .    . .   sqdmlalb	z0.d, z0.s, z1.s
# CHECK-NEXT: [1,0]     D===============eeeeeER  .    . .   mul	z0.d, z0.d, z0.d
# CHECK-NEXT: [1,1]     D====================eeeeER   . .   sqdmlalb	z0.d, z1.s, z2.s
# CHECK-NEXT: [1,2]     D======================eeeeER . .   sqdmlalb	z0.d, z1.s, z2.s
# CHECK-NEXT: [1,3]     D==========================eeeeER   sqdmlalb	z0.d, z0.s, z1.s

# CHECK:      Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage

# CHECK:            [0]    [1]    [2]    [3]
# CHECK-NEXT: 0.     2     8.5    0.5    0.0       mul	z0.d, z0.d, z0.d
# CHECK-NEXT: 1.     2     13.5   0.0    0.0       sqdmlalb	z0.d, z1.s, z2.s
# CHECK-NEXT: 2.     2     15.5   0.0    0.0       sqdmlalb	z0.d, z1.s, z2.s
# CHECK-NEXT: 3.     2     19.5   0.0    0.0       sqdmlalb	z0.d, z0.s, z1.s
# CHECK-NEXT:        2     14.3   0.1    0.0       <total>

# CHECK:      [33] Code Region - Z sqrdmlah.b

# CHECK:      Iterations:        100
# CHECK-NEXT: Instructions:      400
# CHECK-NEXT: Total Cycles:      1503
# CHECK-NEXT: Total uOps:        500

# CHECK:      Dispatch Width:    16
# CHECK-NEXT: uOps Per Cycle:    0.33
# CHECK-NEXT: IPC:               0.27
# CHECK-NEXT: Block RThroughput: 2.5

# CHECK:      Timeline view:
# CHECK-NEXT:                     0123456789          012
# CHECK-NEXT: Index     0123456789          0123456789

# CHECK:      [0,0]     DeeeeeER  .    .    .    .    . .   mul	z0.d, z0.d, z0.d
# CHECK-NEXT: [0,1]     D=====eeeeER   .    .    .    . .   sqrdmlah	z0.b, z1.b, z2.b
# CHECK-NEXT: [0,2]     D=======eeeeER .    .    .    . .   sqrdmlah	z0.b, z1.b, z2.b
# CHECK-NEXT: [0,3]     D===========eeeeER  .    .    . .   sqrdmlah	z0.b, z0.b, z1.b
# CHECK-NEXT: [1,0]     D===============eeeeeER  .    . .   mul	z0.d, z0.d, z0.d
# CHECK-NEXT: [1,1]     D====================eeeeER   . .   sqrdmlah	z0.b, z1.b, z2.b
# CHECK-NEXT: [1,2]     D======================eeeeER . .   sqrdmlah	z0.b, z1.b, z2.b
# CHECK-NEXT: [1,3]     D==========================eeeeER   sqrdmlah	z0.b, z0.b, z1.b

# CHECK:      Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage

# CHECK:            [0]    [1]    [2]    [3]
# CHECK-NEXT: 0.     2     8.5    0.5    0.0       mul	z0.d, z0.d, z0.d
# CHECK-NEXT: 1.     2     13.5   0.0    0.0       sqrdmlah	z0.b, z1.b, z2.b
# CHECK-NEXT: 2.     2     15.5   0.0    0.0       sqrdmlah	z0.b, z1.b, z2.b
# CHECK-NEXT: 3.     2     19.5   0.0    0.0       sqrdmlah	z0.b, z0.b, z1.b
# CHECK-NEXT:        2     14.3   0.1    0.0       <total>

# CHECK:      [34] Code Region - Z sqrdmlah.d

# CHECK:      Iterations:        100
# CHECK-NEXT: Instructions:      400
# CHECK-NEXT: Total Cycles:      1803
# CHECK-NEXT: Total uOps:        500

# CHECK:      Dispatch Width:    16
# CHECK-NEXT: uOps Per Cycle:    0.28
# CHECK-NEXT: IPC:               0.22
# CHECK-NEXT: Block RThroughput: 4.0

# CHECK:      Timeline view:
# CHECK-NEXT:                     0123456789          012345678
# CHECK-NEXT: Index     0123456789          0123456789

# CHECK:      [0,0]     DeeeeeER  .    .    .    .    .    .  .   mul	z0.d, z0.d, z0.d
# CHECK-NEXT: [0,1]     D=====eeeeeER  .    .    .    .    .  .   sqrdmlah	z0.d, z1.d, z2.d
# CHECK-NEXT: [0,2]     D========eeeeeER    .    .    .    .  .   sqrdmlah	z0.d, z1.d, z2.d
# CHECK-NEXT: [0,3]     D=============eeeeeER    .    .    .  .   sqrdmlah	z0.d, z0.d, z1.d
# CHECK-NEXT: [1,0]     D==================eeeeeER    .    .  .   mul	z0.d, z0.d, z0.d
# CHECK-NEXT: [1,1]     D=======================eeeeeER    .  .   sqrdmlah	z0.d, z1.d, z2.d
# CHECK-NEXT: [1,2]     D==========================eeeeeER .  .   sqrdmlah	z0.d, z1.d, z2.d
# CHECK-NEXT: [1,3]     D===============================eeeeeER   sqrdmlah	z0.d, z0.d, z1.d

# CHECK:      Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage

# CHECK:            [0]    [1]    [2]    [3]
# CHECK-NEXT: 0.     2     10.0   0.5    0.0       mul	z0.d, z0.d, z0.d
# CHECK-NEXT: 1.     2     15.0   0.0    0.0       sqrdmlah	z0.d, z1.d, z2.d
# CHECK-NEXT: 2.     2     18.0   0.0    0.0       sqrdmlah	z0.d, z1.d, z2.d
# CHECK-NEXT: 3.     2     23.0   0.0    0.0       sqrdmlah	z0.d, z0.d, z1.d
# CHECK-NEXT:        2     16.5   0.1    0.0       <total>

# CHECK:      [35] Code Region - Z fcmla ZPmZZ

# CHECK:      Iterations:        100
# CHECK-NEXT: Instructions:      400
# CHECK-NEXT: Total Cycles:      1503
# CHECK-NEXT: Total uOps:        400

# CHECK:      Dispatch Width:    16
# CHECK-NEXT: uOps Per Cycle:    0.27
# CHECK-NEXT: IPC:               0.27
# CHECK-NEXT: Block RThroughput: 1.0

# CHECK:      Timeline view:
# CHECK-NEXT:                     0123456789          012
# CHECK-NEXT: Index     0123456789          0123456789

# CHECK:      [0,0]     DeeeER    .    .    .    .    . .   fmul	z0.d, z0.d, z0.d
# CHECK-NEXT: [0,1]     D===eeeeeER    .    .    .    . .   fcmla	z0.d, p0/m, z1.d, z2.d, #90
# CHECK-NEXT: [0,2]     D=====eeeeeER  .    .    .    . .   fcmla	z0.d, p0/m, z1.d, z2.d, #90
# CHECK-NEXT: [0,3]     D==========eeeeeER  .    .    . .   fcmla	z0.d, p0/m, z0.d, z1.d, #90
# CHECK-NEXT: [1,0]     D===============eeeER    .    . .   fmul	z0.d, z0.d, z0.d
# CHECK-NEXT: [1,1]     D==================eeeeeER    . .   fcmla	z0.d, p0/m, z1.d, z2.d, #90
# CHECK-NEXT: [1,2]     D====================eeeeeER  . .   fcmla	z0.d, p0/m, z1.d, z2.d, #90
# CHECK-NEXT: [1,3]     D=========================eeeeeER   fcmla	z0.d, p0/m, z0.d, z1.d, #90

# CHECK:      Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage

# CHECK:            [0]    [1]    [2]    [3]
# CHECK-NEXT: 0.     2     8.5    0.5    0.0       fmul	z0.d, z0.d, z0.d
# CHECK-NEXT: 1.     2     11.5   0.0    0.0       fcmla	z0.d, p0/m, z1.d, z2.d, #90
# CHECK-NEXT: 2.     2     13.5   0.0    0.0       fcmla	z0.d, p0/m, z1.d, z2.d, #90
# CHECK-NEXT: 3.     2     18.5   0.0    0.0       fcmla	z0.d, p0/m, z0.d, z1.d, #90
# CHECK-NEXT:        2     13.0   0.1    0.0       <total>

# CHECK:      [36] Code Region - Z fcmla ZZZI

# CHECK:      Iterations:        100
# CHECK-NEXT: Instructions:      400
# CHECK-NEXT: Total Cycles:      1503
# CHECK-NEXT: Total uOps:        400

# CHECK:      Dispatch Width:    16
# CHECK-NEXT: uOps Per Cycle:    0.27
# CHECK-NEXT: IPC:               0.27
# CHECK-NEXT: Block RThroughput: 1.0

# CHECK:      Timeline view:
# CHECK-NEXT:                     0123456789          012
# CHECK-NEXT: Index     0123456789          0123456789

# CHECK:      [0,0]     DeeeER    .    .    .    .    . .   fmul	z0.d, z0.d, z0.d
# CHECK-NEXT: [0,1]     D===eeeeeER    .    .    .    . .   fcmla	z0.s, z1.s, z2.s[1], #90
# CHECK-NEXT: [0,2]     D=====eeeeeER  .    .    .    . .   fcmla	z0.s, z1.s, z2.s[1], #90
# CHECK-NEXT: [0,3]     D==========eeeeeER  .    .    . .   fcmla	z0.s, z0.s, z1.s[1], #90
# CHECK-NEXT: [1,0]     D===============eeeER    .    . .   fmul	z0.d, z0.d, z0.d
# CHECK-NEXT: [1,1]     D==================eeeeeER    . .   fcmla	z0.s, z1.s, z2.s[1], #90
# CHECK-NEXT: [1,2]     D====================eeeeeER  . .   fcmla	z0.s, z1.s, z2.s[1], #90
# CHECK-NEXT: [1,3]     D=========================eeeeeER   fcmla	z0.s, z0.s, z1.s[1], #90

# CHECK:      Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage

# CHECK:            [0]    [1]    [2]    [3]
# CHECK-NEXT: 0.     2     8.5    0.5    0.0       fmul	z0.d, z0.d, z0.d
# CHECK-NEXT: 1.     2     11.5   0.0    0.0       fcmla	z0.s, z1.s, z2.s[1], #90
# CHECK-NEXT: 2.     2     13.5   0.0    0.0       fcmla	z0.s, z1.s, z2.s[1], #90
# CHECK-NEXT: 3.     2     18.5   0.0    0.0       fcmla	z0.s, z0.s, z1.s[1], #90
# CHECK-NEXT:        2     13.0   0.1    0.0       <total>

# CHECK:      [37] Code Region - Z fmla ZPmZZ

# CHECK:      Iterations:        100
# CHECK-NEXT: Instructions:      400
# CHECK-NEXT: Total Cycles:      1303
# CHECK-NEXT: Total uOps:        400

# CHECK:      Dispatch Width:    16
# CHECK-NEXT: uOps Per Cycle:    0.31
# CHECK-NEXT: IPC:               0.31
# CHECK-NEXT: Block RThroughput: 1.0

# CHECK:      Timeline view:
# CHECK-NEXT:                     0123456789
# CHECK-NEXT: Index     0123456789          012345678

# CHECK:      [0,0]     DeeeER    .    .    .    .  .   fmul	z0.d, z0.d, z0.d
# CHECK-NEXT: [0,1]     D===eeeeER.    .    .    .  .   fmla	z0.d, p0/m, z1.d, z2.d
# CHECK-NEXT: [0,2]     D=====eeeeER   .    .    .  .   fmla	z0.d, p0/m, z1.d, z2.d
# CHECK-NEXT: [0,3]     D=========eeeeER    .    .  .   fmla	z0.d, p0/m, z0.d, z1.d
# CHECK-NEXT: [1,0]     D=============eeeER .    .  .   fmul	z0.d, z0.d, z0.d
# CHECK-NEXT: [1,1]     D================eeeeER  .  .   fmla	z0.d, p0/m, z1.d, z2.d
# CHECK-NEXT: [1,2]     D==================eeeeER.  .   fmla	z0.d, p0/m, z1.d, z2.d
# CHECK-NEXT: [1,3]     D======================eeeeER   fmla	z0.d, p0/m, z0.d, z1.d

# CHECK:      Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage

# CHECK:            [0]    [1]    [2]    [3]
# CHECK-NEXT: 0.     2     7.5    0.5    0.0       fmul	z0.d, z0.d, z0.d
# CHECK-NEXT: 1.     2     10.5   0.0    0.0       fmla	z0.d, p0/m, z1.d, z2.d
# CHECK-NEXT: 2.     2     12.5   0.0    0.0       fmla	z0.d, p0/m, z1.d, z2.d
# CHECK-NEXT: 3.     2     16.5   0.0    0.0       fmla	z0.d, p0/m, z0.d, z1.d
# CHECK-NEXT:        2     11.8   0.1    0.0       <total>

# CHECK:      [38] Code Region - Z fmla ZZZI

# CHECK:      Iterations:        100
# CHECK-NEXT: Instructions:      400
# CHECK-NEXT: Total Cycles:      1303
# CHECK-NEXT: Total uOps:        400

# CHECK:      Dispatch Width:    16
# CHECK-NEXT: uOps Per Cycle:    0.31
# CHECK-NEXT: IPC:               0.31
# CHECK-NEXT: Block RThroughput: 1.0

# CHECK:      Timeline view:
# CHECK-NEXT:                     0123456789
# CHECK-NEXT: Index     0123456789          012345678

# CHECK:      [0,0]     DeeeER    .    .    .    .  .   fmul	z0.d, z0.d, z0.d
# CHECK-NEXT: [0,1]     D===eeeeER.    .    .    .  .   fmla	z0.d, z1.d, z2.d[1]
# CHECK-NEXT: [0,2]     D=====eeeeER   .    .    .  .   fmla	z0.d, z1.d, z2.d[1]
# CHECK-NEXT: [0,3]     D=========eeeeER    .    .  .   fmla	z0.d, z0.d, z1.d[1]
# CHECK-NEXT: [1,0]     D=============eeeER .    .  .   fmul	z0.d, z0.d, z0.d
# CHECK-NEXT: [1,1]     D================eeeeER  .  .   fmla	z0.d, z1.d, z2.d[1]
# CHECK-NEXT: [1,2]     D==================eeeeER.  .   fmla	z0.d, z1.d, z2.d[1]
# CHECK-NEXT: [1,3]     D======================eeeeER   fmla	z0.d, z0.d, z1.d[1]

# CHECK:      Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage

# CHECK:            [0]    [1]    [2]    [3]
# CHECK-NEXT: 0.     2     7.5    0.5    0.0       fmul	z0.d, z0.d, z0.d
# CHECK-NEXT: 1.     2     10.5   0.0    0.0       fmla	z0.d, z1.d, z2.d[1]
# CHECK-NEXT: 2.     2     12.5   0.0    0.0       fmla	z0.d, z1.d, z2.d[1]
# CHECK-NEXT: 3.     2     16.5   0.0    0.0       fmla	z0.d, z0.d, z1.d[1]
# CHECK-NEXT:        2     11.8   0.1    0.0       <total>

# CHECK:      [39] Code Region - Z fmlalb ZZZ

# CHECK:      Iterations:        100
# CHECK-NEXT: Instructions:      400
# CHECK-NEXT: Total Cycles:      1303
# CHECK-NEXT: Total uOps:        400

# CHECK:      Dispatch Width:    16
# CHECK-NEXT: uOps Per Cycle:    0.31
# CHECK-NEXT: IPC:               0.31
# CHECK-NEXT: Block RThroughput: 1.0

# CHECK:      Timeline view:
# CHECK-NEXT:                     0123456789
# CHECK-NEXT: Index     0123456789          012345678

# CHECK:      [0,0]     DeeeER    .    .    .    .  .   fmul	z0.d, z0.d, z0.d
# CHECK-NEXT: [0,1]     D===eeeeER.    .    .    .  .   fmlalb	z0.s, z1.h, z2.h
# CHECK-NEXT: [0,2]     D=====eeeeER   .    .    .  .   fmlalb	z0.s, z1.h, z2.h
# CHECK-NEXT: [0,3]     D=========eeeeER    .    .  .   fmlalb	z0.s, z0.h, z1.h
# CHECK-NEXT: [1,0]     D=============eeeER .    .  .   fmul	z0.d, z0.d, z0.d
# CHECK-NEXT: [1,1]     D================eeeeER  .  .   fmlalb	z0.s, z1.h, z2.h
# CHECK-NEXT: [1,2]     D==================eeeeER.  .   fmlalb	z0.s, z1.h, z2.h
# CHECK-NEXT: [1,3]     D======================eeeeER   fmlalb	z0.s, z0.h, z1.h

# CHECK:      Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage

# CHECK:            [0]    [1]    [2]    [3]
# CHECK-NEXT: 0.     2     7.5    0.5    0.0       fmul	z0.d, z0.d, z0.d
# CHECK-NEXT: 1.     2     10.5   0.0    0.0       fmlalb	z0.s, z1.h, z2.h
# CHECK-NEXT: 2.     2     12.5   0.0    0.0       fmlalb	z0.s, z1.h, z2.h
# CHECK-NEXT: 3.     2     16.5   0.0    0.0       fmlalb	z0.s, z0.h, z1.h
# CHECK-NEXT:        2     11.8   0.1    0.0       <total>

# CHECK:      [40] Code Region - Z bfdot

# CHECK:      Iterations:        100
# CHECK-NEXT: Instructions:      400
# CHECK-NEXT: Total Cycles:      1603
# CHECK-NEXT: Total uOps:        400

# CHECK:      Dispatch Width:    16
# CHECK-NEXT: uOps Per Cycle:    0.25
# CHECK-NEXT: IPC:               0.25
# CHECK-NEXT: Block RThroughput: 1.0

# CHECK:      Timeline view:
# CHECK-NEXT:                     0123456789          01234
# CHECK-NEXT: Index     0123456789          0123456789

# CHECK:      [0,0]     DeeeER    .    .    .    .    .   .   fmul	z0.d, z0.d, z0.d
# CHECK-NEXT: [0,1]     D===eeeeeER    .    .    .    .   .   bfdot	z0.s, z1.h, z2.h
# CHECK-NEXT: [0,2]     D======eeeeeER .    .    .    .   .   bfdot	z0.s, z1.h, z2.h
# CHECK-NEXT: [0,3]     D===========eeeeeER .    .    .   .   bfdot	z0.s, z0.h, z1.h
# CHECK-NEXT: [1,0]     D================eeeER   .    .   .   fmul	z0.d, z0.d, z0.d
# CHECK-NEXT: [1,1]     D===================eeeeeER   .   .   bfdot	z0.s, z1.h, z2.h
# CHECK-NEXT: [1,2]     D======================eeeeeER.   .   bfdot	z0.s, z1.h, z2.h
# CHECK-NEXT: [1,3]     D===========================eeeeeER   bfdot	z0.s, z0.h, z1.h

# CHECK:      Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage

# CHECK:            [0]    [1]    [2]    [3]
# CHECK-NEXT: 0.     2     9.0    0.5    0.0       fmul	z0.d, z0.d, z0.d
# CHECK-NEXT: 1.     2     12.0   0.0    0.0       bfdot	z0.s, z1.h, z2.h
# CHECK-NEXT: 2.     2     15.0   0.0    0.0       bfdot	z0.s, z1.h, z2.h
# CHECK-NEXT: 3.     2     20.0   0.0    0.0       bfdot	z0.s, z0.h, z1.h
# CHECK-NEXT:        2     14.0   0.1    0.0       <total>

# CHECK:      [41] Code Region - Z bfmmla

# CHECK:      Iterations:        100
# CHECK-NEXT: Instructions:      400
# CHECK-NEXT: Total Cycles:      1903
# CHECK-NEXT: Total uOps:        400

# CHECK:      Dispatch Width:    16
# CHECK-NEXT: uOps Per Cycle:    0.21
# CHECK-NEXT: IPC:               0.21
# CHECK-NEXT: Block RThroughput: 1.0

# CHECK:      Timeline view:
# CHECK-NEXT:                     0123456789          0123456789
# CHECK-NEXT: Index     0123456789          0123456789          0

# CHECK:      [0,0]     DeeeER    .    .    .    .    .    .    .   fmul	z0.d, z0.d, z0.d
# CHECK-NEXT: [0,1]     D===eeeeeeER   .    .    .    .    .    .   bfmmla	z0.s, z1.h, z2.h
# CHECK-NEXT: [0,2]     D=======eeeeeeER    .    .    .    .    .   bfmmla	z0.s, z1.h, z2.h
# CHECK-NEXT: [0,3]     D=============eeeeeeER   .    .    .    .   bfmmla	z0.s, z0.h, z1.h
# CHECK-NEXT: [1,0]     D===================eeeER.    .    .    .   fmul	z0.d, z0.d, z0.d
# CHECK-NEXT: [1,1]     D======================eeeeeeER    .    .   bfmmla	z0.s, z1.h, z2.h
# CHECK-NEXT: [1,2]     D==========================eeeeeeER.    .   bfmmla	z0.s, z1.h, z2.h
# CHECK-NEXT: [1,3]     D================================eeeeeeER   bfmmla	z0.s, z0.h, z1.h

# CHECK:      Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage

# CHECK:            [0]    [1]    [2]    [3]
# CHECK-NEXT: 0.     2     10.5   0.5    0.0       fmul	z0.d, z0.d, z0.d
# CHECK-NEXT: 1.     2     13.5   0.0    0.0       bfmmla	z0.s, z1.h, z2.h
# CHECK-NEXT: 2.     2     17.5   0.0    0.0       bfmmla	z0.s, z1.h, z2.h
# CHECK-NEXT: 3.     2     23.5   0.0    0.0       bfmmla	z0.s, z0.h, z1.h
# CHECK-NEXT:        2     16.3   0.1    0.0       <total>

# CHECK:      [42] Code Region - bfmlalb

# CHECK:      Iterations:        100
# CHECK-NEXT: Instructions:      400
# CHECK-NEXT: Total Cycles:      1503
# CHECK-NEXT: Total uOps:        400

# CHECK:      Dispatch Width:    16
# CHECK-NEXT: uOps Per Cycle:    0.27
# CHECK-NEXT: IPC:               0.27
# CHECK-NEXT: Block RThroughput: 1.0

# CHECK:      Timeline view:
# CHECK-NEXT:                     0123456789          012
# CHECK-NEXT: Index     0123456789          0123456789

# CHECK:      [0,0]     DeeeER    .    .    .    .    . .   fmul	z0.d, z0.d, z0.d
# CHECK-NEXT: [0,1]     D===eeeeeER    .    .    .    . .   bfmlalb	z0.s, z1.h, z2.h
# CHECK-NEXT: [0,2]     D=====eeeeeER  .    .    .    . .   bfmlalb	z0.s, z1.h, z2.h
# CHECK-NEXT: [0,3]     D==========eeeeeER  .    .    . .   bfmlalb	z0.s, z0.h, z1.h
# CHECK-NEXT: [1,0]     D===============eeeER    .    . .   fmul	z0.d, z0.d, z0.d
# CHECK-NEXT: [1,1]     D==================eeeeeER    . .   bfmlalb	z0.s, z1.h, z2.h
# CHECK-NEXT: [1,2]     D====================eeeeeER  . .   bfmlalb	z0.s, z1.h, z2.h
# CHECK-NEXT: [1,3]     D=========================eeeeeER   bfmlalb	z0.s, z0.h, z1.h

# CHECK:      Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage

# CHECK:            [0]    [1]    [2]    [3]
# CHECK-NEXT: 0.     2     8.5    0.5    0.0       fmul	z0.d, z0.d, z0.d
# CHECK-NEXT: 1.     2     11.5   0.0    0.0       bfmlalb	z0.s, z1.h, z2.h
# CHECK-NEXT: 2.     2     13.5   0.0    0.0       bfmlalb	z0.s, z1.h, z2.h
# CHECK-NEXT: 3.     2     18.5   0.0    0.0       bfmlalb	z0.s, z0.h, z1.h
# CHECK-NEXT:        2     13.0   0.1    0.0       <total>