llvm/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir

# NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -start-before=machine-scheduler -verify-misched -o - %s | FileCheck -check-prefix=GCN %s

--- |
  define amdgpu_kernel void @largeInterleave() #0 { ret void }
  ; GCN-LABEL: largeInterleave:
  ; GCN:       ; %bb.0:
  ; GCN-NEXT:    ; implicit-def: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
  ; GCN-NEXT:    ; implicit-def: $vgpr0
  ; GCN-NEXT:    ; implicit-def: $vgpr2
  ; GCN-NEXT:    ; implicit-def: $vgpr1
  ; GCN-NEXT:    ; implicit-def: $vgpr8
  ; GCN-NEXT:    ; implicit-def: $vgpr94
  ; GCN-NEXT:    ; implicit-def: $vgpr76_vgpr77_vgpr78_vgpr79
  ; GCN-NEXT:    ; implicit-def: $vgpr106
  ; GCN-NEXT:    ; implicit-def: $vgpr132
  ; GCN-NEXT:    ; implicit-def: $vgpr133
  ; GCN-NEXT:    ; implicit-def: $vgpr139
  ; GCN-NEXT:    ; implicit-def: $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127
  ; GCN-NEXT:    ; iglp_opt mask(0x00000002)
  ; GCN-NEXT:    ; implicit-def: $sgpr0
  ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
  ; GCN-NEXT:    v_readfirstlane_b32 s7, v0
  ; GCN-NEXT:    ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11
  ; GCN-NEXT:    ; kill: killed $sgpr8_sgpr9_sgpr10_sgpr11
  ; GCN-NEXT:    ; implicit-def: $sgpr5
  ; GCN-NEXT:    s_nop 1
  ; GCN-NEXT:    v_lshl_add_u32 v0, s7, 4, v2
  ; GCN-NEXT:    v_mul_lo_u32 v0, v0, s6
  ; GCN-NEXT:    v_add_lshl_u32 v92, v0, v1, 1
  ; GCN-NEXT:    v_add_u32_e32 v93, s0, v92
  ; GCN-NEXT:    buffer_load_dwordx4 v[0:3], v92, s[8:11], 0 offen sc0 sc1
  ; GCN-NEXT:    s_waitcnt vmcnt(0)
  ; GCN-NEXT:    buffer_inv sc0 sc1
  ; GCN-NEXT:    buffer_load_dwordx4 v[4:7], v93, s[8:11], 0 offen sc0 sc1
  ; GCN-NEXT:    s_waitcnt vmcnt(0)
  ; GCN-NEXT:    buffer_inv sc0 sc1
  ; GCN-NEXT:    s_lshl_b32 s0, s7, 7
  ; GCN-NEXT:    v_add_lshl_u32 v95, v8, s0, 1
  ; GCN-NEXT:    v_add_u32_e32 v8, 64, v93
  ; GCN-NEXT:    ; kill: killed $vgpr8
  ; GCN-NEXT:    ; implicit-def: $sgpr0_sgpr1_sgpr2_sgpr3
  ; GCN-NEXT:    ; kill: killed $vgpr92
  ; GCN-NEXT:    ; implicit-def: $sgpr6
  ; GCN-NEXT:    buffer_wbl2 sc0 sc1
  ; GCN-NEXT:    ds_write_b128 v95, v[0:3]
  ; GCN-NEXT:    buffer_wbl2 sc0 sc1
  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
  ; GCN-NEXT:    ds_write_b128 v95, v[4:7] offset:1024
  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
  ; GCN-NEXT:    buffer_load_dwordx4 v[64:67], v92, s[8:11], 0 offen offset:64 sc0 sc1
  ; GCN-NEXT:    s_waitcnt vmcnt(0)
  ; GCN-NEXT:    buffer_inv sc0 sc1
  ; GCN-NEXT:    buffer_load_dwordx4 v[68:71], v8, s[8:11], 0 offen sc0 sc1
  ; GCN-NEXT:    s_waitcnt vmcnt(0)
  ; GCN-NEXT:    buffer_inv sc0 sc1
  ; GCN-NEXT:    ;;#ASMSTART
  ; GCN-NEXT:    s_waitcnt vmcnt(8)
  ; GCN-NEXT:    ;;#ASMEND
  ; GCN-NEXT:    ds_read_b128 v[72:75], v94
  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
  ; GCN-NEXT:    buffer_inv sc0 sc1
  ; GCN-NEXT:    ds_read_b128 v[80:83], v94 offset:512
  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
  ; GCN-NEXT:    buffer_inv sc0 sc1
  ; GCN-NEXT:    ds_read_b128 v[84:87], v94 offset:1024
  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
  ; GCN-NEXT:    buffer_inv sc0 sc1
  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[72:73], v[76:77], 0
  ; GCN-NEXT:    ds_read_b128 v[88:91], v94 offset:1536
  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
  ; GCN-NEXT:    buffer_inv sc0 sc1
  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[74:75], v[78:79], v[48:63]
  ; GCN-NEXT:    ds_read_b128 v[72:75], v106
  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
  ; GCN-NEXT:    buffer_inv sc0 sc1
  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[80:81], v[76:77], 0
  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[84:85], v[76:77], 0
  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[88:89], v[76:77], 0
  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[82:83], v[78:79], v[32:47]
  ; GCN-NEXT:    ds_read_b128 v[80:83], v106 offset:512
  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
  ; GCN-NEXT:    buffer_inv sc0 sc1
  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[86:87], v[78:79], v[16:31]
  ; GCN-NEXT:    ds_read_b128 v[84:87], v106 offset:1024
  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
  ; GCN-NEXT:    buffer_inv sc0 sc1
  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[90:91], v[78:79], v[0:15]
  ; GCN-NEXT:    ; implicit-def: $vgpr76_vgpr77_vgpr78_vgpr79
  ; GCN-NEXT:    ds_read_b128 v[88:91], v106 offset:1536
  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
  ; GCN-NEXT:    buffer_inv sc0 sc1
  ; GCN-NEXT:    ;;#ASMSTART
  ; GCN-NEXT:    s_waitcnt vmcnt(8)
  ; GCN-NEXT:    ;;#ASMEND
  ; GCN-NEXT:    buffer_wbl2 sc0 sc1
  ; GCN-NEXT:    ds_write_b128 v95, v[64:67]
  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[72:73], v[76:77], v[48:63]
  ; GCN-NEXT:    v_add_u32_e32 v72, 0x80, v93
  ; GCN-NEXT:    buffer_wbl2 sc0 sc1
  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
  ; GCN-NEXT:    ds_write_b128 v95, v[68:71] offset:1024
  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
  ; GCN-NEXT:    buffer_load_dwordx4 v[64:67], v92, s[8:11], 0 offen offset:128 sc0 sc1
  ; GCN-NEXT:    s_waitcnt vmcnt(0)
  ; GCN-NEXT:    buffer_inv sc0 sc1
  ; GCN-NEXT:    buffer_load_dwordx4 v[68:71], v72, s[8:11], 0 offen sc0 sc1
  ; GCN-NEXT:    s_waitcnt vmcnt(0)
  ; GCN-NEXT:    buffer_inv sc0 sc1
  ; GCN-NEXT:    ;;#ASMSTART
  ; GCN-NEXT:    s_waitcnt vmcnt(8)
  ; GCN-NEXT:    ;;#ASMEND
  ; GCN-NEXT:    ; kill: killed $vgpr72
  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[74:75], v[78:79], v[48:63]
  ; GCN-NEXT:    ds_read_b128 v[72:75], v94
  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
  ; GCN-NEXT:    buffer_inv sc0 sc1
  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[80:81], v[76:77], v[32:47]
  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[84:85], v[76:77], v[16:31]
  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[88:89], v[76:77], v[0:15]
  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[82:83], v[78:79], v[32:47]
  ; GCN-NEXT:    ds_read_b128 v[80:83], v94 offset:512
  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
  ; GCN-NEXT:    buffer_inv sc0 sc1
  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[86:87], v[78:79], v[16:31]
  ; GCN-NEXT:    ds_read_b128 v[84:87], v94 offset:1024
  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
  ; GCN-NEXT:    buffer_inv sc0 sc1
  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[90:91], v[78:79], v[0:15]
  ; GCN-NEXT:    ; implicit-def: $vgpr76_vgpr77_vgpr78_vgpr79
  ; GCN-NEXT:    ds_read_b128 v[88:91], v94 offset:1536
  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
  ; GCN-NEXT:    buffer_inv sc0 sc1
  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[72:73], v[76:77], v[48:63]
  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[74:75], v[78:79], v[48:63]
  ; GCN-NEXT:    ds_read_b128 v[72:75], v106
  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
  ; GCN-NEXT:    buffer_inv sc0 sc1
  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[80:81], v[76:77], v[32:47]
  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[84:85], v[76:77], v[16:31]
  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[88:89], v[76:77], v[0:15]
  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[82:83], v[78:79], v[32:47]
  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[86:87], v[78:79], v[16:31]
  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[90:91], v[78:79], v[0:15]
  ; GCN-NEXT:    ; implicit-def: $vgpr76_vgpr77_vgpr78_vgpr79
  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[72:73], v[76:77], v[48:63]
  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[74:75], v[78:79], v[48:63]
  ; GCN-NEXT:    ds_read_b128 v[72:75], v106 offset:512
  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
  ; GCN-NEXT:    buffer_inv sc0 sc1
  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[72:73], v[76:77], v[32:47]
  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[74:75], v[78:79], v[32:47]
  ; GCN-NEXT:    ds_read_b128 v[72:75], v106 offset:1024
  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
  ; GCN-NEXT:    buffer_inv sc0 sc1
  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[72:73], v[76:77], v[16:31]
  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[74:75], v[78:79], v[16:31]
  ; GCN-NEXT:    ds_read_b128 v[72:75], v106 offset:1536
  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
  ; GCN-NEXT:    buffer_inv sc0 sc1
  ; GCN-NEXT:    ;;#ASMSTART
  ; GCN-NEXT:    s_waitcnt vmcnt(8)
  ; GCN-NEXT:    ;;#ASMEND
  ; GCN-NEXT:    buffer_wbl2 sc0 sc1
  ; GCN-NEXT:    ds_write_b128 v95, v[64:67]
  ; GCN-NEXT:    buffer_wbl2 sc0 sc1
  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
  ; GCN-NEXT:    ds_write_b128 v95, v[68:71] offset:1024
  ; GCN-NEXT:    ; implicit-def: $vgpr64
  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[72:73], v[76:77], v[0:15]
  ; GCN-NEXT:    v_add_u32_e32 v72, 0xc0, v93
  ; GCN-NEXT:    ; implicit-def: $vgpr73
  ; GCN-NEXT:    v_add_u32_e32 v76, v132, v64
  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
  ; GCN-NEXT:    buffer_load_dwordx4 v[64:67], v92, s[8:11], 0 offen offset:192 sc0 sc1
  ; GCN-NEXT:    s_waitcnt vmcnt(0)
  ; GCN-NEXT:    buffer_inv sc0 sc1
  ; GCN-NEXT:    buffer_load_dwordx4 v[68:71], v72, s[8:11], 0 offen sc0 sc1
  ; GCN-NEXT:    s_waitcnt vmcnt(0)
  ; GCN-NEXT:    buffer_inv sc0 sc1
  ; GCN-NEXT:    ; kill: killed $vgpr72
  ; GCN-NEXT:    v_add_u32_e32 v72, v132, v73
  ; GCN-NEXT:    buffer_load_dwordx2 v[98:99], v76, s[0:3], 0 offen sc0 sc1
  ; GCN-NEXT:    s_waitcnt vmcnt(0)
  ; GCN-NEXT:    buffer_inv sc0 sc1
  ; GCN-NEXT:    buffer_load_dwordx2 v[102:103], v72, s[0:3], 0 offen sc0 sc1
  ; GCN-NEXT:    s_waitcnt vmcnt(0)
  ; GCN-NEXT:    buffer_inv sc0 sc1
  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[74:75], v[78:79], v[0:15]
  ; GCN-NEXT:    ; implicit-def: $vgpr74
  ; GCN-NEXT:    v_add_u32_e32 v72, v132, v74
  ; GCN-NEXT:    ; implicit-def: $vgpr75
  ; GCN-NEXT:    buffer_load_dwordx2 v[100:101], v72, s[0:3], 0 offen sc0 sc1
  ; GCN-NEXT:    s_waitcnt vmcnt(0)
  ; GCN-NEXT:    buffer_inv sc0 sc1
  ; GCN-NEXT:    v_add_u32_e32 v72, v132, v75
  ; GCN-NEXT:    buffer_load_dwordx2 v[104:105], v72, s[0:3], 0 offen sc0 sc1
  ; GCN-NEXT:    s_waitcnt vmcnt(0)
  ; GCN-NEXT:    buffer_inv sc0 sc1
  ; GCN-NEXT:    ;;#ASMSTART
  ; GCN-NEXT:    s_waitcnt vmcnt(8)
  ; GCN-NEXT:    ;;#ASMEND
  ; GCN-NEXT:    ds_read_b128 v[72:75], v94
  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
  ; GCN-NEXT:    buffer_inv sc0 sc1
  ; GCN-NEXT:    ; kill: killed $vgpr76
  ; GCN-NEXT:    ; implicit-def: $vgpr76_vgpr77_vgpr78_vgpr79
  ; GCN-NEXT:    ; implicit-def: $sgpr8
  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[72:73], v[76:77], v[48:63]
  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[74:75], v[78:79], v[48:63]
  ; GCN-NEXT:    ds_read_b128 v[72:75], v94 offset:512
  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
  ; GCN-NEXT:    buffer_inv sc0 sc1
  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[72:73], v[76:77], v[32:47]
  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[74:75], v[78:79], v[32:47]
  ; GCN-NEXT:    ds_read_b128 v[72:75], v94 offset:1024
  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
  ; GCN-NEXT:    buffer_inv sc0 sc1
  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[72:73], v[76:77], v[16:31]
  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[74:75], v[78:79], v[16:31]
  ; GCN-NEXT:    ds_read_b128 v[72:75], v94 offset:1536
  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
  ; GCN-NEXT:    buffer_inv sc0 sc1
  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[72:73], v[76:77], v[0:15]
  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[74:75], v[78:79], v[0:15]
  ; GCN-NEXT:    ds_read_b128 v[72:75], v106
  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
  ; GCN-NEXT:    buffer_inv sc0 sc1
  ; GCN-NEXT:    ; implicit-def: $vgpr76_vgpr77_vgpr78_vgpr79
  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[72:73], v[76:77], v[48:63]
  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[74:75], v[78:79], v[48:63]
  ; GCN-NEXT:    ds_read_b128 v[72:75], v106 offset:512
  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
  ; GCN-NEXT:    buffer_inv sc0 sc1
  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[72:73], v[76:77], v[32:47]
  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[74:75], v[78:79], v[32:47]
  ; GCN-NEXT:    ds_read_b128 v[72:75], v106 offset:1024
  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
  ; GCN-NEXT:    buffer_inv sc0 sc1
  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[72:73], v[76:77], v[16:31]
  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[74:75], v[78:79], v[16:31]
  ; GCN-NEXT:    ds_read_b128 v[72:75], v106 offset:1536
  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
  ; GCN-NEXT:    buffer_inv sc0 sc1
  ; GCN-NEXT:    ;;#ASMSTART
  ; GCN-NEXT:    s_waitcnt vmcnt(8)
  ; GCN-NEXT:    ;;#ASMEND
  ; GCN-NEXT:    buffer_wbl2 sc0 sc1
  ; GCN-NEXT:    ds_write_b128 v95, v[64:67]
  ; GCN-NEXT:    buffer_wbl2 sc0 sc1
  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
  ; GCN-NEXT:    ds_write_b128 v95, v[68:71] offset:1024
  ; GCN-NEXT:    ;;#ASMSTART
  ; GCN-NEXT:    s_waitcnt vmcnt(8)
  ; GCN-NEXT:    ;;#ASMEND
  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
  ; GCN-NEXT:    ds_read_b128 v[64:67], v94
  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
  ; GCN-NEXT:    buffer_inv sc0 sc1
  ; GCN-NEXT:    ds_read_b128 v[90:93], v94 offset:512
  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
  ; GCN-NEXT:    buffer_inv sc0 sc1
  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[72:73], v[76:77], v[0:15]
  ; GCN-NEXT:    ; implicit-def: $vgpr68_vgpr69_vgpr70_vgpr71
  ; GCN-NEXT:    ds_read_b128 v[84:87], v94 offset:1024
  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
  ; GCN-NEXT:    buffer_inv sc0 sc1
  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[64:65], v[68:69], v[48:63]
  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[74:75], v[78:79], v[0:15]
  ; GCN-NEXT:    ds_read_b128 v[76:79], v94 offset:1536
  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
  ; GCN-NEXT:    buffer_inv sc0 sc1
  ; GCN-NEXT:    ds_read_b128 v[94:97], v106
  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
  ; GCN-NEXT:    buffer_inv sc0 sc1
  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[66:67], v[70:71], v[48:63]
  ; GCN-NEXT:    ; implicit-def: $vgpr64_vgpr65_vgpr66_vgpr67
  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[90:91], v[68:69], v[32:47]
  ; GCN-NEXT:    ds_read_b128 v[88:91], v106 offset:512
  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
  ; GCN-NEXT:    buffer_inv sc0 sc1
  ; GCN-NEXT:    ds_read_b128 v[80:83], v106 offset:1024
  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
  ; GCN-NEXT:    buffer_inv sc0 sc1
  ; GCN-NEXT:    ds_read_b128 v[72:75], v106 offset:1536
  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
  ; GCN-NEXT:    buffer_inv sc0 sc1
  ; GCN-NEXT:    ;;#ASMSTART
  ; GCN-NEXT:    s_waitcnt vmcnt(8)
  ; GCN-NEXT:    ;;#ASMEND
  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[94:95], v[64:65], v[48:63]
  ; GCN-NEXT:    v_perm_b32 v94, v102, v98, s5
  ; GCN-NEXT:    v_perm_b32 v98, v102, v98, s8
  ; GCN-NEXT:    v_perm_b32 v102, v103, v99, s5
  ; GCN-NEXT:    v_perm_b32 v95, v104, v100, s5
  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[92:93], v[70:71], v[32:47]
  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[84:85], v[68:69], v[16:31]
  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[96:97], v[66:67], v[48:63]
  ; GCN-NEXT:    v_perm_b32 v96, v103, v99, s8
  ; GCN-NEXT:    v_perm_b32 v99, v104, v100, s8
  ; GCN-NEXT:    v_perm_b32 v103, v105, v101, s5
  ; GCN-NEXT:    v_perm_b32 v97, v105, v101, s8
  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[88:89], v[64:65], v[32:47]
  ; GCN-NEXT:    s_nop 5
  ; GCN-NEXT:    v_mul_f32_e32 v100, s4, v48
  ; GCN-NEXT:    v_mul_f32_e32 v101, s4, v49
  ; GCN-NEXT:    v_max3_f32 v92, v100, s6, v101
  ; GCN-NEXT:    v_mul_f32_e32 v93, s4, v50
  ; GCN-NEXT:    v_mul_f32_e32 v100, s4, v51
  ; GCN-NEXT:    v_max3_f32 v92, v92, v93, v100
  ; GCN-NEXT:    v_mul_f32_e32 v93, s4, v52
  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[86:87], v[70:71], v[16:31]
  ; GCN-NEXT:    v_mul_f32_e32 v100, s4, v53
  ; GCN-NEXT:    v_max3_f32 v92, v92, v93, v100
  ; GCN-NEXT:    v_mul_f32_e32 v84, s4, v54
  ; GCN-NEXT:    v_mul_f32_e32 v85, s4, v55
  ; GCN-NEXT:    v_max3_f32 v84, v92, v84, v85
  ; GCN-NEXT:    v_mul_f32_e32 v85, s4, v56
  ; GCN-NEXT:    v_mul_f32_e32 v92, s4, v57
  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[76:77], v[68:69], v[0:15]
  ; GCN-NEXT:    v_max3_f32 v84, v84, v85, v92
  ; GCN-NEXT:    v_mul_f32_e32 v85, s4, v58
  ; GCN-NEXT:    v_mul_f32_e32 v88, s4, v59
  ; GCN-NEXT:    v_max3_f32 v84, v84, v85, v88
  ; GCN-NEXT:    v_mul_f32_e32 v85, s4, v60
  ; GCN-NEXT:    v_mul_f32_e32 v88, s4, v61
  ; GCN-NEXT:    v_max3_f32 v84, v84, v85, v88
  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[90:91], v[66:67], v[32:47]
  ; GCN-NEXT:    v_mul_f32_e32 v85, s4, v62
  ; GCN-NEXT:    v_mul_f32_e32 v88, s4, v63
  ; GCN-NEXT:    v_max3_f32 v84, v84, v85, v88
  ; GCN-NEXT:    ; implicit-def: $sgpr6
  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[80:81], v[64:65], v[16:31]
  ; GCN-NEXT:    s_nop 6
  ; GCN-NEXT:    v_mul_f32_e32 v85, s4, v32
  ; GCN-NEXT:    v_mul_f32_e32 v88, s4, v33
  ; GCN-NEXT:    v_max3_f32 v84, v84, v85, v88
  ; GCN-NEXT:    v_mul_f32_e32 v85, s4, v34
  ; GCN-NEXT:    v_mul_f32_e32 v88, s4, v35
  ; GCN-NEXT:    v_max3_f32 v84, v84, v85, v88
  ; GCN-NEXT:    v_mul_f32_e32 v85, s4, v36
  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[78:79], v[70:71], v[0:15]
  ; GCN-NEXT:    v_mul_f32_e32 v86, s4, v37
  ; GCN-NEXT:    v_max3_f32 v84, v84, v85, v86
  ; GCN-NEXT:    v_mul_f32_e32 v85, s4, v38
  ; GCN-NEXT:    v_mul_f32_e32 v86, s4, v39
  ; GCN-NEXT:    v_max3_f32 v84, v84, v85, v86
  ; GCN-NEXT:    v_mul_f32_e32 v85, s4, v40
  ; GCN-NEXT:    v_mul_f32_e32 v80, s4, v41
  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[82:83], v[66:67], v[16:31]
  ; GCN-NEXT:    v_max3_f32 v80, v84, v85, v80
  ; GCN-NEXT:    v_mul_f32_e32 v81, s4, v42
  ; GCN-NEXT:    v_mul_f32_e32 v84, s4, v43
  ; GCN-NEXT:    v_max3_f32 v80, v80, v81, v84
  ; GCN-NEXT:    v_mul_f32_e32 v81, s4, v44
  ; GCN-NEXT:    v_mul_f32_e32 v84, s4, v45
  ; GCN-NEXT:    v_max3_f32 v80, v80, v81, v84
  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[72:73], v[64:65], v[0:15]
  ; GCN-NEXT:    v_mul_f32_e32 v81, s4, v46
  ; GCN-NEXT:    v_mul_f32_e32 v82, s4, v47
  ; GCN-NEXT:    v_max3_f32 v80, v80, v81, v82
  ; GCN-NEXT:    v_mul_f32_e32 v81, s4, v16
  ; GCN-NEXT:    v_mul_f32_e32 v82, s4, v17
  ; GCN-NEXT:    v_max3_f32 v80, v80, v81, v82
  ; GCN-NEXT:    v_mul_f32_e32 v68, s4, v18
  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[74:75], v[66:67], v[0:15]
  ; GCN-NEXT:    v_mul_f32_e32 v69, s4, v19
  ; GCN-NEXT:    v_max3_f32 v68, v80, v68, v69
  ; GCN-NEXT:    v_mul_f32_e32 v69, s4, v20
  ; GCN-NEXT:    v_mul_f32_e32 v76, s4, v21
  ; GCN-NEXT:    v_max3_f32 v68, v68, v69, v76
  ; GCN-NEXT:    v_mul_f32_e32 v69, s4, v22
  ; GCN-NEXT:    v_mul_f32_e32 v70, s4, v23
  ; GCN-NEXT:    v_max3_f32 v68, v68, v69, v70
  ; GCN-NEXT:    v_mul_f32_e32 v69, s4, v24
  ; GCN-NEXT:    v_mul_f32_e32 v70, s4, v25
  ; GCN-NEXT:    v_max3_f32 v68, v68, v69, v70
  ; GCN-NEXT:    v_mul_f32_e32 v69, s4, v26
  ; GCN-NEXT:    v_mul_f32_e32 v70, s4, v27
  ; GCN-NEXT:    v_max3_f32 v64, v68, v69, v70
  ; GCN-NEXT:    v_mul_f32_e32 v65, s4, v28
  ; GCN-NEXT:    v_mul_f32_e32 v68, s4, v29
  ; GCN-NEXT:    v_max3_f32 v64, v64, v65, v68
  ; GCN-NEXT:    v_mul_f32_e32 v65, s4, v30
  ; GCN-NEXT:    v_mul_f32_e32 v68, s4, v31
  ; GCN-NEXT:    v_max3_f32 v64, v64, v65, v68
  ; GCN-NEXT:    v_mul_f32_e32 v65, s4, v0
  ; GCN-NEXT:    v_mul_f32_e32 v66, s4, v1
  ; GCN-NEXT:    v_max3_f32 v64, v64, v65, v66
  ; GCN-NEXT:    v_mul_f32_e32 v65, s4, v2
  ; GCN-NEXT:    v_mul_f32_e32 v66, s4, v3
  ; GCN-NEXT:    v_max3_f32 v64, v64, v65, v66
  ; GCN-NEXT:    v_mul_f32_e32 v65, s4, v4
  ; GCN-NEXT:    v_mul_f32_e32 v66, s4, v5
  ; GCN-NEXT:    v_max3_f32 v64, v64, v65, v66
  ; GCN-NEXT:    v_mul_f32_e32 v65, s4, v6
  ; GCN-NEXT:    v_mul_f32_e32 v66, s4, v7
  ; GCN-NEXT:    v_max3_f32 v64, v64, v65, v66
  ; GCN-NEXT:    v_mul_f32_e32 v65, s4, v8
  ; GCN-NEXT:    v_mul_f32_e32 v66, s4, v9
  ; GCN-NEXT:    v_max3_f32 v64, v64, v65, v66
  ; GCN-NEXT:    v_mul_f32_e32 v65, s4, v10
  ; GCN-NEXT:    v_mul_f32_e32 v66, s4, v11
  ; GCN-NEXT:    v_max3_f32 v64, v64, v65, v66
  ; GCN-NEXT:    v_mul_f32_e32 v65, s4, v12
  ; GCN-NEXT:    v_mul_f32_e32 v66, s4, v13
  ; GCN-NEXT:    v_max3_f32 v64, v64, v65, v66
  ; GCN-NEXT:    v_mul_f32_e32 v65, s4, v14
  ; GCN-NEXT:    v_mul_f32_e32 v66, s4, v15
  ; GCN-NEXT:    v_max3_f32 v64, v64, v65, v66
  ; GCN-NEXT:    ; implicit-def: $vgpr65
  ; GCN-NEXT:    ; implicit-def: $vgpr66
  ; GCN-NEXT:    ; implicit-def: $vgpr68
  ; GCN-NEXT:    ; implicit-def: $vgpr67
  ; GCN-NEXT:    v_add_u32_e32 v65, s7, v65
  ; GCN-NEXT:    v_and_b32_e32 v65, 0x1fffffff, v65
  ; GCN-NEXT:    v_mul_lo_u32 v65, v65, s6
  ; GCN-NEXT:    v_add_lshl_u32 v135, v66, v65, 1
  ; GCN-NEXT:    ds_bpermute_b32 v65, v133, v64
  ; GCN-NEXT:    ; implicit-def: $vgpr66
  ; GCN-NEXT:    v_lshl_add_u32 v136, v66, 1, v135
  ; GCN-NEXT:    ; implicit-def: $vgpr66
  ; GCN-NEXT:    v_lshl_add_u32 v137, v66, 1, v136
  ; GCN-NEXT:    ; implicit-def: $vgpr66
  ; GCN-NEXT:    ; implicit-def: $sgpr6_sgpr7
  ; GCN-NEXT:    v_lshl_add_u32 v138, v66, 1, v137
  ; GCN-NEXT:    buffer_wbl2 sc0 sc1
  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
  ; GCN-NEXT:    ds_write_b64 v135, v[94:95]
  ; GCN-NEXT:    v_max_f32_e32 v65, v65, v65
  ; GCN-NEXT:    v_max_f32_e32 v64, v64, v65
  ; GCN-NEXT:    ds_bpermute_b32 v65, v133, v64
  ; GCN-NEXT:    buffer_wbl2 sc0 sc1
  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
  ; GCN-NEXT:    ds_write_b64 v136, v[98:99]
  ; GCN-NEXT:    buffer_wbl2 sc0 sc1
  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
  ; GCN-NEXT:    ds_write_b64 v137, v[102:103]
  ; GCN-NEXT:    buffer_wbl2 sc0 sc1
  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
  ; GCN-NEXT:    ds_write_b64 v138, v[96:97]
  ; GCN-NEXT:    v_add_u32_e32 v68, v132, v68
  ; GCN-NEXT:    v_cndmask_b32_e64 v64, v65, v64, s[6:7]
  ; GCN-NEXT:    v_max_f32_e32 v64, v64, v64
  ; GCN-NEXT:    ; implicit-def: $vgpr65
  ; GCN-NEXT:    v_max_f32_e32 v66, v65, v65
  ; GCN-NEXT:    v_max_f32_e32 v134, v66, v64
  ; GCN-NEXT:    ; implicit-def: $vgpr64
  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
  ; GCN-NEXT:    buffer_load_dwordx2 v[156:157], v68, s[0:3], 0 offen sc0 sc1
  ; GCN-NEXT:    s_waitcnt vmcnt(0)
  ; GCN-NEXT:    buffer_inv sc0 sc1
  ; GCN-NEXT:    v_add_u32_e32 v64, v132, v64
  ; GCN-NEXT:    buffer_load_dwordx2 v[158:159], v64, s[0:3], 0 offen sc0 sc1
  ; GCN-NEXT:    s_waitcnt vmcnt(0)
  ; GCN-NEXT:    buffer_inv sc0 sc1
  ; GCN-NEXT:    ; implicit-def: $vgpr66
  ; GCN-NEXT:    v_add_u32_e32 v64, v132, v66
  ; GCN-NEXT:    buffer_load_dwordx2 v[128:129], v64, s[0:3], 0 offen sc0 sc1
  ; GCN-NEXT:    s_waitcnt vmcnt(0)
  ; GCN-NEXT:    buffer_inv sc0 sc1
  ; GCN-NEXT:    v_add_u32_e32 v64, v132, v67
  ; GCN-NEXT:    buffer_load_dwordx2 v[130:131], v64, s[0:3], 0 offen sc0 sc1
  ; GCN-NEXT:    s_waitcnt vmcnt(0)
  ; GCN-NEXT:    buffer_inv sc0 sc1
  ; GCN-NEXT:    v_fma_f32 v57, s4, v57, -v134
  ; GCN-NEXT:    v_fma_f32 v48, s4, v48, -v134
  ; GCN-NEXT:    v_fma_f32 v96, s4, v58, -v134
  ; GCN-NEXT:    v_mul_f32_e32 v57, 0x3fb8aa3b, v57
  ; GCN-NEXT:    v_mul_f32_e32 v48, 0x3fb8aa3b, v48
  ; GCN-NEXT:    v_fma_f32 v64, s4, v49, -v134
  ; GCN-NEXT:    v_exp_f32_e32 v163, v57
  ; GCN-NEXT:    v_mul_f32_e32 v57, 0x3fb8aa3b, v96
  ; GCN-NEXT:    v_fma_f32 v66, s4, v50, -v134
  ; GCN-NEXT:    v_exp_f32_e32 v164, v57
  ; GCN-NEXT:    v_exp_f32_e32 v49, v48
  ; GCN-NEXT:    v_mul_f32_e32 v48, 0x3fb8aa3b, v64
  ; GCN-NEXT:    v_fma_f32 v67, s4, v51, -v134
  ; GCN-NEXT:    v_exp_f32_e32 v50, v48
  ; GCN-NEXT:    v_mul_f32_e32 v48, 0x3fb8aa3b, v66
  ; GCN-NEXT:    v_fma_f32 v68, s4, v52, -v134
  ; GCN-NEXT:    v_exp_f32_e32 v51, v48
  ; GCN-NEXT:    v_mul_f32_e32 v48, 0x3fb8aa3b, v67
  ; GCN-NEXT:    v_fma_f32 v69, s4, v53, -v134
  ; GCN-NEXT:    v_exp_f32_e32 v52, v48
  ; GCN-NEXT:    v_mul_f32_e32 v48, 0x3fb8aa3b, v68
  ; GCN-NEXT:    ;;#ASMSTART
  ; GCN-NEXT:    s_waitcnt vmcnt(8)
  ; GCN-NEXT:    ;;#ASMEND
  ; GCN-NEXT:    v_fma_f32 v70, s4, v54, -v134
  ; GCN-NEXT:    v_exp_f32_e32 v53, v48
  ; GCN-NEXT:    v_mul_f32_e32 v48, 0x3fb8aa3b, v69
  ; GCN-NEXT:    v_fma_f32 v71, s4, v55, -v134
  ; GCN-NEXT:    ds_read_b128 v[140:143], v139
  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
  ; GCN-NEXT:    buffer_inv sc0 sc1
  ; GCN-NEXT:    v_exp_f32_e32 v54, v48
  ; GCN-NEXT:    v_mul_f32_e32 v48, 0x3fb8aa3b, v70
  ; GCN-NEXT:    v_exp_f32_e32 v55, v48
  ; GCN-NEXT:    v_mul_f32_e32 v48, 0x3fb8aa3b, v71
  ; GCN-NEXT:    ds_read_b128 v[144:147], v139 offset:576
  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
  ; GCN-NEXT:    buffer_inv sc0 sc1
  ; GCN-NEXT:    v_fma_f32 v66, s4, v56, -v134
  ; GCN-NEXT:    v_exp_f32_e32 v56, v48
  ; GCN-NEXT:    v_sub_f32_e32 v48, v65, v134
  ; GCN-NEXT:    v_cvt_f16_f32_e32 v64, v49
  ; GCN-NEXT:    v_cvt_f16_f32_e32 v67, v50
  ; GCN-NEXT:    v_cvt_f16_f32_e32 v68, v51
  ; GCN-NEXT:    v_cvt_f16_f32_e32 v58, v52
  ; GCN-NEXT:    v_mul_f32_e32 v48, 0x3fb8aa3b, v48
  ; GCN-NEXT:    ds_read_b128 v[148:151], v139 offset:1152
  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
  ; GCN-NEXT:    buffer_inv sc0 sc1
  ; GCN-NEXT:    v_exp_f32_e32 v48, v48
  ; GCN-NEXT:    v_pack_b32_f16 v161, v68, v58
  ; GCN-NEXT:    v_pack_b32_f16 v160, v64, v67
  ; GCN-NEXT:    v_mul_f32_e32 v58, 0x3fb8aa3b, v66
  ; GCN-NEXT:    ; implicit-def: $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79
  ; GCN-NEXT:    ds_read_b128 v[152:155], v139 offset:1728
  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
  ; GCN-NEXT:    buffer_inv sc0 sc1
  ; GCN-NEXT:    v_fma_f32 v162, s4, v61, -v134
  ; GCN-NEXT:    v_cvt_f16_f32_e32 v61, v55
  ; GCN-NEXT:    v_cvt_f16_f32_e32 v57, v56
  ; GCN-NEXT:    v_pk_mul_f32 v[64:65], v[64:65], v[48:49] op_sel_hi:[1,0]
  ; GCN-NEXT:    v_pk_mul_f32 v[66:67], v[66:67], v[48:49] op_sel_hi:[1,0]
  ; GCN-NEXT:    v_pk_mul_f32 v[68:69], v[68:69], v[48:49] op_sel_hi:[1,0]
  ; GCN-NEXT:    v_pk_mul_f32 v[70:71], v[70:71], v[48:49] op_sel_hi:[1,0]
  ; GCN-NEXT:    v_pk_mul_f32 v[72:73], v[72:73], v[48:49] op_sel_hi:[1,0]
  ; GCN-NEXT:    v_pk_mul_f32 v[74:75], v[74:75], v[48:49] op_sel_hi:[1,0]
  ; GCN-NEXT:    v_pk_mul_f32 v[76:77], v[76:77], v[48:49] op_sel_hi:[1,0]
  ; GCN-NEXT:    v_pk_mul_f32 v[78:79], v[78:79], v[48:49] op_sel_hi:[1,0]
  ; GCN-NEXT:    ; implicit-def: $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95
  ; GCN-NEXT:    v_fma_f32 v59, s4, v59, -v134
  ; GCN-NEXT:    v_pk_mul_f32 v[80:81], v[80:81], v[48:49] op_sel_hi:[1,0]
  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[140:141], v[160:161], v[64:79]
  ; GCN-NEXT:    v_pk_mul_f32 v[82:83], v[82:83], v[48:49] op_sel_hi:[1,0]
  ; GCN-NEXT:    v_pk_mul_f32 v[84:85], v[84:85], v[48:49] op_sel_hi:[1,0]
  ; GCN-NEXT:    v_pk_mul_f32 v[86:87], v[86:87], v[48:49] op_sel_hi:[1,0]
  ; GCN-NEXT:    v_pk_mul_f32 v[88:89], v[88:89], v[48:49] op_sel_hi:[1,0]
  ; GCN-NEXT:    v_pk_mul_f32 v[90:91], v[90:91], v[48:49] op_sel_hi:[1,0]
  ; GCN-NEXT:    v_pk_mul_f32 v[92:93], v[92:93], v[48:49] op_sel_hi:[1,0]
  ; GCN-NEXT:    v_pk_mul_f32 v[94:95], v[94:95], v[48:49] op_sel_hi:[1,0]
  ; GCN-NEXT:    ; implicit-def: $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111
  ; GCN-NEXT:    v_exp_f32_e32 v58, v58
  ; GCN-NEXT:    v_pk_mul_f32 v[96:97], v[96:97], v[48:49] op_sel_hi:[1,0]
  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[144:145], v[160:161], v[80:95]
  ; GCN-NEXT:    v_pk_mul_f32 v[98:99], v[98:99], v[48:49] op_sel_hi:[1,0]
  ; GCN-NEXT:    v_pk_mul_f32 v[100:101], v[100:101], v[48:49] op_sel_hi:[1,0]
  ; GCN-NEXT:    v_pk_mul_f32 v[102:103], v[102:103], v[48:49] op_sel_hi:[1,0]
  ; GCN-NEXT:    v_pk_mul_f32 v[104:105], v[104:105], v[48:49] op_sel_hi:[1,0]
  ; GCN-NEXT:    v_pk_mul_f32 v[106:107], v[106:107], v[48:49] op_sel_hi:[1,0]
  ; GCN-NEXT:    v_pk_mul_f32 v[108:109], v[108:109], v[48:49] op_sel_hi:[1,0]
  ; GCN-NEXT:    v_pk_mul_f32 v[110:111], v[110:111], v[48:49] op_sel_hi:[1,0]
  ; GCN-NEXT:    v_pack_b32_f16 v145, v61, v57
  ; GCN-NEXT:    v_mul_f32_e32 v57, 0x3fb8aa3b, v59
  ; GCN-NEXT:    v_cvt_f16_f32_e32 v140, v53
  ; GCN-NEXT:    v_cvt_f16_f32_e32 v141, v54
  ; GCN-NEXT:    v_exp_f32_e32 v59, v57
  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[148:149], v[160:161], v[96:111]
  ; GCN-NEXT:    v_fma_f32 v60, s4, v60, -v134
  ; GCN-NEXT:    v_pk_mul_f32 v[112:113], v[112:113], v[48:49] op_sel_hi:[1,0]
  ; GCN-NEXT:    v_pk_mul_f32 v[114:115], v[114:115], v[48:49] op_sel_hi:[1,0]
  ; GCN-NEXT:    v_pk_mul_f32 v[116:117], v[116:117], v[48:49] op_sel_hi:[1,0]
  ; GCN-NEXT:    v_pk_mul_f32 v[118:119], v[118:119], v[48:49] op_sel_hi:[1,0]
  ; GCN-NEXT:    v_pk_mul_f32 v[120:121], v[120:121], v[48:49] op_sel_hi:[1,0]
  ; GCN-NEXT:    v_pk_mul_f32 v[122:123], v[122:123], v[48:49] op_sel_hi:[1,0]
  ; GCN-NEXT:    v_pk_mul_f32 v[124:125], v[124:125], v[48:49] op_sel_hi:[1,0]
  ; GCN-NEXT:    v_pk_mul_f32 v[126:127], v[126:127], v[48:49] op_sel_hi:[1,0]
  ; GCN-NEXT:    v_fma_f32 v148, s4, v62, -v134
  ; GCN-NEXT:    v_pack_b32_f16 v144, v140, v141
  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[112:127], v[152:153], v[160:161], v[112:127]
  ; GCN-NEXT:    v_fma_f32 v152, s4, v63, -v134
  ; GCN-NEXT:    v_mul_f32_e32 v149, 0x3fb8aa3b, v60
  ; GCN-NEXT:    ; implicit-def: $vgpr57
  ; GCN-NEXT:    ds_read_b128 v[60:63], v57
  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
  ; GCN-NEXT:    buffer_inv sc0 sc1
  ; GCN-NEXT:    v_exp_f32_e32 v160, v149
  ; GCN-NEXT:    v_fma_f32 v161, s4, v33, -v134
  ; GCN-NEXT:    v_mul_f32_e32 v33, 0x3fb8aa3b, v148
  ; GCN-NEXT:    v_cvt_f16_f32_e32 v153, v58
  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[142:143], v[144:145], v[64:79]
  ; GCN-NEXT:    v_fma_f32 v32, s4, v32, -v134
  ; GCN-NEXT:    ds_read_b128 v[140:143], v57 offset:576
  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
  ; GCN-NEXT:    buffer_inv sc0 sc1
  ; GCN-NEXT:    v_fma_f32 v40, s4, v40, -v134
  ; GCN-NEXT:    v_fma_f32 v44, s4, v44, -v134
  ; GCN-NEXT:    v_fma_f32 v16, s4, v16, -v134
  ; GCN-NEXT:    v_fma_f32 v166, s4, v20, -v134
  ; GCN-NEXT:    v_fma_f32 v24, s4, v24, -v134
  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[146:147], v[144:145], v[80:95]
  ; GCN-NEXT:    v_mul_f32_e32 v146, 0x3fb8aa3b, v162
  ; GCN-NEXT:    v_cvt_f16_f32_e32 v147, v163
  ; GCN-NEXT:    v_exp_f32_e32 v162, v146
  ; GCN-NEXT:    v_cvt_f16_f32_e32 v146, v164
  ; GCN-NEXT:    v_fma_f32 v28, s4, v28, -v134
  ; GCN-NEXT:    v_pack_b32_f16 v148, v153, v147
  ; GCN-NEXT:    v_fma_f32 v0, s4, v0, -v134
  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[150:151], v[144:145], v[96:111]
  ; GCN-NEXT:    v_exp_f32_e32 v151, v33
  ; GCN-NEXT:    v_cvt_f16_f32_e32 v33, v59
  ; GCN-NEXT:    v_fma_f32 v150, s4, v34, -v134
  ; GCN-NEXT:    v_fma_f32 v8, s4, v8, -v134
  ; GCN-NEXT:    v_fma_f32 v12, s4, v12, -v134
  ; GCN-NEXT:    v_pack_b32_f16 v149, v146, v33
  ; GCN-NEXT:    v_mul_f32_e32 v33, 0x3fb8aa3b, v152
  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[112:127], v[154:155], v[144:145], v[112:127]
  ; GCN-NEXT:    v_fma_f32 v152, s4, v35, -v134
  ; GCN-NEXT:    v_exp_f32_e32 v153, v33
  ; GCN-NEXT:    v_fma_f32 v155, s4, v36, -v134
  ; GCN-NEXT:    v_perm_b32 v36, v158, v156, s5
  ; GCN-NEXT:    v_cvt_f16_f32_e32 v154, v160
  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[60:61], v[148:149], v[64:79]
  ; GCN-NEXT:    v_mul_f32_e32 v60, 0x3fb8aa3b, v32
  ; GCN-NEXT:    ds_read_b128 v[32:35], v57 offset:1152
  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
  ; GCN-NEXT:    buffer_inv sc0 sc1
  ; GCN-NEXT:    ds_read_b128 v[144:147], v57 offset:1728
  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
  ; GCN-NEXT:    buffer_inv sc0 sc1
  ; GCN-NEXT:    v_mul_f32_e32 v61, 0x3fb8aa3b, v161
  ; GCN-NEXT:    v_exp_f32_e32 v165, v60
  ; GCN-NEXT:    v_perm_b32 v60, v158, v156, s8
  ; GCN-NEXT:    v_fma_f32 v158, s4, v37, -v134
  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[140:141], v[148:149], v[80:95]
  ; GCN-NEXT:    v_exp_f32_e32 v161, v61
  ; GCN-NEXT:    v_perm_b32 v140, v159, v157, s8
  ; GCN-NEXT:    v_perm_b32 v37, v130, v128, s5
  ; GCN-NEXT:    v_perm_b32 v61, v130, v128, s8
  ; GCN-NEXT:    v_perm_b32 v141, v131, v129, s8
  ; GCN-NEXT:    ;;#ASMSTART
  ; GCN-NEXT:    s_waitcnt vmcnt(8)
  ; GCN-NEXT:    ;;#ASMEND
  ; GCN-NEXT:    buffer_wbl2 sc0 sc1
  ; GCN-NEXT:    ds_write_b64 v135, v[36:37]
  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[32:33], v[148:149], v[96:111]
  ; GCN-NEXT:    v_perm_b32 v32, v159, v157, s5
  ; GCN-NEXT:    v_mul_f32_e32 v33, 0x3fb8aa3b, v150
  ; GCN-NEXT:    v_cvt_f16_f32_e32 v150, v151
  ; GCN-NEXT:    v_fma_f32 v157, s4, v38, -v134
  ; GCN-NEXT:    v_cvt_f16_f32_e32 v38, v153
  ; GCN-NEXT:    v_exp_f32_e32 v159, v33
  ; GCN-NEXT:    v_perm_b32 v33, v131, v129, s5
  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[112:127], v[144:145], v[148:149], v[112:127]
  ; GCN-NEXT:    v_pack_b32_f16 v129, v150, v38
  ; GCN-NEXT:    v_mul_f32_e32 v38, 0x3fb8aa3b, v152
  ; GCN-NEXT:    v_exp_f32_e32 v152, v38
  ; GCN-NEXT:    buffer_wbl2 sc0 sc1
  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
  ; GCN-NEXT:    ds_write_b64 v136, v[60:61]
  ; GCN-NEXT:    buffer_wbl2 sc0 sc1
  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
  ; GCN-NEXT:    ds_write_b64 v137, v[32:33]
  ; GCN-NEXT:    ; implicit-def: $vgpr33
  ; GCN-NEXT:    ; implicit-def: $vgpr38
  ; GCN-NEXT:    buffer_wbl2 sc0 sc1
  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
  ; GCN-NEXT:    ds_write_b64 v138, v[140:141]
  ; GCN-NEXT:    v_add_u32_e32 v38, v132, v38
  ; GCN-NEXT:    v_add_u32_e32 v33, v132, v33
  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
  ; GCN-NEXT:    buffer_load_dwordx2 v[130:131], v38, s[0:3], 0 offen sc0 sc1
  ; GCN-NEXT:    s_waitcnt vmcnt(0)
  ; GCN-NEXT:    buffer_inv sc0 sc1
  ; GCN-NEXT:    buffer_load_dwordx2 v[140:141], v33, s[0:3], 0 offen sc0 sc1
  ; GCN-NEXT:    s_waitcnt vmcnt(0)
  ; GCN-NEXT:    buffer_inv sc0 sc1
  ; GCN-NEXT:    ; implicit-def: $vgpr36
  ; GCN-NEXT:    v_add_u32_e32 v33, v132, v36
  ; GCN-NEXT:    ; implicit-def: $vgpr37
  ; GCN-NEXT:    buffer_load_dwordx2 v[144:145], v33, s[0:3], 0 offen sc0 sc1
  ; GCN-NEXT:    s_waitcnt vmcnt(0)
  ; GCN-NEXT:    buffer_inv sc0 sc1
  ; GCN-NEXT:    v_add_u32_e32 v33, v132, v37
  ; GCN-NEXT:    buffer_load_dwordx2 v[148:149], v33, s[0:3], 0 offen sc0 sc1
  ; GCN-NEXT:    s_waitcnt vmcnt(0)
  ; GCN-NEXT:    buffer_inv sc0 sc1
  ; GCN-NEXT:    v_cvt_f16_f32_e32 v156, v162
  ; GCN-NEXT:    v_mul_f32_e32 v32, 0x3fb8aa3b, v155
  ; GCN-NEXT:    ;;#ASMSTART
  ; GCN-NEXT:    s_waitcnt vmcnt(8)
  ; GCN-NEXT:    ;;#ASMEND
  ; GCN-NEXT:    v_cvt_f16_f32_e32 v33, v165
  ; GCN-NEXT:    v_pack_b32_f16 v128, v154, v156
  ; GCN-NEXT:    v_fma_f32 v150, s4, v39, -v134
  ; GCN-NEXT:    ds_read_b128 v[36:39], v139
  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
  ; GCN-NEXT:    buffer_inv sc0 sc1
  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[62:63], v[128:129], v[64:79]
  ; GCN-NEXT:    v_exp_f32_e32 v154, v32
  ; GCN-NEXT:    v_mul_f32_e32 v32, 0x3fb8aa3b, v158
  ; GCN-NEXT:    ds_read_b128 v[60:63], v139 offset:576
  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
  ; GCN-NEXT:    buffer_inv sc0 sc1
  ; GCN-NEXT:    v_fma_f32 v156, s4, v42, -v134
  ; GCN-NEXT:    v_perm_b32 v20, v140, v130, s5
  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[142:143], v[128:129], v[80:95]
  ; GCN-NEXT:    v_exp_f32_e32 v155, v32
  ; GCN-NEXT:    v_mul_f32_e32 v32, 0x3fb8aa3b, v157
  ; GCN-NEXT:    v_cvt_f16_f32_e32 v142, v161
  ; GCN-NEXT:    v_fma_f32 v143, s4, v41, -v134
  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[34:35], v[128:129], v[96:111]
  ; GCN-NEXT:    v_cvt_f16_f32_e32 v34, v159
  ; GCN-NEXT:    v_exp_f32_e32 v157, v32
  ; GCN-NEXT:    v_cvt_f16_f32_e32 v32, v152
  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[112:127], v[146:147], v[128:129], v[112:127]
  ; GCN-NEXT:    v_pack_b32_f16 v129, v34, v32
  ; GCN-NEXT:    v_mul_f32_e32 v32, 0x3fb8aa3b, v150
  ; GCN-NEXT:    v_pack_b32_f16 v128, v33, v142
  ; GCN-NEXT:    v_exp_f32_e32 v146, v32
  ; GCN-NEXT:    ds_read_b128 v[32:35], v139 offset:1152
  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
  ; GCN-NEXT:    buffer_inv sc0 sc1
  ; GCN-NEXT:    v_fma_f32 v142, s4, v43, -v134
  ; GCN-NEXT:    v_fma_f32 v150, s4, v46, -v134
  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[36:37], v[128:129], v[64:79]
  ; GCN-NEXT:    v_mul_f32_e32 v36, 0x3fb8aa3b, v40
  ; GCN-NEXT:    ds_read_b128 v[40:43], v139 offset:1728
  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
  ; GCN-NEXT:    buffer_inv sc0 sc1
  ; GCN-NEXT:    v_exp_f32_e32 v147, v36
  ; GCN-NEXT:    v_mul_f32_e32 v36, 0x3fb8aa3b, v143
  ; GCN-NEXT:    v_cvt_f16_f32_e32 v37, v154
  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[60:61], v[128:129], v[80:95]
  ; GCN-NEXT:    v_exp_f32_e32 v143, v36
  ; GCN-NEXT:    v_cvt_f16_f32_e32 v60, v155
  ; GCN-NEXT:    v_mul_f32_e32 v36, 0x3fb8aa3b, v142
  ; GCN-NEXT:    v_fma_f32 v61, s4, v45, -v134
  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[32:33], v[128:129], v[96:111]
  ; GCN-NEXT:    v_mul_f32_e32 v32, 0x3fb8aa3b, v156
  ; GCN-NEXT:    v_cvt_f16_f32_e32 v33, v157
  ; GCN-NEXT:    v_exp_f32_e32 v156, v32
  ; GCN-NEXT:    v_cvt_f16_f32_e32 v32, v146
  ; GCN-NEXT:    v_pack_b32_f16 v33, v33, v32
  ; GCN-NEXT:    v_pack_b32_f16 v32, v37, v60
  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[112:127], v[40:41], v[128:129], v[112:127]
  ; GCN-NEXT:    v_exp_f32_e32 v129, v36
  ; GCN-NEXT:    v_mul_f32_e32 v40, 0x3fb8aa3b, v44
  ; GCN-NEXT:    v_cvt_f16_f32_e32 v60, v147
  ; GCN-NEXT:    v_fma_f32 v128, s4, v47, -v134
  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[38:39], v[32:33], v[64:79]
  ; GCN-NEXT:    ds_read_b128 v[36:39], v57
  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
  ; GCN-NEXT:    buffer_inv sc0 sc1
  ; GCN-NEXT:    v_exp_f32_e32 v142, v40
  ; GCN-NEXT:    v_mul_f32_e32 v40, 0x3fb8aa3b, v61
  ; GCN-NEXT:    v_cvt_f16_f32_e32 v61, v143
  ; GCN-NEXT:    ds_read_b128 v[44:47], v57 offset:576
  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
  ; GCN-NEXT:    buffer_inv sc0 sc1
  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[62:63], v[32:33], v[80:95]
  ; GCN-NEXT:    v_fma_f32 v62, s4, v17, -v134
  ; GCN-NEXT:    v_mul_f32_e32 v17, 0x3fb8aa3b, v150
  ; GCN-NEXT:    v_exp_f32_e32 v63, v40
  ; GCN-NEXT:    v_pack_b32_f16 v40, v60, v61
  ; GCN-NEXT:    v_fma_f32 v150, s4, v18, -v134
  ; GCN-NEXT:    v_fma_f32 v60, s4, v19, -v134
  ; GCN-NEXT:    v_cvt_f16_f32_e32 v61, v142
  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[34:35], v[32:33], v[96:111]
  ; GCN-NEXT:    v_cvt_f16_f32_e32 v34, v156
  ; GCN-NEXT:    v_exp_f32_e32 v158, v17
  ; GCN-NEXT:    v_cvt_f16_f32_e32 v17, v129
  ; GCN-NEXT:    v_pack_b32_f16 v41, v34, v17
  ; GCN-NEXT:    v_mul_f32_e32 v17, 0x3fb8aa3b, v128
  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[112:127], v[42:43], v[32:33], v[112:127]
  ; GCN-NEXT:    v_exp_f32_e32 v128, v17
  ; GCN-NEXT:    v_perm_b32 v42, v141, v131, s8
  ; GCN-NEXT:    v_perm_b32 v43, v149, v145, s8
  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[36:37], v[40:41], v[64:79]
  ; GCN-NEXT:    v_mul_f32_e32 v36, 0x3fb8aa3b, v16
  ; GCN-NEXT:    ds_read_b128 v[16:19], v57 offset:1152
  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
  ; GCN-NEXT:    buffer_inv sc0 sc1
  ; GCN-NEXT:    ds_read_b128 v[32:35], v57 offset:1728
  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
  ; GCN-NEXT:    buffer_inv sc0 sc1
  ; GCN-NEXT:    v_mul_f32_e32 v37, 0x3fb8aa3b, v62
  ; GCN-NEXT:    v_exp_f32_e32 v167, v36
  ; GCN-NEXT:    v_perm_b32 v36, v140, v130, s8
  ; GCN-NEXT:    v_fma_f32 v62, s4, v21, -v134
  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[44:45], v[40:41], v[80:95]
  ; GCN-NEXT:    v_exp_f32_e32 v130, v37
  ; GCN-NEXT:    v_cvt_f16_f32_e32 v45, v158
  ; GCN-NEXT:    v_perm_b32 v21, v148, v144, s5
  ; GCN-NEXT:    v_perm_b32 v37, v148, v144, s8
  ; GCN-NEXT:    v_cvt_f16_f32_e32 v44, v63
  ; GCN-NEXT:    ;;#ASMSTART
  ; GCN-NEXT:    s_waitcnt vmcnt(8)
  ; GCN-NEXT:    ;;#ASMEND
  ; GCN-NEXT:    buffer_wbl2 sc0 sc1
  ; GCN-NEXT:    ds_write_b64 v135, v[20:21]
  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[16:17], v[40:41], v[96:111]
  ; GCN-NEXT:    v_perm_b32 v16, v141, v131, s5
  ; GCN-NEXT:    v_fma_f32 v131, s4, v22, -v134
  ; GCN-NEXT:    v_cvt_f16_f32_e32 v22, v128
  ; GCN-NEXT:    v_mul_f32_e32 v17, 0x3fb8aa3b, v150
  ; GCN-NEXT:    v_exp_f32_e32 v140, v17
  ; GCN-NEXT:    v_perm_b32 v17, v149, v145, s5
  ; GCN-NEXT:    buffer_wbl2 sc0 sc1
  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
  ; GCN-NEXT:    ds_write_b64 v136, v[36:37]
  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[112:127], v[32:33], v[40:41], v[112:127]
  ; GCN-NEXT:    v_pack_b32_f16 v33, v45, v22
  ; GCN-NEXT:    v_mul_f32_e32 v22, 0x3fb8aa3b, v60
  ; GCN-NEXT:    v_exp_f32_e32 v144, v22
  ; GCN-NEXT:    buffer_wbl2 sc0 sc1
  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
  ; GCN-NEXT:    ds_write_b64 v137, v[16:17]
  ; GCN-NEXT:    ; implicit-def: $vgpr17
  ; GCN-NEXT:    ; implicit-def: $vgpr22
  ; GCN-NEXT:    buffer_wbl2 sc0 sc1
  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
  ; GCN-NEXT:    ds_write_b64 v138, v[42:43]
  ; GCN-NEXT:    v_add_u32_e32 v22, v132, v22
  ; GCN-NEXT:    v_add_u32_e32 v17, v132, v17
  ; GCN-NEXT:    ; implicit-def: $vgpr20
  ; GCN-NEXT:    ; implicit-def: $vgpr21
  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
  ; GCN-NEXT:    buffer_load_dwordx2 v[40:41], v22, s[0:3], 0 offen sc0 sc1
  ; GCN-NEXT:    s_waitcnt vmcnt(0)
  ; GCN-NEXT:    buffer_inv sc0 sc1
  ; GCN-NEXT:    buffer_load_dwordx2 v[42:43], v17, s[0:3], 0 offen sc0 sc1
  ; GCN-NEXT:    s_waitcnt vmcnt(0)
  ; GCN-NEXT:    buffer_inv sc0 sc1
  ; GCN-NEXT:    v_add_u32_e32 v20, v132, v20
  ; GCN-NEXT:    v_add_u32_e32 v21, v132, v21
  ; GCN-NEXT:    v_pack_b32_f16 v32, v61, v44
  ; GCN-NEXT:    buffer_load_dwordx2 v[44:45], v20, s[0:3], 0 offen sc0 sc1
  ; GCN-NEXT:    s_waitcnt vmcnt(0)
  ; GCN-NEXT:    buffer_inv sc0 sc1
  ; GCN-NEXT:    buffer_load_dwordx2 v[60:61], v21, s[0:3], 0 offen sc0 sc1
  ; GCN-NEXT:    s_waitcnt vmcnt(0)
  ; GCN-NEXT:    buffer_inv sc0 sc1
  ; GCN-NEXT:    v_mul_f32_e32 v16, 0x3fb8aa3b, v166
  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[38:39], v[32:33], v[64:79]
  ; GCN-NEXT:    v_exp_f32_e32 v132, v16
  ; GCN-NEXT:    v_mul_f32_e32 v16, 0x3fb8aa3b, v62
  ; GCN-NEXT:    ;;#ASMSTART
  ; GCN-NEXT:    s_waitcnt vmcnt(8)
  ; GCN-NEXT:    ;;#ASMEND
  ; GCN-NEXT:    v_cvt_f16_f32_e32 v17, v167
  ; GCN-NEXT:    v_fma_f32 v141, s4, v23, -v134
  ; GCN-NEXT:    ds_read_b128 v[20:23], v139
  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
  ; GCN-NEXT:    buffer_inv sc0 sc1
  ; GCN-NEXT:    ds_read_b128 v[36:39], v139 offset:576
  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
  ; GCN-NEXT:    buffer_inv sc0 sc1
  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[46:47], v[32:33], v[80:95]
  ; GCN-NEXT:    v_exp_f32_e32 v62, v16
  ; GCN-NEXT:    v_mul_f32_e32 v16, 0x3fb8aa3b, v131
  ; GCN-NEXT:    v_cvt_f16_f32_e32 v46, v130
  ; GCN-NEXT:    v_fma_f32 v47, s4, v25, -v134
  ; GCN-NEXT:    v_fma_f32 v131, s4, v26, -v134
  ; GCN-NEXT:    v_fma_f32 v149, s4, v4, -v134
  ; GCN-NEXT:    ; implicit-def: $sgpr0
  ; GCN-NEXT:    v_perm_b32 v4, v42, v40, s5
  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[18:19], v[32:33], v[96:111]
  ; GCN-NEXT:    v_cvt_f16_f32_e32 v18, v140
  ; GCN-NEXT:    v_exp_f32_e32 v145, v16
  ; GCN-NEXT:    v_cvt_f16_f32_e32 v16, v144
  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[112:127], v[34:35], v[32:33], v[112:127]
  ; GCN-NEXT:    v_pack_b32_f16 v33, v18, v16
  ; GCN-NEXT:    v_mul_f32_e32 v16, 0x3fb8aa3b, v141
  ; GCN-NEXT:    v_pack_b32_f16 v32, v17, v46
  ; GCN-NEXT:    v_exp_f32_e32 v35, v16
  ; GCN-NEXT:    ds_read_b128 v[16:19], v139 offset:1152
  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
  ; GCN-NEXT:    buffer_inv sc0 sc1
  ; GCN-NEXT:    v_fma_f32 v34, s4, v27, -v134
  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[20:21], v[32:33], v[64:79]
  ; GCN-NEXT:    v_mul_f32_e32 v20, 0x3fb8aa3b, v24
  ; GCN-NEXT:    ds_read_b128 v[24:27], v139 offset:1728
  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
  ; GCN-NEXT:    buffer_inv sc0 sc1
  ; GCN-NEXT:    v_exp_f32_e32 v46, v20
  ; GCN-NEXT:    v_mul_f32_e32 v20, 0x3fb8aa3b, v47
  ; GCN-NEXT:    v_cvt_f16_f32_e32 v21, v132
  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[36:37], v[32:33], v[80:95]
  ; GCN-NEXT:    v_exp_f32_e32 v47, v20
  ; GCN-NEXT:    v_cvt_f16_f32_e32 v36, v62
  ; GCN-NEXT:    v_mul_f32_e32 v20, 0x3fb8aa3b, v34
  ; GCN-NEXT:    v_fma_f32 v37, s4, v29, -v134
  ; GCN-NEXT:    v_cvt_f16_f32_e32 v34, v46
  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[16:17], v[32:33], v[96:111]
  ; GCN-NEXT:    v_mul_f32_e32 v16, 0x3fb8aa3b, v131
  ; GCN-NEXT:    v_cvt_f16_f32_e32 v17, v145
  ; GCN-NEXT:    v_exp_f32_e32 v141, v16
  ; GCN-NEXT:    v_cvt_f16_f32_e32 v16, v35
  ; GCN-NEXT:    v_fma_f32 v131, s4, v30, -v134
  ; GCN-NEXT:    v_pack_b32_f16 v17, v17, v16
  ; GCN-NEXT:    v_pack_b32_f16 v16, v21, v36
  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[112:127], v[24:25], v[32:33], v[112:127]
  ; GCN-NEXT:    v_exp_f32_e32 v33, v20
  ; GCN-NEXT:    v_mul_f32_e32 v24, 0x3fb8aa3b, v28
  ; GCN-NEXT:    v_fma_f32 v32, s4, v31, -v134
  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[22:23], v[16:17], v[64:79]
  ; GCN-NEXT:    ds_read_b128 v[20:23], v57
  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
  ; GCN-NEXT:    buffer_inv sc0 sc1
  ; GCN-NEXT:    v_exp_f32_e32 v36, v24
  ; GCN-NEXT:    v_mul_f32_e32 v24, 0x3fb8aa3b, v37
  ; GCN-NEXT:    v_cvt_f16_f32_e32 v37, v47
  ; GCN-NEXT:    ds_read_b128 v[28:31], v57 offset:576
  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
  ; GCN-NEXT:    buffer_inv sc0 sc1
  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[38:39], v[16:17], v[80:95]
  ; GCN-NEXT:    v_fma_f32 v38, s4, v1, -v134
  ; GCN-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v131
  ; GCN-NEXT:    v_exp_f32_e32 v39, v24
  ; GCN-NEXT:    v_pack_b32_f16 v24, v34, v37
  ; GCN-NEXT:    v_fma_f32 v131, s4, v2, -v134
  ; GCN-NEXT:    v_cvt_f16_f32_e32 v37, v36
  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[18:19], v[16:17], v[96:111]
  ; GCN-NEXT:    v_cvt_f16_f32_e32 v18, v141
  ; GCN-NEXT:    v_exp_f32_e32 v148, v1
  ; GCN-NEXT:    v_cvt_f16_f32_e32 v1, v33
  ; GCN-NEXT:    v_pack_b32_f16 v25, v18, v1
  ; GCN-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v32
  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[112:127], v[26:27], v[16:17], v[112:127]
  ; GCN-NEXT:    v_fma_f32 v32, s4, v3, -v134
  ; GCN-NEXT:    v_exp_f32_e32 v34, v1
  ; GCN-NEXT:    v_perm_b32 v26, v43, v41, s8
  ; GCN-NEXT:    v_perm_b32 v27, v61, v45, s8
  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[20:21], v[24:25], v[64:79]
  ; GCN-NEXT:    v_mul_f32_e32 v20, 0x3fb8aa3b, v0
  ; GCN-NEXT:    ds_read_b128 v[0:3], v57 offset:1152
  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
  ; GCN-NEXT:    buffer_inv sc0 sc1
  ; GCN-NEXT:    ds_read_b128 v[16:19], v57 offset:1728
  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
  ; GCN-NEXT:    buffer_inv sc0 sc1
  ; GCN-NEXT:    v_mul_f32_e32 v21, 0x3fb8aa3b, v38
  ; GCN-NEXT:    v_exp_f32_e32 v150, v20
  ; GCN-NEXT:    v_perm_b32 v20, v42, v40, s8
  ; GCN-NEXT:    v_cvt_f16_f32_e32 v40, v148
  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[28:29], v[24:25], v[80:95]
  ; GCN-NEXT:    v_exp_f32_e32 v38, v21
  ; GCN-NEXT:    v_cvt_f16_f32_e32 v28, v39
  ; GCN-NEXT:    v_fma_f32 v29, s4, v5, -v134
  ; GCN-NEXT:    v_perm_b32 v5, v60, v44, s5
  ; GCN-NEXT:    v_perm_b32 v21, v60, v44, s8
  ; GCN-NEXT:    ;;#ASMSTART
  ; GCN-NEXT:    s_waitcnt vmcnt(8)
  ; GCN-NEXT:    ;;#ASMEND
  ; GCN-NEXT:    buffer_wbl2 sc0 sc1
  ; GCN-NEXT:    ds_write_b64 v135, v[4:5]
  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[0:1], v[24:25], v[96:111]
  ; GCN-NEXT:    v_perm_b32 v0, v43, v41, s5
  ; GCN-NEXT:    v_fma_f32 v41, s4, v6, -v134
  ; GCN-NEXT:    v_cvt_f16_f32_e32 v6, v34
  ; GCN-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v131
  ; GCN-NEXT:    v_exp_f32_e32 v42, v1
  ; GCN-NEXT:    v_perm_b32 v1, v61, v45, s5
  ; GCN-NEXT:    buffer_wbl2 sc0 sc1
  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
  ; GCN-NEXT:    ds_write_b64 v136, v[20:21]
  ; GCN-NEXT:    buffer_wbl2 sc0 sc1
  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
  ; GCN-NEXT:    ds_write_b64 v137, v[0:1]
  ; GCN-NEXT:    buffer_wbl2 sc0 sc1
  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
  ; GCN-NEXT:    ds_write_b64 v138, v[26:27]
  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[112:127], v[16:17], v[24:25], v[112:127]
  ; GCN-NEXT:    v_pack_b32_f16 v17, v40, v6
  ; GCN-NEXT:    v_mul_f32_e32 v6, 0x3fb8aa3b, v32
  ; GCN-NEXT:    ;;#ASMSTART
  ; GCN-NEXT:    s_waitcnt vmcnt(8)
  ; GCN-NEXT:    ;;#ASMEND
  ; GCN-NEXT:    v_pack_b32_f16 v16, v37, v28
  ; GCN-NEXT:    v_fma_f32 v24, s4, v7, -v134
  ; GCN-NEXT:    v_exp_f32_e32 v25, v6
  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
  ; GCN-NEXT:    ds_read_b128 v[4:7], v139
  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
  ; GCN-NEXT:    buffer_inv sc0 sc1
  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[22:23], v[16:17], v[64:79]
  ; GCN-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v149
  ; GCN-NEXT:    v_exp_f32_e32 v26, v0
  ; GCN-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v29
  ; GCN-NEXT:    v_cvt_f16_f32_e32 v1, v150
  ; GCN-NEXT:    v_cvt_f16_f32_e32 v27, v38
  ; GCN-NEXT:    ds_read_b128 v[20:23], v139 offset:576
  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
  ; GCN-NEXT:    buffer_inv sc0 sc1
  ; GCN-NEXT:    v_fma_f32 v28, s4, v9, -v134
  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[30:31], v[16:17], v[80:95]
  ; GCN-NEXT:    v_exp_f32_e32 v29, v0
  ; GCN-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v41
  ; GCN-NEXT:    v_fma_f32 v30, s4, v10, -v134
  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[2:3], v[16:17], v[96:111]
  ; GCN-NEXT:    v_cvt_f16_f32_e32 v2, v42
  ; GCN-NEXT:    v_exp_f32_e32 v31, v0
  ; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v25
  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[112:127], v[18:19], v[16:17], v[112:127]
  ; GCN-NEXT:    v_pack_b32_f16 v17, v2, v0
  ; GCN-NEXT:    v_pack_b32_f16 v16, v1, v27
  ; GCN-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v24
  ; GCN-NEXT:    v_fma_f32 v18, s4, v11, -v134
  ; GCN-NEXT:    v_exp_f32_e32 v19, v0
  ; GCN-NEXT:    ds_read_b128 v[0:3], v139 offset:1152
  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
  ; GCN-NEXT:    buffer_inv sc0 sc1
  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[4:5], v[16:17], v[64:79]
  ; GCN-NEXT:    v_mul_f32_e32 v4, 0x3fb8aa3b, v8
  ; GCN-NEXT:    ds_read_b128 v[8:11], v139 offset:1728
  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
  ; GCN-NEXT:    buffer_inv sc0 sc1
  ; GCN-NEXT:    v_exp_f32_e32 v24, v4
  ; GCN-NEXT:    v_mul_f32_e32 v4, 0x3fb8aa3b, v28
  ; GCN-NEXT:    v_cvt_f16_f32_e32 v5, v26
  ; GCN-NEXT:    v_exp_f32_e32 v27, v4
  ; GCN-NEXT:    v_mul_f32_e32 v4, 0x3fb8aa3b, v18
  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[20:21], v[16:17], v[80:95]
  ; GCN-NEXT:    v_cvt_f16_f32_e32 v20, v29
  ; GCN-NEXT:    v_fma_f32 v21, s4, v13, -v134
  ; GCN-NEXT:    v_fma_f32 v28, s4, v14, -v134
  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[0:1], v[16:17], v[96:111]
  ; GCN-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v30
  ; GCN-NEXT:    v_cvt_f16_f32_e32 v1, v31
  ; GCN-NEXT:    v_exp_f32_e32 v30, v0
  ; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v19
  ; GCN-NEXT:    v_pack_b32_f16 v1, v1, v0
  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[112:127], v[8:9], v[16:17], v[112:127]
  ; GCN-NEXT:    v_exp_f32_e32 v16, v4
  ; GCN-NEXT:    v_pack_b32_f16 v0, v5, v20
  ; GCN-NEXT:    v_mul_f32_e32 v9, 0x3fb8aa3b, v12
  ; GCN-NEXT:    v_exp_f32_e32 v18, v9
  ; GCN-NEXT:    v_mul_f32_e32 v9, 0x3fb8aa3b, v21
  ; GCN-NEXT:    v_exp_f32_e32 v21, v9
  ; GCN-NEXT:    v_fma_f32 v8, s4, v15, -v134
  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[6:7], v[0:1], v[64:79]
  ; GCN-NEXT:    ds_read_b128 v[4:7], v57
  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
  ; GCN-NEXT:    buffer_inv sc0 sc1
  ; GCN-NEXT:    ds_read_b128 v[12:15], v57 offset:576
  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
  ; GCN-NEXT:    buffer_inv sc0 sc1
  ; GCN-NEXT:    v_cvt_f16_f32_e32 v17, v24
  ; GCN-NEXT:    v_cvt_f16_f32_e32 v20, v27
  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[22:23], v[0:1], v[80:95]
  ; GCN-NEXT:    v_cvt_f16_f32_e32 v22, v21
  ; GCN-NEXT:    v_cvt_f16_f32_e32 v23, v18
  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[2:3], v[0:1], v[96:111]
  ; GCN-NEXT:    v_cvt_f16_f32_e32 v3, v30
  ; GCN-NEXT:    v_mul_f32_e32 v2, 0x3fb8aa3b, v28
  ; GCN-NEXT:    v_exp_f32_e32 v2, v2
  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[112:127], v[10:11], v[0:1], v[112:127]
  ; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v16
  ; GCN-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v8
  ; GCN-NEXT:    v_exp_f32_e32 v10, v1
  ; GCN-NEXT:    v_pack_b32_f16 v8, v17, v20
  ; GCN-NEXT:    v_pack_b32_f16 v9, v3, v0
  ; GCN-NEXT:    v_add_f32_e32 v3, 0, v49
  ; GCN-NEXT:    v_add_f32_e32 v3, v50, v3
  ; GCN-NEXT:    v_add_f32_e32 v3, v51, v3
  ; GCN-NEXT:    v_add_f32_e32 v3, v52, v3
  ; GCN-NEXT:    v_add_f32_e32 v3, v53, v3
  ; GCN-NEXT:    v_add_f32_e32 v3, v54, v3
  ; GCN-NEXT:    v_add_f32_e32 v3, v55, v3
  ; GCN-NEXT:    v_add_f32_e32 v3, v56, v3
  ; GCN-NEXT:    v_add_f32_e32 v3, v58, v3
  ; GCN-NEXT:    v_add_f32_e32 v3, v163, v3
  ; GCN-NEXT:    v_add_f32_e32 v3, v164, v3
  ; GCN-NEXT:    v_add_f32_e32 v3, v59, v3
  ; GCN-NEXT:    v_add_f32_e32 v3, v160, v3
  ; GCN-NEXT:    v_add_f32_e32 v3, v162, v3
  ; GCN-NEXT:    v_add_f32_e32 v3, v151, v3
  ; GCN-NEXT:    v_add_f32_e32 v3, v153, v3
  ; GCN-NEXT:    v_add_f32_e32 v3, v165, v3
  ; GCN-NEXT:    v_add_f32_e32 v3, v161, v3
  ; GCN-NEXT:    v_add_f32_e32 v3, v159, v3
  ; GCN-NEXT:    v_add_f32_e32 v3, v152, v3
  ; GCN-NEXT:    v_add_f32_e32 v3, v154, v3
  ; GCN-NEXT:    v_add_f32_e32 v3, v155, v3
  ; GCN-NEXT:    v_add_f32_e32 v3, v157, v3
  ; GCN-NEXT:    v_add_f32_e32 v3, v146, v3
  ; GCN-NEXT:    v_add_f32_e32 v3, v147, v3
  ; GCN-NEXT:    v_add_f32_e32 v3, v143, v3
  ; GCN-NEXT:    v_add_f32_e32 v3, v156, v3
  ; GCN-NEXT:    v_add_f32_e32 v3, v129, v3
  ; GCN-NEXT:    v_add_f32_e32 v3, v142, v3
  ; GCN-NEXT:    v_add_f32_e32 v3, v63, v3
  ; GCN-NEXT:    v_add_f32_e32 v3, v158, v3
  ; GCN-NEXT:    v_add_f32_e32 v3, v128, v3
  ; GCN-NEXT:    v_add_f32_e32 v3, v167, v3
  ; GCN-NEXT:    v_add_f32_e32 v3, v130, v3
  ; GCN-NEXT:    v_add_f32_e32 v3, v140, v3
  ; GCN-NEXT:    v_add_f32_e32 v3, v144, v3
  ; GCN-NEXT:    v_add_f32_e32 v3, v132, v3
  ; GCN-NEXT:    v_add_f32_e32 v3, v62, v3
  ; GCN-NEXT:    v_add_f32_e32 v3, v145, v3
  ; GCN-NEXT:    v_add_f32_e32 v3, v35, v3
  ; GCN-NEXT:    v_add_f32_e32 v3, v46, v3
  ; GCN-NEXT:    v_add_f32_e32 v3, v47, v3
  ; GCN-NEXT:    v_add_f32_e32 v3, v141, v3
  ; GCN-NEXT:    v_add_f32_e32 v3, v33, v3
  ; GCN-NEXT:    v_add_f32_e32 v3, v36, v3
  ; GCN-NEXT:    v_add_f32_e32 v3, v39, v3
  ; GCN-NEXT:    v_add_f32_e32 v3, v148, v3
  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[12:13], v[8:9], v[80:95]
  ; GCN-NEXT:    v_add_f32_e32 v3, v34, v3
  ; GCN-NEXT:    v_add_f32_e32 v3, v150, v3
  ; GCN-NEXT:    v_cvt_f16_f32_e32 v1, v10
  ; GCN-NEXT:    v_cvt_f16_f32_e32 v11, v2
  ; GCN-NEXT:    v_add_f32_e32 v3, v38, v3
  ; GCN-NEXT:    v_add_f32_e32 v3, v42, v3
  ; GCN-NEXT:    v_add_f32_e32 v3, v25, v3
  ; GCN-NEXT:    v_add_f32_e32 v3, v26, v3
  ; GCN-NEXT:    v_pack_b32_f16 v1, v11, v1
  ; GCN-NEXT:    v_pack_b32_f16 v0, v23, v22
  ; GCN-NEXT:    v_add_f32_e32 v3, v29, v3
  ; GCN-NEXT:    v_add_f32_e32 v3, v31, v3
  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[14:15], v[0:1], v[80:95]
  ; GCN-NEXT:    v_add_f32_e32 v3, v19, v3
  ; GCN-NEXT:    v_add_f32_e32 v3, v24, v3
  ; GCN-NEXT:    v_add_f32_e32 v3, v27, v3
  ; GCN-NEXT:    v_add_f32_e32 v3, v30, v3
  ; GCN-NEXT:    v_add_f32_e32 v3, v16, v3
  ; GCN-NEXT:    v_add_f32_e32 v3, v18, v3
  ; GCN-NEXT:    v_add_f32_e32 v3, v21, v3
  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[4:5], v[8:9], v[64:79]
  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[6:7], v[0:1], v[64:79]
  ; GCN-NEXT:    v_add_f32_e32 v0, v2, v3
  ; GCN-NEXT:    v_add_f32_e32 v4, v10, v0
  ; GCN-NEXT:    ds_bpermute_b32 v5, v133, v4
  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
  ; GCN-NEXT:    ds_read_b128 v[0:3], v57 offset:1152
  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
  ; GCN-NEXT:    buffer_inv sc0 sc1
  ; GCN-NEXT:    v_add_f32_e32 v2, v4, v5
  ; GCN-NEXT:    ds_bpermute_b32 v3, v133, v2
  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[0:1], v[8:9], v[96:111]
  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
  ; GCN-NEXT:    v_cndmask_b32_e64 v0, v3, v2, s[6:7]
  ; GCN-NEXT:    ; implicit-def: $vgpr4
  ; GCN-NEXT:    v_fmac_f32_e32 v0, v4, v48
  ; GCN-NEXT:    ds_read_b128 v[0:3], v57 offset:1728
  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
  ; GCN-NEXT:    buffer_inv sc0 sc1
  ; GCN-NEXT:    ;;#ASMSTART
  ; GCN-NEXT:    s_waitcnt vmcnt(8)
  ; GCN-NEXT:    ;;#ASMEND
  ; GCN-NEXT:    s_endpgm
  attributes #0 = {"amdgpu-flat-work-group-size"="256,256"}
  !0 = !{i64 2862105}

...

---
name:            largeInterleave
tracksRegLiveness: true
machineFunctionInfo:
  stackPtrOffsetReg: '$sgpr32'
  occupancy:       7
body:             |
  bb.0:
    liveins: $vgpr0, $sgpr0_sgpr1, $sgpr2, $sgpr3, $sgpr4
    %11:vgpr_32 = IMPLICIT_DEF
    %1:sgpr_512 = IMPLICIT_DEF
    %16:vgpr_32 = IMPLICIT_DEF
    %443:sgpr_128 = IMPLICIT_DEF
    %18:sreg_32 = IMPLICIT_DEF
    %25:vgpr_32 = IMPLICIT_DEF
    %23:vgpr_32 = IMPLICIT_DEF
    %391:vreg_128_align2 = IMPLICIT_DEF
    %24:vgpr_32 = IMPLICIT_DEF
    %392:vreg_128_align2 = IMPLICIT_DEF
    %401:vreg_128_align2 = IMPLICIT_DEF
    %406:vreg_128_align2 = IMPLICIT_DEF
    %48:vgpr_32 = IMPLICIT_DEF
    %473:sgpr_128 = IMPLICIT_DEF
    %411:vreg_128_align2 = IMPLICIT_DEF
    %416:vreg_128_align2 = IMPLICIT_DEF
    %421:vreg_128_align2 = IMPLICIT_DEF
    %426:vreg_128_align2 = IMPLICIT_DEF
    %1114:sgpr_32 = IMPLICIT_DEF
    %39:vgpr_32 = IMPLICIT_DEF
    %484:sreg_64_xexec = IMPLICIT_DEF
    %3346:vgpr_32 = IMPLICIT_DEF
    %1422:sreg_32 = IMPLICIT_DEF
    %1424:sreg_32 = IMPLICIT_DEF
    %15:vgpr_32 = IMPLICIT_DEF
    %494:sreg_32 = IMPLICIT_DEF
    %47:vgpr_32 = IMPLICIT_DEF
    %41:vgpr_32 = IMPLICIT_DEF
    %42:vgpr_32 = IMPLICIT_DEF
    %43:vgpr_32 = IMPLICIT_DEF
    %44:vgpr_32 = IMPLICIT_DEF
    %45:vgpr_32 = IMPLICIT_DEF
    %50:sreg_32 = IMPLICIT_DEF
    %3347:vgpr_32 = IMPLICIT_DEF
    %3329:vgpr_32 = IMPLICIT_DEF
    %3330:vgpr_32 = IMPLICIT_DEF
    %3331:vgpr_32 = IMPLICIT_DEF
    %3332:vgpr_32 = IMPLICIT_DEF
    %3333:vgpr_32 = IMPLICIT_DEF
    %2986:vreg_512_align2 = IMPLICIT_DEF
    %3038:vreg_512_align2 = IMPLICIT_DEF
    %2980:vreg_512_align2 = IMPLICIT_DEF
    %3003:vreg_512_align2 = IMPLICIT_DEF
    %3334:vgpr_32 = IMPLICIT_DEF
    %3335:vgpr_32 = IMPLICIT_DEF
    %3336:vgpr_32 = IMPLICIT_DEF
    %3337:vgpr_32 = IMPLICIT_DEF
    %3338:vgpr_32 = IMPLICIT_DEF
    %3339:vgpr_32 = IMPLICIT_DEF
    %3345:vgpr_32 = IMPLICIT_DEF
    %3340:vgpr_32 = IMPLICIT_DEF
    %3341:vgpr_32 = IMPLICIT_DEF
    %3342:vgpr_32 = IMPLICIT_DEF
    %3343:vgpr_32 = IMPLICIT_DEF
    %3344:vgpr_32 = IMPLICIT_DEF
    %84:vgpr_32 = COPY %3347
    %86:vgpr_32 = COPY %3347:vgpr_32
    IGLP_OPT 2
    %593:sreg_32 = V_READFIRSTLANE_B32 %11:vgpr_32, implicit $exec
    %595:vgpr_32 = V_LSHL_ADD_U32_e64 %593:sreg_32, 4, %3329:vgpr_32, implicit $exec
    %597:vgpr_32 = nsw V_MUL_LO_U32_e64 %595:vgpr_32, %1.sub6:sgpr_512, implicit $exec
    %599:vgpr_32 = V_ADD_LSHL_U32_e64 %597:vgpr_32, %16:vgpr_32, 1, implicit $exec
    %601:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %599:vgpr_32, %443:sgpr_128, 0, 0, 0, 0, implicit $exec
    %602:vgpr_32 = V_ADD_U32_e32 %18:sreg_32, %599:vgpr_32, implicit $exec
    %603:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %602:vgpr_32, %443:sgpr_128, 0, 0, 0, 0, implicit $exec
    %605:sreg_32 = S_LSHL_B32 %593:sreg_32, 7, implicit-def dead $scc
    %606:vgpr_32 = V_ADD_LSHL_U32_e64 %25:vgpr_32, %605:sreg_32, 1, implicit $exec
    DS_WRITE_B128_gfx9 %606:vgpr_32, %601:vreg_128_align2, 0, 0, implicit $exec
    DS_WRITE_B128_gfx9 %606:vgpr_32, %603:vreg_128_align2, 1024, 0, implicit $exec
    %608:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %599:vgpr_32, %443:sgpr_128, 0, 64, 0, 0, implicit $exec
    %610:vgpr_32 = V_ADD_U32_e32 64, %602:vgpr_32, implicit $exec
    %611:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %610:vgpr_32, %443:sgpr_128, 0, 0, 0, 0, implicit $exec
    INLINEASM &"s_waitcnt vmcnt($0)", 57 /* sideeffect mayload maystore isconvergent attdialect */, 13 /* imm */, 8, !0
    %612:vreg_128_align2 = DS_READ_B128_gfx9 %23:vgpr_32, 0, 0, implicit $exec
    early-clobber %668:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_vgprcd_e64 %612.sub0_sub1:vreg_128_align2, %391.sub0_sub1:vreg_128_align2, 0, 0, 0, 0, implicit $mode, implicit $exec
    %668:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %612.sub2_sub3:vreg_128_align2, %391.sub2_sub3:vreg_128_align2, %668:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
    %626:vreg_128_align2 = DS_READ_B128_gfx9 %23:vgpr_32, 512, 0, implicit $exec
    early-clobber %679:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_vgprcd_e64 %626.sub0_sub1:vreg_128_align2, %391.sub0_sub1:vreg_128_align2, 0, 0, 0, 0, implicit $mode, implicit $exec
    %679:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %626.sub2_sub3:vreg_128_align2, %391.sub2_sub3:vreg_128_align2, %679:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
    %638:vreg_128_align2 = DS_READ_B128_gfx9 %23:vgpr_32, 1024, 0, implicit $exec
    early-clobber %690:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_vgprcd_e64 %638.sub0_sub1:vreg_128_align2, %391.sub0_sub1:vreg_128_align2, 0, 0, 0, 0, implicit $mode, implicit $exec
    %690:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %638.sub2_sub3:vreg_128_align2, %391.sub2_sub3:vreg_128_align2, %690:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
    %650:vreg_128_align2 = DS_READ_B128_gfx9 %23:vgpr_32, 1536, 0, implicit $exec
    early-clobber %701:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_vgprcd_e64 %650.sub0_sub1:vreg_128_align2, %391.sub0_sub1:vreg_128_align2, 0, 0, 0, 0, implicit $mode, implicit $exec
    %701:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %650.sub2_sub3:vreg_128_align2, %391.sub2_sub3:vreg_128_align2, %701:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
    %662:vreg_128_align2 = DS_READ_B128_gfx9 %24:vgpr_32, 0, 0, implicit $exec
    %668:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %662.sub0_sub1:vreg_128_align2, %392.sub0_sub1:vreg_128_align2, %668:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
    %668:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %662.sub2_sub3:vreg_128_align2, %392.sub2_sub3:vreg_128_align2, %668:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
    %673:vreg_128_align2 = DS_READ_B128_gfx9 %24:vgpr_32, 512, 0, implicit $exec
    %679:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %673.sub0_sub1:vreg_128_align2, %392.sub0_sub1:vreg_128_align2, %679:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
    %679:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %673.sub2_sub3:vreg_128_align2, %392.sub2_sub3:vreg_128_align2, %679:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
    %684:vreg_128_align2 = DS_READ_B128_gfx9 %24:vgpr_32, 1024, 0, implicit $exec
    %690:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %684.sub0_sub1:vreg_128_align2, %392.sub0_sub1:vreg_128_align2, %690:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
    %690:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %684.sub2_sub3:vreg_128_align2, %392.sub2_sub3:vreg_128_align2, %690:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
    %695:vreg_128_align2 = DS_READ_B128_gfx9 %24:vgpr_32, 1536, 0, implicit $exec
    %701:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %695.sub0_sub1:vreg_128_align2, %392.sub0_sub1:vreg_128_align2, %701:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
    %701:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %695.sub2_sub3:vreg_128_align2, %392.sub2_sub3:vreg_128_align2, %701:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
    INLINEASM &"s_waitcnt vmcnt($0)", 57 /* sideeffect mayload maystore isconvergent attdialect */, 13 /* imm */, 8, !0
    DS_WRITE_B128_gfx9 %606:vgpr_32, %608:vreg_128_align2, 0, 0, implicit $exec
    DS_WRITE_B128_gfx9 %606:vgpr_32, %611:vreg_128_align2, 1024, 0, implicit $exec
    %706:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %599:vgpr_32, %443:sgpr_128, 0, 128, 0, 0, implicit $exec
    %708:vgpr_32 = V_ADD_U32_e32 128, %602:vgpr_32, implicit $exec
    %709:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %708:vgpr_32, %443:sgpr_128, 0, 0, 0, 0, implicit $exec
    INLINEASM &"s_waitcnt vmcnt($0)", 57 /* sideeffect mayload maystore isconvergent attdialect */, 13 /* imm */, 8, !0
    %710:vreg_128_align2 = DS_READ_B128_gfx9 %23:vgpr_32, 0, 0, implicit $exec
    %668:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %710.sub0_sub1:vreg_128_align2, %401.sub0_sub1:vreg_128_align2, %668:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
    %668:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %710.sub2_sub3:vreg_128_align2, %401.sub2_sub3:vreg_128_align2, %668:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
    %721:vreg_128_align2 = DS_READ_B128_gfx9 %23:vgpr_32, 512, 0, implicit $exec
    %679:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %721.sub0_sub1:vreg_128_align2, %401.sub0_sub1:vreg_128_align2, %679:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
    %679:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %721.sub2_sub3:vreg_128_align2, %401.sub2_sub3:vreg_128_align2, %679:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
    %732:vreg_128_align2 = DS_READ_B128_gfx9 %23:vgpr_32, 1024, 0, implicit $exec
    %690:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %732.sub0_sub1:vreg_128_align2, %401.sub0_sub1:vreg_128_align2, %690:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
    %690:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %732.sub2_sub3:vreg_128_align2, %401.sub2_sub3:vreg_128_align2, %690:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
    %743:vreg_128_align2 = DS_READ_B128_gfx9 %23:vgpr_32, 1536, 0, implicit $exec
    %701:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %743.sub0_sub1:vreg_128_align2, %401.sub0_sub1:vreg_128_align2, %701:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
    %701:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %743.sub2_sub3:vreg_128_align2, %401.sub2_sub3:vreg_128_align2, %701:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
    %754:vreg_128_align2 = DS_READ_B128_gfx9 %24:vgpr_32, 0, 0, implicit $exec
    %668:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %754.sub0_sub1:vreg_128_align2, %406.sub0_sub1:vreg_128_align2, %668:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
    %668:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %754.sub2_sub3:vreg_128_align2, %406.sub2_sub3:vreg_128_align2, %668:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
    %765:vreg_128_align2 = DS_READ_B128_gfx9 %24:vgpr_32, 512, 0, implicit $exec
    %679:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %765.sub0_sub1:vreg_128_align2, %406.sub0_sub1:vreg_128_align2, %679:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
    %679:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %765.sub2_sub3:vreg_128_align2, %406.sub2_sub3:vreg_128_align2, %679:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
    %776:vreg_128_align2 = DS_READ_B128_gfx9 %24:vgpr_32, 1024, 0, implicit $exec
    %690:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %776.sub0_sub1:vreg_128_align2, %406.sub0_sub1:vreg_128_align2, %690:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
    %690:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %776.sub2_sub3:vreg_128_align2, %406.sub2_sub3:vreg_128_align2, %690:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
    %787:vreg_128_align2 = DS_READ_B128_gfx9 %24:vgpr_32, 1536, 0, implicit $exec
    %701:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %787.sub0_sub1:vreg_128_align2, %406.sub0_sub1:vreg_128_align2, %701:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
    %701:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %787.sub2_sub3:vreg_128_align2, %406.sub2_sub3:vreg_128_align2, %701:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
    INLINEASM &"s_waitcnt vmcnt($0)", 57 /* sideeffect mayload maystore isconvergent attdialect */, 13 /* imm */, 8, !0
    DS_WRITE_B128_gfx9 %606:vgpr_32, %706:vreg_128_align2, 0, 0, implicit $exec
    DS_WRITE_B128_gfx9 %606:vgpr_32, %709:vreg_128_align2, 1024, 0, implicit $exec
    %798:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %599:vgpr_32, %443:sgpr_128, 0, 192, 0, 0, implicit $exec
    %800:vgpr_32 = V_ADD_U32_e32 192, %602:vgpr_32, implicit $exec
    %801:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %800:vgpr_32, %443:sgpr_128, 0, 0, 0, 0, implicit $exec
    %802:vgpr_32 = V_ADD_U32_e32 %48:vgpr_32, %3330:vgpr_32, implicit $exec
    %803:vreg_64_align2 = BUFFER_LOAD_DWORDX2_OFFEN %802:vgpr_32, %473:sgpr_128, 0, 0, 0, 0, implicit $exec
    %804:vgpr_32 = V_ADD_U32_e32 %48:vgpr_32, %3331:vgpr_32, implicit $exec
    %805:vreg_64_align2 = BUFFER_LOAD_DWORDX2_OFFEN %804:vgpr_32, %473:sgpr_128, 0, 0, 0, 0, implicit $exec
    %806:vgpr_32 = V_ADD_U32_e32 %48:vgpr_32, %3332:vgpr_32, implicit $exec
    %807:vreg_64_align2 = BUFFER_LOAD_DWORDX2_OFFEN %806:vgpr_32, %473:sgpr_128, 0, 0, 0, 0, implicit $exec
    %808:vgpr_32 = V_ADD_U32_e32 %48:vgpr_32, %3333:vgpr_32, implicit $exec
    %809:vreg_64_align2 = BUFFER_LOAD_DWORDX2_OFFEN %808:vgpr_32, %473:sgpr_128, 0, 0, 0, 0, implicit $exec
    INLINEASM &"s_waitcnt vmcnt($0)", 57 /* sideeffect mayload maystore isconvergent attdialect */, 13 /* imm */, 8, !0
    %810:vreg_128_align2 = DS_READ_B128_gfx9 %23:vgpr_32, 0, 0, implicit $exec
    %668:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %810.sub0_sub1:vreg_128_align2, %411.sub0_sub1:vreg_128_align2, %668:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
    %668:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %810.sub2_sub3:vreg_128_align2, %411.sub2_sub3:vreg_128_align2, %668:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
    %821:vreg_128_align2 = DS_READ_B128_gfx9 %23:vgpr_32, 512, 0, implicit $exec
    %679:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %821.sub0_sub1:vreg_128_align2, %411.sub0_sub1:vreg_128_align2, %679:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
    %679:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %821.sub2_sub3:vreg_128_align2, %411.sub2_sub3:vreg_128_align2, %679:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
    %832:vreg_128_align2 = DS_READ_B128_gfx9 %23:vgpr_32, 1024, 0, implicit $exec
    %690:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %832.sub0_sub1:vreg_128_align2, %411.sub0_sub1:vreg_128_align2, %690:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
    %690:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %832.sub2_sub3:vreg_128_align2, %411.sub2_sub3:vreg_128_align2, %690:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
    %843:vreg_128_align2 = DS_READ_B128_gfx9 %23:vgpr_32, 1536, 0, implicit $exec
    %701:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %843.sub0_sub1:vreg_128_align2, %411.sub0_sub1:vreg_128_align2, %701:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
    %701:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %843.sub2_sub3:vreg_128_align2, %411.sub2_sub3:vreg_128_align2, %701:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
    %854:vreg_128_align2 = DS_READ_B128_gfx9 %24:vgpr_32, 0, 0, implicit $exec
    %668:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %854.sub0_sub1:vreg_128_align2, %416.sub0_sub1:vreg_128_align2, %668:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
    %668:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %854.sub2_sub3:vreg_128_align2, %416.sub2_sub3:vreg_128_align2, %668:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
    %865:vreg_128_align2 = DS_READ_B128_gfx9 %24:vgpr_32, 512, 0, implicit $exec
    %679:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %865.sub0_sub1:vreg_128_align2, %416.sub0_sub1:vreg_128_align2, %679:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
    %679:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %865.sub2_sub3:vreg_128_align2, %416.sub2_sub3:vreg_128_align2, %679:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
    %876:vreg_128_align2 = DS_READ_B128_gfx9 %24:vgpr_32, 1024, 0, implicit $exec
    %690:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %876.sub0_sub1:vreg_128_align2, %416.sub0_sub1:vreg_128_align2, %690:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
    %690:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %876.sub2_sub3:vreg_128_align2, %416.sub2_sub3:vreg_128_align2, %690:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
    %887:vreg_128_align2 = DS_READ_B128_gfx9 %24:vgpr_32, 1536, 0, implicit $exec
    %701:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %887.sub0_sub1:vreg_128_align2, %416.sub0_sub1:vreg_128_align2, %701:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
    %701:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %887.sub2_sub3:vreg_128_align2, %416.sub2_sub3:vreg_128_align2, %701:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
    INLINEASM &"s_waitcnt vmcnt($0)", 57 /* sideeffect mayload maystore isconvergent attdialect */, 13 /* imm */, 8, !0
    DS_WRITE_B128_gfx9 %606:vgpr_32, %798:vreg_128_align2, 0, 0, implicit $exec
    DS_WRITE_B128_gfx9 %606:vgpr_32, %801:vreg_128_align2, 1024, 0, implicit $exec
    INLINEASM &"s_waitcnt vmcnt($0)", 57 /* sideeffect mayload maystore isconvergent attdialect */, 13 /* imm */, 8, !0
    %898:vreg_128_align2 = DS_READ_B128_gfx9 %23:vgpr_32, 0, 0, implicit $exec
    %668:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %898.sub0_sub1:vreg_128_align2, %421.sub0_sub1:vreg_128_align2, %668:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
    %668:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %898.sub2_sub3:vreg_128_align2, %421.sub2_sub3:vreg_128_align2, %668:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
    %909:vreg_128_align2 = DS_READ_B128_gfx9 %23:vgpr_32, 512, 0, implicit $exec
    %679:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %909.sub0_sub1:vreg_128_align2, %421.sub0_sub1:vreg_128_align2, %679:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
    %679:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %909.sub2_sub3:vreg_128_align2, %421.sub2_sub3:vreg_128_align2, %679:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
    %920:vreg_128_align2 = DS_READ_B128_gfx9 %23:vgpr_32, 1024, 0, implicit $exec
    %690:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %920.sub0_sub1:vreg_128_align2, %421.sub0_sub1:vreg_128_align2, %690:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
    %690:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %920.sub2_sub3:vreg_128_align2, %421.sub2_sub3:vreg_128_align2, %690:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
    %931:vreg_128_align2 = DS_READ_B128_gfx9 %23:vgpr_32, 1536, 0, implicit $exec
    %701:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %931.sub0_sub1:vreg_128_align2, %421.sub0_sub1:vreg_128_align2, %701:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
    %701:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %931.sub2_sub3:vreg_128_align2, %421.sub2_sub3:vreg_128_align2, %701:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
    %942:vreg_128_align2 = DS_READ_B128_gfx9 %24:vgpr_32, 0, 0, implicit $exec
    %668:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %942.sub0_sub1:vreg_128_align2, %426.sub0_sub1:vreg_128_align2, %668:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
    %668:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %942.sub2_sub3:vreg_128_align2, %426.sub2_sub3:vreg_128_align2, %668:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
    %969:vreg_128_align2 = DS_READ_B128_gfx9 %24:vgpr_32, 512, 0, implicit $exec
    %679:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %969.sub0_sub1:vreg_128_align2, %426.sub0_sub1:vreg_128_align2, %679:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
    %679:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %969.sub2_sub3:vreg_128_align2, %426.sub2_sub3:vreg_128_align2, %679:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
    %996:vreg_128_align2 = DS_READ_B128_gfx9 %24:vgpr_32, 1024, 0, implicit $exec
    %690:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %996.sub0_sub1:vreg_128_align2, %426.sub0_sub1:vreg_128_align2, %690:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
    %690:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %996.sub2_sub3:vreg_128_align2, %426.sub2_sub3:vreg_128_align2, %690:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
    %1023:vreg_128_align2 = DS_READ_B128_gfx9 %24:vgpr_32, 1536, 0, implicit $exec
    %701:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1023.sub0_sub1:vreg_128_align2, %426.sub0_sub1:vreg_128_align2, %701:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
    %701:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1023.sub2_sub3:vreg_128_align2, %426.sub2_sub3:vreg_128_align2, %701:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
    %1050:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %668.sub0:vreg_512_align2, implicit $mode, implicit $exec
    %1051:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %668.sub1:vreg_512_align2, implicit $mode, implicit $exec
    %1052:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %668.sub2:vreg_512_align2, implicit $mode, implicit $exec
    %1053:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %668.sub3:vreg_512_align2, implicit $mode, implicit $exec
    %1054:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %668.sub4:vreg_512_align2, implicit $mode, implicit $exec
    %1055:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %668.sub5:vreg_512_align2, implicit $mode, implicit $exec
    %1056:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %668.sub6:vreg_512_align2, implicit $mode, implicit $exec
    %1057:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %668.sub7:vreg_512_align2, implicit $mode, implicit $exec
    %1058:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %668.sub8:vreg_512_align2, implicit $mode, implicit $exec
    %1059:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %668.sub9:vreg_512_align2, implicit $mode, implicit $exec
    %1060:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %668.sub10:vreg_512_align2, implicit $mode, implicit $exec
    %1061:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %668.sub11:vreg_512_align2, implicit $mode, implicit $exec
    %1062:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %668.sub12:vreg_512_align2, implicit $mode, implicit $exec
    %1063:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %668.sub13:vreg_512_align2, implicit $mode, implicit $exec
    %1064:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %668.sub14:vreg_512_align2, implicit $mode, implicit $exec
    %1065:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %668.sub15:vreg_512_align2, implicit $mode, implicit $exec
    %1066:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %679.sub0:vreg_512_align2, implicit $mode, implicit $exec
    %1067:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %679.sub1:vreg_512_align2, implicit $mode, implicit $exec
    %1068:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %679.sub2:vreg_512_align2, implicit $mode, implicit $exec
    %1069:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %679.sub3:vreg_512_align2, implicit $mode, implicit $exec
    %1070:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %679.sub4:vreg_512_align2, implicit $mode, implicit $exec
    %1071:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %679.sub5:vreg_512_align2, implicit $mode, implicit $exec
    %1072:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %679.sub6:vreg_512_align2, implicit $mode, implicit $exec
    %1073:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %679.sub7:vreg_512_align2, implicit $mode, implicit $exec
    %1074:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %679.sub8:vreg_512_align2, implicit $mode, implicit $exec
    %1075:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %679.sub9:vreg_512_align2, implicit $mode, implicit $exec
    %1076:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %679.sub10:vreg_512_align2, implicit $mode, implicit $exec
    %1077:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %679.sub11:vreg_512_align2, implicit $mode, implicit $exec
    %1078:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %679.sub12:vreg_512_align2, implicit $mode, implicit $exec
    %1079:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %679.sub13:vreg_512_align2, implicit $mode, implicit $exec
    %1080:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %679.sub14:vreg_512_align2, implicit $mode, implicit $exec
    %1081:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %679.sub15:vreg_512_align2, implicit $mode, implicit $exec
    %1082:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %690.sub0:vreg_512_align2, implicit $mode, implicit $exec
    %1083:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %690.sub1:vreg_512_align2, implicit $mode, implicit $exec
    %1084:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %690.sub2:vreg_512_align2, implicit $mode, implicit $exec
    %1085:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %690.sub3:vreg_512_align2, implicit $mode, implicit $exec
    %1086:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %690.sub4:vreg_512_align2, implicit $mode, implicit $exec
    %1087:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %690.sub5:vreg_512_align2, implicit $mode, implicit $exec
    %1088:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %690.sub6:vreg_512_align2, implicit $mode, implicit $exec
    %1089:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %690.sub7:vreg_512_align2, implicit $mode, implicit $exec
    %1090:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %690.sub8:vreg_512_align2, implicit $mode, implicit $exec
    %1091:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %690.sub9:vreg_512_align2, implicit $mode, implicit $exec
    %1092:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %690.sub10:vreg_512_align2, implicit $mode, implicit $exec
    %1093:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %690.sub11:vreg_512_align2, implicit $mode, implicit $exec
    %1094:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %690.sub12:vreg_512_align2, implicit $mode, implicit $exec
    %1095:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %690.sub13:vreg_512_align2, implicit $mode, implicit $exec
    %1096:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %690.sub14:vreg_512_align2, implicit $mode, implicit $exec
    %1097:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %690.sub15:vreg_512_align2, implicit $mode, implicit $exec
    %1098:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %701.sub0:vreg_512_align2, implicit $mode, implicit $exec
    %1099:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %701.sub1:vreg_512_align2, implicit $mode, implicit $exec
    %1100:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %701.sub2:vreg_512_align2, implicit $mode, implicit $exec
    %1101:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %701.sub3:vreg_512_align2, implicit $mode, implicit $exec
    %1102:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %701.sub4:vreg_512_align2, implicit $mode, implicit $exec
    %1103:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %701.sub5:vreg_512_align2, implicit $mode, implicit $exec
    %1104:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %701.sub6:vreg_512_align2, implicit $mode, implicit $exec
    %1105:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %701.sub7:vreg_512_align2, implicit $mode, implicit $exec
    %1106:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %701.sub8:vreg_512_align2, implicit $mode, implicit $exec
    %1107:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %701.sub9:vreg_512_align2, implicit $mode, implicit $exec
    %1108:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %701.sub10:vreg_512_align2, implicit $mode, implicit $exec
    %1109:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %701.sub11:vreg_512_align2, implicit $mode, implicit $exec
    %1110:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %701.sub12:vreg_512_align2, implicit $mode, implicit $exec
    %1111:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %701.sub13:vreg_512_align2, implicit $mode, implicit $exec
    %1112:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %701.sub14:vreg_512_align2, implicit $mode, implicit $exec
    %1113:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %701.sub15:vreg_512_align2, implicit $mode, implicit $exec
    %1115:vgpr_32 = V_MAX3_F32_e64 0, %1050:vgpr_32, 0, %1114:sgpr_32, 0, %1051:vgpr_32, 0, 0, implicit $mode, implicit $exec
    %1116:vgpr_32 = V_MAX3_F32_e64 0, %1115:vgpr_32, 0, %1052:vgpr_32, 0, %1053:vgpr_32, 0, 0, implicit $mode, implicit $exec
    %1117:vgpr_32 = V_MAX3_F32_e64 0, %1116:vgpr_32, 0, %1054:vgpr_32, 0, %1055:vgpr_32, 0, 0, implicit $mode, implicit $exec
    %1118:vgpr_32 = V_MAX3_F32_e64 0, %1117:vgpr_32, 0, %1056:vgpr_32, 0, %1057:vgpr_32, 0, 0, implicit $mode, implicit $exec
    %1119:vgpr_32 = V_MAX3_F32_e64 0, %1118:vgpr_32, 0, %1058:vgpr_32, 0, %1059:vgpr_32, 0, 0, implicit $mode, implicit $exec
    %1120:vgpr_32 = V_MAX3_F32_e64 0, %1119:vgpr_32, 0, %1060:vgpr_32, 0, %1061:vgpr_32, 0, 0, implicit $mode, implicit $exec
    %1121:vgpr_32 = V_MAX3_F32_e64 0, %1120:vgpr_32, 0, %1062:vgpr_32, 0, %1063:vgpr_32, 0, 0, implicit $mode, implicit $exec
    %1122:vgpr_32 = V_MAX3_F32_e64 0, %1121:vgpr_32, 0, %1064:vgpr_32, 0, %1065:vgpr_32, 0, 0, implicit $mode, implicit $exec
    %1123:vgpr_32 = V_MAX3_F32_e64 0, %1122:vgpr_32, 0, %1066:vgpr_32, 0, %1067:vgpr_32, 0, 0, implicit $mode, implicit $exec
    %1124:vgpr_32 = V_MAX3_F32_e64 0, %1123:vgpr_32, 0, %1068:vgpr_32, 0, %1069:vgpr_32, 0, 0, implicit $mode, implicit $exec
    %1125:vgpr_32 = V_MAX3_F32_e64 0, %1124:vgpr_32, 0, %1070:vgpr_32, 0, %1071:vgpr_32, 0, 0, implicit $mode, implicit $exec
    %1126:vgpr_32 = V_MAX3_F32_e64 0, %1125:vgpr_32, 0, %1072:vgpr_32, 0, %1073:vgpr_32, 0, 0, implicit $mode, implicit $exec
    %1127:vgpr_32 = V_MAX3_F32_e64 0, %1126:vgpr_32, 0, %1074:vgpr_32, 0, %1075:vgpr_32, 0, 0, implicit $mode, implicit $exec
    %1128:vgpr_32 = V_MAX3_F32_e64 0, %1127:vgpr_32, 0, %1076:vgpr_32, 0, %1077:vgpr_32, 0, 0, implicit $mode, implicit $exec
    %1129:vgpr_32 = V_MAX3_F32_e64 0, %1128:vgpr_32, 0, %1078:vgpr_32, 0, %1079:vgpr_32, 0, 0, implicit $mode, implicit $exec
    %1130:vgpr_32 = V_MAX3_F32_e64 0, %1129:vgpr_32, 0, %1080:vgpr_32, 0, %1081:vgpr_32, 0, 0, implicit $mode, implicit $exec
    %1131:vgpr_32 = V_MAX3_F32_e64 0, %1130:vgpr_32, 0, %1082:vgpr_32, 0, %1083:vgpr_32, 0, 0, implicit $mode, implicit $exec
    %1132:vgpr_32 = V_MAX3_F32_e64 0, %1131:vgpr_32, 0, %1084:vgpr_32, 0, %1085:vgpr_32, 0, 0, implicit $mode, implicit $exec
    %1133:vgpr_32 = V_MAX3_F32_e64 0, %1132:vgpr_32, 0, %1086:vgpr_32, 0, %1087:vgpr_32, 0, 0, implicit $mode, implicit $exec
    %1134:vgpr_32 = V_MAX3_F32_e64 0, %1133:vgpr_32, 0, %1088:vgpr_32, 0, %1089:vgpr_32, 0, 0, implicit $mode, implicit $exec
    %1135:vgpr_32 = V_MAX3_F32_e64 0, %1134:vgpr_32, 0, %1090:vgpr_32, 0, %1091:vgpr_32, 0, 0, implicit $mode, implicit $exec
    %1136:vgpr_32 = V_MAX3_F32_e64 0, %1135:vgpr_32, 0, %1092:vgpr_32, 0, %1093:vgpr_32, 0, 0, implicit $mode, implicit $exec
    %1137:vgpr_32 = V_MAX3_F32_e64 0, %1136:vgpr_32, 0, %1094:vgpr_32, 0, %1095:vgpr_32, 0, 0, implicit $mode, implicit $exec
    %1138:vgpr_32 = V_MAX3_F32_e64 0, %1137:vgpr_32, 0, %1096:vgpr_32, 0, %1097:vgpr_32, 0, 0, implicit $mode, implicit $exec
    %1139:vgpr_32 = V_MAX3_F32_e64 0, %1138:vgpr_32, 0, %1098:vgpr_32, 0, %1099:vgpr_32, 0, 0, implicit $mode, implicit $exec
    %1140:vgpr_32 = V_MAX3_F32_e64 0, %1139:vgpr_32, 0, %1100:vgpr_32, 0, %1101:vgpr_32, 0, 0, implicit $mode, implicit $exec
    %1141:vgpr_32 = V_MAX3_F32_e64 0, %1140:vgpr_32, 0, %1102:vgpr_32, 0, %1103:vgpr_32, 0, 0, implicit $mode, implicit $exec
    %1142:vgpr_32 = V_MAX3_F32_e64 0, %1141:vgpr_32, 0, %1104:vgpr_32, 0, %1105:vgpr_32, 0, 0, implicit $mode, implicit $exec
    %1143:vgpr_32 = V_MAX3_F32_e64 0, %1142:vgpr_32, 0, %1106:vgpr_32, 0, %1107:vgpr_32, 0, 0, implicit $mode, implicit $exec
    %1144:vgpr_32 = V_MAX3_F32_e64 0, %1143:vgpr_32, 0, %1108:vgpr_32, 0, %1109:vgpr_32, 0, 0, implicit $mode, implicit $exec
    %1145:vgpr_32 = V_MAX3_F32_e64 0, %1144:vgpr_32, 0, %1110:vgpr_32, 0, %1111:vgpr_32, 0, 0, implicit $mode, implicit $exec
    %1146:vgpr_32 = V_MAX3_F32_e64 0, %1145:vgpr_32, 0, %1112:vgpr_32, 0, %1113:vgpr_32, 0, 0, implicit $mode, implicit $exec
    %1147:vgpr_32 = DS_BPERMUTE_B32 %39:vgpr_32, %1146:vgpr_32, 0, implicit $exec
    %1148:vgpr_32 = contract nofpexcept V_MAX_F32_e32 %1147:vgpr_32, %1147:vgpr_32, implicit $mode, implicit $exec
    %1149:vgpr_32 = contract nofpexcept V_MAX_F32_e32 %1146:vgpr_32, %1148:vgpr_32, implicit $mode, implicit $exec
    %1150:vgpr_32 = DS_BPERMUTE_B32 %39:vgpr_32, %1149:vgpr_32, 0, implicit $exec
    %1151:vgpr_32 = V_CNDMASK_B32_e64 0, %1150:vgpr_32, 0, %1149:vgpr_32, %484:sreg_64_xexec, implicit $exec
    %1153:vgpr_32 = contract nofpexcept V_MAX_F32_e32 %1151:vgpr_32, %1151:vgpr_32, implicit $mode, implicit $exec
    %1154:vgpr_32 = contract nofpexcept V_MAX_F32_e32 %3346:vgpr_32, %3346:vgpr_32, implicit $mode, implicit $exec
    %151:vgpr_32 = contract nofpexcept V_MAX_F32_e32 %1154:vgpr_32, %1153:vgpr_32, implicit $mode, implicit $exec
    %1155:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %668.sub0:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
    %1157:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1155:vgpr_32, implicit $mode, implicit $exec
    %1158:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1157:vgpr_32, implicit $mode, implicit $exec
    %1159:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %668.sub1:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
    %1160:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1159:vgpr_32, implicit $mode, implicit $exec
    %1161:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1160:vgpr_32, implicit $mode, implicit $exec
    %1162:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %668.sub2:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
    %1163:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1162:vgpr_32, implicit $mode, implicit $exec
    %1164:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1163:vgpr_32, implicit $mode, implicit $exec
    %1165:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %668.sub3:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
    %1166:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1165:vgpr_32, implicit $mode, implicit $exec
    %1167:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1166:vgpr_32, implicit $mode, implicit $exec
    %1168:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %668.sub4:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
    %1169:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1168:vgpr_32, implicit $mode, implicit $exec
    %1170:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1169:vgpr_32, implicit $mode, implicit $exec
    %1171:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %668.sub5:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
    %1172:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1171:vgpr_32, implicit $mode, implicit $exec
    %1173:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1172:vgpr_32, implicit $mode, implicit $exec
    %1174:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %668.sub6:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
    %1175:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1174:vgpr_32, implicit $mode, implicit $exec
    %1176:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1175:vgpr_32, implicit $mode, implicit $exec
    %1177:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %668.sub7:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
    %1178:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1177:vgpr_32, implicit $mode, implicit $exec
    %1179:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1178:vgpr_32, implicit $mode, implicit $exec
    %1180:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %668.sub8:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
    %1181:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1180:vgpr_32, implicit $mode, implicit $exec
    %1182:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1181:vgpr_32, implicit $mode, implicit $exec
    %1183:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %668.sub9:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
    %1184:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1183:vgpr_32, implicit $mode, implicit $exec
    %1185:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1184:vgpr_32, implicit $mode, implicit $exec
    %1186:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %668.sub10:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
    %1187:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1186:vgpr_32, implicit $mode, implicit $exec
    %1188:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1187:vgpr_32, implicit $mode, implicit $exec
    %1189:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %668.sub11:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
    %1190:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1189:vgpr_32, implicit $mode, implicit $exec
    %1191:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1190:vgpr_32, implicit $mode, implicit $exec
    %1192:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %668.sub12:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
    %1193:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1192:vgpr_32, implicit $mode, implicit $exec
    %1194:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1193:vgpr_32, implicit $mode, implicit $exec
    %1195:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %668.sub13:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
    %1196:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1195:vgpr_32, implicit $mode, implicit $exec
    %1197:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1196:vgpr_32, implicit $mode, implicit $exec
    %1198:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %668.sub14:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
    %1199:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1198:vgpr_32, implicit $mode, implicit $exec
    %1200:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1199:vgpr_32, implicit $mode, implicit $exec
    %1201:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %668.sub15:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
    %1202:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1201:vgpr_32, implicit $mode, implicit $exec
    %1203:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1202:vgpr_32, implicit $mode, implicit $exec
    %1204:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %679.sub0:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
    %1205:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1204:vgpr_32, implicit $mode, implicit $exec
    %1206:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1205:vgpr_32, implicit $mode, implicit $exec
    %1207:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %679.sub1:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
    %1208:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1207:vgpr_32, implicit $mode, implicit $exec
    %1209:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1208:vgpr_32, implicit $mode, implicit $exec
    %1210:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %679.sub2:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
    %1211:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1210:vgpr_32, implicit $mode, implicit $exec
    %1212:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1211:vgpr_32, implicit $mode, implicit $exec
    %1213:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %679.sub3:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
    %1214:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1213:vgpr_32, implicit $mode, implicit $exec
    %1215:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1214:vgpr_32, implicit $mode, implicit $exec
    %1216:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %679.sub4:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
    %1217:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1216:vgpr_32, implicit $mode, implicit $exec
    %1218:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1217:vgpr_32, implicit $mode, implicit $exec
    %1219:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %679.sub5:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
    %1220:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1219:vgpr_32, implicit $mode, implicit $exec
    %1221:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1220:vgpr_32, implicit $mode, implicit $exec
    %1222:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %679.sub6:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
    %1223:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1222:vgpr_32, implicit $mode, implicit $exec
    %1224:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1223:vgpr_32, implicit $mode, implicit $exec
    %1225:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %679.sub7:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
    %1226:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1225:vgpr_32, implicit $mode, implicit $exec
    %1227:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1226:vgpr_32, implicit $mode, implicit $exec
    %1228:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %679.sub8:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
    %1229:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1228:vgpr_32, implicit $mode, implicit $exec
    %1230:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1229:vgpr_32, implicit $mode, implicit $exec
    %1231:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %679.sub9:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
    %1232:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1231:vgpr_32, implicit $mode, implicit $exec
    %1233:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1232:vgpr_32, implicit $mode, implicit $exec
    %1234:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %679.sub10:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
    %1235:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1234:vgpr_32, implicit $mode, implicit $exec
    %1236:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1235:vgpr_32, implicit $mode, implicit $exec
    %1237:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %679.sub11:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
    %1238:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1237:vgpr_32, implicit $mode, implicit $exec
    %1239:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1238:vgpr_32, implicit $mode, implicit $exec
    %1240:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %679.sub12:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
    %1241:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1240:vgpr_32, implicit $mode, implicit $exec
    %1242:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1241:vgpr_32, implicit $mode, implicit $exec
    %1243:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %679.sub13:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
    %1244:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1243:vgpr_32, implicit $mode, implicit $exec
    %1245:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1244:vgpr_32, implicit $mode, implicit $exec
    %1246:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %679.sub14:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
    %1247:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1246:vgpr_32, implicit $mode, implicit $exec
    %1248:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1247:vgpr_32, implicit $mode, implicit $exec
    %1249:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %679.sub15:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
    %1250:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1249:vgpr_32, implicit $mode, implicit $exec
    %1251:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1250:vgpr_32, implicit $mode, implicit $exec
    %1252:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %690.sub0:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
    %1253:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1252:vgpr_32, implicit $mode, implicit $exec
    %1254:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1253:vgpr_32, implicit $mode, implicit $exec
    %1255:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %690.sub1:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
    %1256:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1255:vgpr_32, implicit $mode, implicit $exec
    %1257:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1256:vgpr_32, implicit $mode, implicit $exec
    %1258:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %690.sub2:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
    %1259:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1258:vgpr_32, implicit $mode, implicit $exec
    %1260:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1259:vgpr_32, implicit $mode, implicit $exec
    %1261:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %690.sub3:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
    %1262:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1261:vgpr_32, implicit $mode, implicit $exec
    %1263:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1262:vgpr_32, implicit $mode, implicit $exec
    %1264:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %690.sub4:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
    %1265:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1264:vgpr_32, implicit $mode, implicit $exec
    %1266:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1265:vgpr_32, implicit $mode, implicit $exec
    %1267:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %690.sub5:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
    %1268:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1267:vgpr_32, implicit $mode, implicit $exec
    %1269:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1268:vgpr_32, implicit $mode, implicit $exec
    %1270:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %690.sub6:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
    %1271:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1270:vgpr_32, implicit $mode, implicit $exec
    %1272:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1271:vgpr_32, implicit $mode, implicit $exec
    %1273:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %690.sub7:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
    %1274:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1273:vgpr_32, implicit $mode, implicit $exec
    %1275:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1274:vgpr_32, implicit $mode, implicit $exec
    %1276:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %690.sub8:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
    %1277:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1276:vgpr_32, implicit $mode, implicit $exec
    %1278:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1277:vgpr_32, implicit $mode, implicit $exec
    %1279:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %690.sub9:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
    %1280:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1279:vgpr_32, implicit $mode, implicit $exec
    %1281:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1280:vgpr_32, implicit $mode, implicit $exec
    %1282:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %690.sub10:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
    %1283:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1282:vgpr_32, implicit $mode, implicit $exec
    %1284:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1283:vgpr_32, implicit $mode, implicit $exec
    %1285:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %690.sub11:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
    %1286:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1285:vgpr_32, implicit $mode, implicit $exec
    %1287:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1286:vgpr_32, implicit $mode, implicit $exec
    %1288:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %690.sub12:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
    %1289:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1288:vgpr_32, implicit $mode, implicit $exec
    %1290:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1289:vgpr_32, implicit $mode, implicit $exec
    %1291:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %690.sub13:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
    %1292:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1291:vgpr_32, implicit $mode, implicit $exec
    %1293:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1292:vgpr_32, implicit $mode, implicit $exec
    %1294:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %690.sub14:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
    %1295:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1294:vgpr_32, implicit $mode, implicit $exec
    %1296:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1295:vgpr_32, implicit $mode, implicit $exec
    %1297:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %690.sub15:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
    %1298:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1297:vgpr_32, implicit $mode, implicit $exec
    %1299:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1298:vgpr_32, implicit $mode, implicit $exec
    %1300:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %701.sub0:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
    %1301:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1300:vgpr_32, implicit $mode, implicit $exec
    %1302:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1301:vgpr_32, implicit $mode, implicit $exec
    %1303:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %701.sub1:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
    %1304:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1303:vgpr_32, implicit $mode, implicit $exec
    %1305:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1304:vgpr_32, implicit $mode, implicit $exec
    %1306:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %701.sub2:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
    %1307:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1306:vgpr_32, implicit $mode, implicit $exec
    %1308:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1307:vgpr_32, implicit $mode, implicit $exec
    %1309:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %701.sub3:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
    %1310:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1309:vgpr_32, implicit $mode, implicit $exec
    %1311:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1310:vgpr_32, implicit $mode, implicit $exec
    %1312:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %701.sub4:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
    %1313:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1312:vgpr_32, implicit $mode, implicit $exec
    %1314:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1313:vgpr_32, implicit $mode, implicit $exec
    %1315:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %701.sub5:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
    %1316:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1315:vgpr_32, implicit $mode, implicit $exec
    %1317:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1316:vgpr_32, implicit $mode, implicit $exec
    %1318:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %701.sub6:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
    %1319:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1318:vgpr_32, implicit $mode, implicit $exec
    %1320:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1319:vgpr_32, implicit $mode, implicit $exec
    %1321:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %701.sub7:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
    %1322:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1321:vgpr_32, implicit $mode, implicit $exec
    %1323:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1322:vgpr_32, implicit $mode, implicit $exec
    %1324:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %701.sub8:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
    %1325:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1324:vgpr_32, implicit $mode, implicit $exec
    %1326:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1325:vgpr_32, implicit $mode, implicit $exec
    %1327:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %701.sub9:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
    %1328:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1327:vgpr_32, implicit $mode, implicit $exec
    %1329:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1328:vgpr_32, implicit $mode, implicit $exec
    %1330:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %701.sub10:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
    %1331:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1330:vgpr_32, implicit $mode, implicit $exec
    %1332:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1331:vgpr_32, implicit $mode, implicit $exec
    %1333:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %701.sub11:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
    %1334:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1333:vgpr_32, implicit $mode, implicit $exec
    %1335:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1334:vgpr_32, implicit $mode, implicit $exec
    %1336:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %701.sub12:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
    %1337:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1336:vgpr_32, implicit $mode, implicit $exec
    %1338:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1337:vgpr_32, implicit $mode, implicit $exec
    %1339:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %701.sub13:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
    %1340:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1339:vgpr_32, implicit $mode, implicit $exec
    %1341:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1340:vgpr_32, implicit $mode, implicit $exec
    %1342:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %701.sub14:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
    %1343:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1342:vgpr_32, implicit $mode, implicit $exec
    %1344:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1343:vgpr_32, implicit $mode, implicit $exec
    %1345:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %701.sub15:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec
    %1346:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1345:vgpr_32, implicit $mode, implicit $exec
    %1347:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1346:vgpr_32, implicit $mode, implicit $exec
    %1348:vgpr_32 = contract nofpexcept V_ADD_F32_e32 0, %1158:vgpr_32, implicit $mode, implicit $exec
    %1349:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1161:vgpr_32, %1348:vgpr_32, implicit $mode, implicit $exec
    %1350:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1164:vgpr_32, %1349:vgpr_32, implicit $mode, implicit $exec
    %1351:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1167:vgpr_32, %1350:vgpr_32, implicit $mode, implicit $exec
    %1352:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1170:vgpr_32, %1351:vgpr_32, implicit $mode, implicit $exec
    %1353:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1173:vgpr_32, %1352:vgpr_32, implicit $mode, implicit $exec
    %1354:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1176:vgpr_32, %1353:vgpr_32, implicit $mode, implicit $exec
    %1355:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1179:vgpr_32, %1354:vgpr_32, implicit $mode, implicit $exec
    %1356:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1182:vgpr_32, %1355:vgpr_32, implicit $mode, implicit $exec
    %1357:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1185:vgpr_32, %1356:vgpr_32, implicit $mode, implicit $exec
    %1358:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1188:vgpr_32, %1357:vgpr_32, implicit $mode, implicit $exec
    %1359:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1191:vgpr_32, %1358:vgpr_32, implicit $mode, implicit $exec
    %1360:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1194:vgpr_32, %1359:vgpr_32, implicit $mode, implicit $exec
    %1361:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1197:vgpr_32, %1360:vgpr_32, implicit $mode, implicit $exec
    %1362:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1200:vgpr_32, %1361:vgpr_32, implicit $mode, implicit $exec
    %1363:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1203:vgpr_32, %1362:vgpr_32, implicit $mode, implicit $exec
    %1364:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1206:vgpr_32, %1363:vgpr_32, implicit $mode, implicit $exec
    %1365:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1209:vgpr_32, %1364:vgpr_32, implicit $mode, implicit $exec
    %1366:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1212:vgpr_32, %1365:vgpr_32, implicit $mode, implicit $exec
    %1367:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1215:vgpr_32, %1366:vgpr_32, implicit $mode, implicit $exec
    %1368:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1218:vgpr_32, %1367:vgpr_32, implicit $mode, implicit $exec
    %1369:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1221:vgpr_32, %1368:vgpr_32, implicit $mode, implicit $exec
    %1370:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1224:vgpr_32, %1369:vgpr_32, implicit $mode, implicit $exec
    %1371:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1227:vgpr_32, %1370:vgpr_32, implicit $mode, implicit $exec
    %1372:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1230:vgpr_32, %1371:vgpr_32, implicit $mode, implicit $exec
    %1373:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1233:vgpr_32, %1372:vgpr_32, implicit $mode, implicit $exec
    %1374:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1236:vgpr_32, %1373:vgpr_32, implicit $mode, implicit $exec
    %1375:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1239:vgpr_32, %1374:vgpr_32, implicit $mode, implicit $exec
    %1376:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1242:vgpr_32, %1375:vgpr_32, implicit $mode, implicit $exec
    %1377:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1245:vgpr_32, %1376:vgpr_32, implicit $mode, implicit $exec
    %1378:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1248:vgpr_32, %1377:vgpr_32, implicit $mode, implicit $exec
    %1379:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1251:vgpr_32, %1378:vgpr_32, implicit $mode, implicit $exec
    %1380:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1254:vgpr_32, %1379:vgpr_32, implicit $mode, implicit $exec
    %1381:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1257:vgpr_32, %1380:vgpr_32, implicit $mode, implicit $exec
    %1382:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1260:vgpr_32, %1381:vgpr_32, implicit $mode, implicit $exec
    %1383:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1263:vgpr_32, %1382:vgpr_32, implicit $mode, implicit $exec
    %1384:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1266:vgpr_32, %1383:vgpr_32, implicit $mode, implicit $exec
    %1385:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1269:vgpr_32, %1384:vgpr_32, implicit $mode, implicit $exec
    %1386:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1272:vgpr_32, %1385:vgpr_32, implicit $mode, implicit $exec
    %1387:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1275:vgpr_32, %1386:vgpr_32, implicit $mode, implicit $exec
    %1388:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1278:vgpr_32, %1387:vgpr_32, implicit $mode, implicit $exec
    %1389:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1281:vgpr_32, %1388:vgpr_32, implicit $mode, implicit $exec
    %1390:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1284:vgpr_32, %1389:vgpr_32, implicit $mode, implicit $exec
    %1391:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1287:vgpr_32, %1390:vgpr_32, implicit $mode, implicit $exec
    %1392:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1290:vgpr_32, %1391:vgpr_32, implicit $mode, implicit $exec
    %1393:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1293:vgpr_32, %1392:vgpr_32, implicit $mode, implicit $exec
    %1394:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1296:vgpr_32, %1393:vgpr_32, implicit $mode, implicit $exec
    %1395:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1299:vgpr_32, %1394:vgpr_32, implicit $mode, implicit $exec
    %1396:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1302:vgpr_32, %1395:vgpr_32, implicit $mode, implicit $exec
    %1397:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1305:vgpr_32, %1396:vgpr_32, implicit $mode, implicit $exec
    %1398:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1308:vgpr_32, %1397:vgpr_32, implicit $mode, implicit $exec
    %1399:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1311:vgpr_32, %1398:vgpr_32, implicit $mode, implicit $exec
    %1400:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1314:vgpr_32, %1399:vgpr_32, implicit $mode, implicit $exec
    %1401:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1317:vgpr_32, %1400:vgpr_32, implicit $mode, implicit $exec
    %1402:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1320:vgpr_32, %1401:vgpr_32, implicit $mode, implicit $exec
    %1403:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1323:vgpr_32, %1402:vgpr_32, implicit $mode, implicit $exec
    %1404:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1326:vgpr_32, %1403:vgpr_32, implicit $mode, implicit $exec
    %1405:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1329:vgpr_32, %1404:vgpr_32, implicit $mode, implicit $exec
    %1406:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1332:vgpr_32, %1405:vgpr_32, implicit $mode, implicit $exec
    %1407:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1335:vgpr_32, %1406:vgpr_32, implicit $mode, implicit $exec
    %1408:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1338:vgpr_32, %1407:vgpr_32, implicit $mode, implicit $exec
    %1409:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1341:vgpr_32, %1408:vgpr_32, implicit $mode, implicit $exec
    %1410:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1344:vgpr_32, %1409:vgpr_32, implicit $mode, implicit $exec
    %1411:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1347:vgpr_32, %1410:vgpr_32, implicit $mode, implicit $exec
    %1412:vgpr_32 = DS_BPERMUTE_B32 %39:vgpr_32, %1411:vgpr_32, 0, implicit $exec
    %1413:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1411:vgpr_32, %1412:vgpr_32, implicit $mode, implicit $exec
    %1414:vgpr_32 = DS_BPERMUTE_B32 %39:vgpr_32, %1413:vgpr_32, 0, implicit $exec
    %3347:vgpr_32 = V_CNDMASK_B32_e64 0, %1414:vgpr_32, 0, %1413:vgpr_32, %484:sreg_64_xexec, implicit $exec
    %1417:vgpr_32 = contract nofpexcept V_SUB_F32_e32 %3346:vgpr_32, %151:vgpr_32, implicit $mode, implicit $exec
    %1418:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1417:vgpr_32, implicit $mode, implicit $exec
    undef %1455.sub0:vreg_64_align2 = afn nofpexcept V_EXP_F32_e32 %1418:vgpr_32, implicit $mode, implicit $exec
    INLINEASM &"s_waitcnt vmcnt($0)", 57 /* sideeffect mayload maystore isconvergent attdialect */, 13 /* imm */, 8, !0
    undef %3037.sub0:vreg_64_align2 = V_PERM_B32_e64 %805.sub0:vreg_64_align2, %803.sub0:vreg_64_align2, %1422:sreg_32, implicit $exec
    undef %3021.sub0:vreg_64_align2 = V_PERM_B32_e64 %805.sub0:vreg_64_align2, %803.sub0:vreg_64_align2, %1424:sreg_32, implicit $exec
    %3037.sub1:vreg_64_align2 = V_PERM_B32_e64 %809.sub0:vreg_64_align2, %807.sub0:vreg_64_align2, %1422:sreg_32, implicit $exec
    %3021.sub1:vreg_64_align2 = V_PERM_B32_e64 %809.sub0:vreg_64_align2, %807.sub0:vreg_64_align2, %1424:sreg_32, implicit $exec
    undef %3005.sub0:vreg_64_align2 = V_PERM_B32_e64 %805.sub1:vreg_64_align2, %803.sub1:vreg_64_align2, %1422:sreg_32, implicit $exec
    undef %2978.sub0:vreg_64_align2 = V_PERM_B32_e64 %805.sub1:vreg_64_align2, %803.sub1:vreg_64_align2, %1424:sreg_32, implicit $exec
    %3005.sub1:vreg_64_align2 = V_PERM_B32_e64 %809.sub1:vreg_64_align2, %807.sub1:vreg_64_align2, %1422:sreg_32, implicit $exec
    %2978.sub1:vreg_64_align2 = V_PERM_B32_e64 %809.sub1:vreg_64_align2, %807.sub1:vreg_64_align2, %1424:sreg_32, implicit $exec
    %1442:vgpr_32 = V_ADD_U32_e32 %593:sreg_32, %15:vgpr_32, implicit $exec
    %1444:vgpr_32 = V_AND_B32_e32 536870911, %1442:vgpr_32, implicit $exec
    %1446:vgpr_32 = nsw V_MUL_LO_U32_e64 %1444:vgpr_32, %494:sreg_32, implicit $exec
    %1447:vgpr_32 = V_ADD_LSHL_U32_e64 %47:vgpr_32, %1446:vgpr_32, 1, implicit $exec
    DS_WRITE_B64_gfx9 %1447:vgpr_32, %3037:vreg_64_align2, 0, 0, implicit $exec
    %1449:vgpr_32 = V_LSHL_ADD_U32_e64 %41:vgpr_32, 1, %1447:vgpr_32, implicit $exec
    DS_WRITE_B64_gfx9 %1449:vgpr_32, %3021:vreg_64_align2, 0, 0, implicit $exec
    %1451:vgpr_32 = V_LSHL_ADD_U32_e64 %42:vgpr_32, 1, %1449:vgpr_32, implicit $exec
    DS_WRITE_B64_gfx9 %1451:vgpr_32, %3005:vreg_64_align2, 0, 0, implicit $exec
    %1453:vgpr_32 = V_LSHL_ADD_U32_e64 %43:vgpr_32, 1, %1451:vgpr_32, implicit $exec
    DS_WRITE_B64_gfx9 %1453:vgpr_32, %2978:vreg_64_align2, 0, 0, implicit $exec
    %3347:vgpr_32 = contract nofpexcept V_FMAC_F32_e32 %86:vgpr_32, %1455.sub0:vreg_64_align2, %3347:vgpr_32, implicit $mode, implicit $exec
    %2986.sub0_sub1:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %2986.sub0_sub1:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
    %2986.sub2_sub3:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %2986.sub2_sub3:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
    %2986.sub4_sub5:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %2986.sub4_sub5:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
    %2986.sub6_sub7:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %2986.sub6_sub7:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
    %2986.sub8_sub9:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %2986.sub8_sub9:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
    %2986.sub10_sub11:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %2986.sub10_sub11:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
    %2986.sub12_sub13:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %2986.sub12_sub13:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
    %2986.sub14_sub15:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %2986.sub14_sub15:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
    %3038.sub0_sub1:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %3038.sub0_sub1:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
    %3038.sub2_sub3:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %3038.sub2_sub3:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
    %3038.sub4_sub5:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %3038.sub4_sub5:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
    %3038.sub6_sub7:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %3038.sub6_sub7:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
    %3038.sub8_sub9:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %3038.sub8_sub9:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
    %3038.sub10_sub11:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %3038.sub10_sub11:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
    %3038.sub12_sub13:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %3038.sub12_sub13:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
    %3038.sub14_sub15:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %3038.sub14_sub15:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
    %2980.sub0_sub1:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %2980.sub0_sub1:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
    %2980.sub2_sub3:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %2980.sub2_sub3:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
    %2980.sub4_sub5:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %2980.sub4_sub5:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
    %2980.sub6_sub7:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %2980.sub6_sub7:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
    %2980.sub8_sub9:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %2980.sub8_sub9:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
    %2980.sub10_sub11:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %2980.sub10_sub11:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
    %2980.sub12_sub13:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %2980.sub12_sub13:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
    %2980.sub14_sub15:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %2980.sub14_sub15:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
    %3003.sub0_sub1:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %3003.sub0_sub1:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
    %3003.sub2_sub3:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %3003.sub2_sub3:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
    %3003.sub4_sub5:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %3003.sub4_sub5:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
    %3003.sub6_sub7:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %3003.sub6_sub7:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
    %3003.sub8_sub9:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %3003.sub8_sub9:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
    %3003.sub10_sub11:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %3003.sub10_sub11:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
    %3003.sub12_sub13:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %3003.sub12_sub13:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
    %3003.sub14_sub15:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %3003.sub14_sub15:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
    %1554:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1158:vgpr_32, implicit $mode, implicit $exec
    %1555:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1161:vgpr_32, implicit $mode, implicit $exec
    %1556:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1164:vgpr_32, implicit $mode, implicit $exec
    %1557:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1170:vgpr_32, implicit $mode, implicit $exec
    %1558:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1173:vgpr_32, implicit $mode, implicit $exec
    %1559:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1176:vgpr_32, implicit $mode, implicit $exec
    %1560:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1182:vgpr_32, implicit $mode, implicit $exec
    %1561:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1185:vgpr_32, implicit $mode, implicit $exec
    %1562:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1188:vgpr_32, implicit $mode, implicit $exec
    %1563:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1194:vgpr_32, implicit $mode, implicit $exec
    %1564:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1197:vgpr_32, implicit $mode, implicit $exec
    %1565:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1200:vgpr_32, implicit $mode, implicit $exec
    %1566:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1206:vgpr_32, implicit $mode, implicit $exec
    %1567:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1209:vgpr_32, implicit $mode, implicit $exec
    %1568:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1212:vgpr_32, implicit $mode, implicit $exec
    %1569:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1218:vgpr_32, implicit $mode, implicit $exec
    %1570:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1221:vgpr_32, implicit $mode, implicit $exec
    %1571:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1224:vgpr_32, implicit $mode, implicit $exec
    %1572:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1230:vgpr_32, implicit $mode, implicit $exec
    %1573:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1233:vgpr_32, implicit $mode, implicit $exec
    %1574:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1236:vgpr_32, implicit $mode, implicit $exec
    %1575:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1242:vgpr_32, implicit $mode, implicit $exec
    %1576:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1245:vgpr_32, implicit $mode, implicit $exec
    %1577:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1248:vgpr_32, implicit $mode, implicit $exec
    %1578:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1254:vgpr_32, implicit $mode, implicit $exec
    %1579:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1257:vgpr_32, implicit $mode, implicit $exec
    %1580:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1260:vgpr_32, implicit $mode, implicit $exec
    %1581:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1266:vgpr_32, implicit $mode, implicit $exec
    %1582:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1269:vgpr_32, implicit $mode, implicit $exec
    %1583:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1272:vgpr_32, implicit $mode, implicit $exec
    %1584:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1278:vgpr_32, implicit $mode, implicit $exec
    %1585:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1281:vgpr_32, implicit $mode, implicit $exec
    %1586:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1284:vgpr_32, implicit $mode, implicit $exec
    %1587:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1290:vgpr_32, implicit $mode, implicit $exec
    %1588:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1293:vgpr_32, implicit $mode, implicit $exec
    %1589:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1296:vgpr_32, implicit $mode, implicit $exec
    %1590:vgpr_32 = V_ADD_U32_e32 %48:vgpr_32, %3345:vgpr_32, implicit $exec
    %1591:vreg_64_align2 = BUFFER_LOAD_DWORDX2_OFFEN %1590:vgpr_32, %473:sgpr_128, 0, 0, 0, 0, implicit $exec
    %1592:vgpr_32 = V_ADD_U32_e32 %48:vgpr_32, %3334:vgpr_32, implicit $exec
    %1593:vreg_64_align2 = BUFFER_LOAD_DWORDX2_OFFEN %1592:vgpr_32, %473:sgpr_128, 0, 0, 0, 0, implicit $exec
    %1594:vgpr_32 = V_ADD_U32_e32 %48:vgpr_32, %3335:vgpr_32, implicit $exec
    %1595:vreg_64_align2 = BUFFER_LOAD_DWORDX2_OFFEN %1594:vgpr_32, %473:sgpr_128, 0, 0, 0, 0, implicit $exec
    %1596:vgpr_32 = V_ADD_U32_e32 %48:vgpr_32, %3336:vgpr_32, implicit $exec
    %1597:vreg_64_align2 = BUFFER_LOAD_DWORDX2_OFFEN %1596:vgpr_32, %473:sgpr_128, 0, 0, 0, 0, implicit $exec
    INLINEASM &"s_waitcnt vmcnt($0)", 57 /* sideeffect mayload maystore isconvergent attdialect */, 13 /* imm */, 8, !0
    %1598:vreg_128_align2 = DS_READ_B128_gfx9 %44:vgpr_32, 0, 0, implicit $exec
    %1605:vreg_128_align2 = DS_READ_B128_gfx9 %44:vgpr_32, 576, 0, implicit $exec
    %1612:vreg_128_align2 = DS_READ_B128_gfx9 %44:vgpr_32, 1152, 0, implicit $exec
    %1619:vreg_128_align2 = DS_READ_B128_gfx9 %44:vgpr_32, 1728, 0, implicit $exec
    %1626:vreg_128_align2 = DS_READ_B128_gfx9 %45:vgpr_32, 0, 0, implicit $exec
    %1633:vreg_128_align2 = DS_READ_B128_gfx9 %45:vgpr_32, 576, 0, implicit $exec
    %1640:vreg_128_align2 = DS_READ_B128_gfx9 %45:vgpr_32, 1152, 0, implicit $exec
    %1647:vreg_128_align2 = DS_READ_B128_gfx9 %45:vgpr_32, 1728, 0, implicit $exec
    INLINEASM &"s_waitcnt vmcnt($0)", 57 /* sideeffect mayload maystore isconvergent attdialect */, 13 /* imm */, 8, !0
    undef %3161.sub0:vreg_64_align2 = V_PERM_B32_e64 %1593.sub0:vreg_64_align2, %1591.sub0:vreg_64_align2, %1422:sreg_32, implicit $exec
    undef %3145.sub0:vreg_64_align2 = V_PERM_B32_e64 %1593.sub0:vreg_64_align2, %1591.sub0:vreg_64_align2, %1424:sreg_32, implicit $exec
    %3161.sub1:vreg_64_align2 = V_PERM_B32_e64 %1597.sub0:vreg_64_align2, %1595.sub0:vreg_64_align2, %1422:sreg_32, implicit $exec
    %3145.sub1:vreg_64_align2 = V_PERM_B32_e64 %1597.sub0:vreg_64_align2, %1595.sub0:vreg_64_align2, %1424:sreg_32, implicit $exec
    undef %3129.sub0:vreg_64_align2 = V_PERM_B32_e64 %1593.sub1:vreg_64_align2, %1591.sub1:vreg_64_align2, %1422:sreg_32, implicit $exec
    undef %3113.sub0:vreg_64_align2 = V_PERM_B32_e64 %1593.sub1:vreg_64_align2, %1591.sub1:vreg_64_align2, %1424:sreg_32, implicit $exec
    %3129.sub1:vreg_64_align2 = V_PERM_B32_e64 %1597.sub1:vreg_64_align2, %1595.sub1:vreg_64_align2, %1422:sreg_32, implicit $exec
    %3113.sub1:vreg_64_align2 = V_PERM_B32_e64 %1597.sub1:vreg_64_align2, %1595.sub1:vreg_64_align2, %1424:sreg_32, implicit $exec
    DS_WRITE_B64_gfx9 %1447:vgpr_32, %3161:vreg_64_align2, 0, 0, implicit $exec
    DS_WRITE_B64_gfx9 %1449:vgpr_32, %3145:vreg_64_align2, 0, 0, implicit $exec
    DS_WRITE_B64_gfx9 %1451:vgpr_32, %3129:vreg_64_align2, 0, 0, implicit $exec
    DS_WRITE_B64_gfx9 %1453:vgpr_32, %3113:vreg_64_align2, 0, 0, implicit $exec
    %1678:vgpr_32 = V_ADD_U32_e32 %48:vgpr_32, %3344:vgpr_32, implicit $exec
    %1679:vreg_64_align2 = BUFFER_LOAD_DWORDX2_OFFEN %1678:vgpr_32, %473:sgpr_128, 0, 0, 0, 0, implicit $exec
    %1680:vgpr_32 = V_ADD_U32_e32 %48:vgpr_32, %3337:vgpr_32, implicit $exec
    %1681:vreg_64_align2 = BUFFER_LOAD_DWORDX2_OFFEN %1680:vgpr_32, %473:sgpr_128, 0, 0, 0, 0, implicit $exec
    %1682:vgpr_32 = V_ADD_U32_e32 %48:vgpr_32, %3338:vgpr_32, implicit $exec
    %1683:vreg_64_align2 = BUFFER_LOAD_DWORDX2_OFFEN %1682:vgpr_32, %473:sgpr_128, 0, 0, 0, 0, implicit $exec
    %1684:vgpr_32 = V_ADD_U32_e32 %48:vgpr_32, %3339:vgpr_32, implicit $exec
    %1685:vreg_64_align2 = BUFFER_LOAD_DWORDX2_OFFEN %1684:vgpr_32, %473:sgpr_128, 0, 0, 0, 0, implicit $exec
    INLINEASM &"s_waitcnt vmcnt($0)", 57 /* sideeffect mayload maystore isconvergent attdialect */, 13 /* imm */, 8, !0
    %1686:vreg_128_align2 = DS_READ_B128_gfx9 %44:vgpr_32, 0, 0, implicit $exec
    %1693:vreg_128_align2 = DS_READ_B128_gfx9 %44:vgpr_32, 576, 0, implicit $exec
    %1700:vreg_128_align2 = DS_READ_B128_gfx9 %44:vgpr_32, 1152, 0, implicit $exec
    %1707:vreg_128_align2 = DS_READ_B128_gfx9 %44:vgpr_32, 1728, 0, implicit $exec
    %1714:vreg_128_align2 = DS_READ_B128_gfx9 %45:vgpr_32, 0, 0, implicit $exec
    %1721:vreg_128_align2 = DS_READ_B128_gfx9 %45:vgpr_32, 576, 0, implicit $exec
    %1728:vreg_128_align2 = DS_READ_B128_gfx9 %45:vgpr_32, 1152, 0, implicit $exec
    %1735:vreg_128_align2 = DS_READ_B128_gfx9 %45:vgpr_32, 1728, 0, implicit $exec
    INLINEASM &"s_waitcnt vmcnt($0)", 57 /* sideeffect mayload maystore isconvergent attdialect */, 13 /* imm */, 8, !0
    undef %3062.sub0:vreg_64_align2 = V_PERM_B32_e64 %1681.sub0:vreg_64_align2, %1679.sub0:vreg_64_align2, %1422:sreg_32, implicit $exec
    undef %3046.sub0:vreg_64_align2 = V_PERM_B32_e64 %1681.sub0:vreg_64_align2, %1679.sub0:vreg_64_align2, %1424:sreg_32, implicit $exec
    %3062.sub1:vreg_64_align2 = V_PERM_B32_e64 %1685.sub0:vreg_64_align2, %1683.sub0:vreg_64_align2, %1422:sreg_32, implicit $exec
    %3046.sub1:vreg_64_align2 = V_PERM_B32_e64 %1685.sub0:vreg_64_align2, %1683.sub0:vreg_64_align2, %1424:sreg_32, implicit $exec
    undef %3029.sub0:vreg_64_align2 = V_PERM_B32_e64 %1681.sub1:vreg_64_align2, %1679.sub1:vreg_64_align2, %1422:sreg_32, implicit $exec
    undef %3013.sub0:vreg_64_align2 = V_PERM_B32_e64 %1681.sub1:vreg_64_align2, %1679.sub1:vreg_64_align2, %1424:sreg_32, implicit $exec
    %3029.sub1:vreg_64_align2 = V_PERM_B32_e64 %1685.sub1:vreg_64_align2, %1683.sub1:vreg_64_align2, %1422:sreg_32, implicit $exec
    %3013.sub1:vreg_64_align2 = V_PERM_B32_e64 %1685.sub1:vreg_64_align2, %1683.sub1:vreg_64_align2, %1424:sreg_32, implicit $exec
    DS_WRITE_B64_gfx9 %1447:vgpr_32, %3062:vreg_64_align2, 0, 0, implicit $exec
    DS_WRITE_B64_gfx9 %1449:vgpr_32, %3046:vreg_64_align2, 0, 0, implicit $exec
    DS_WRITE_B64_gfx9 %1451:vgpr_32, %3029:vreg_64_align2, 0, 0, implicit $exec
    DS_WRITE_B64_gfx9 %1453:vgpr_32, %3013:vreg_64_align2, 0, 0, implicit $exec
    %1766:vgpr_32 = V_ADD_U32_e32 %48:vgpr_32, %3343:vgpr_32, implicit $exec
    %1767:vreg_64_align2 = BUFFER_LOAD_DWORDX2_OFFEN %1766:vgpr_32, %473:sgpr_128, 0, 0, 0, 0, implicit $exec
    %1768:vgpr_32 = V_ADD_U32_e32 %48:vgpr_32, %3340:vgpr_32, implicit $exec
    %1769:vreg_64_align2 = BUFFER_LOAD_DWORDX2_OFFEN %1768:vgpr_32, %473:sgpr_128, 0, 0, 0, 0, implicit $exec
    %1770:vgpr_32 = V_ADD_U32_e32 %48:vgpr_32, %3341:vgpr_32, implicit $exec
    %1771:vreg_64_align2 = BUFFER_LOAD_DWORDX2_OFFEN %1770:vgpr_32, %473:sgpr_128, 0, 0, 0, 0, implicit $exec
    %1772:vgpr_32 = V_ADD_U32_e32 %48:vgpr_32, %3342:vgpr_32, implicit $exec
    %1773:vreg_64_align2 = BUFFER_LOAD_DWORDX2_OFFEN %1772:vgpr_32, %473:sgpr_128, 0, 0, 0, 0, implicit $exec
    INLINEASM &"s_waitcnt vmcnt($0)", 57 /* sideeffect mayload maystore isconvergent attdialect */, 13 /* imm */, 8, !0
    %1774:vreg_128_align2 = DS_READ_B128_gfx9 %44:vgpr_32, 0, 0, implicit $exec
    %1781:vreg_128_align2 = DS_READ_B128_gfx9 %44:vgpr_32, 576, 0, implicit $exec
    %1788:vreg_128_align2 = DS_READ_B128_gfx9 %44:vgpr_32, 1152, 0, implicit $exec
    %1795:vreg_128_align2 = DS_READ_B128_gfx9 %44:vgpr_32, 1728, 0, implicit $exec
    %1802:vreg_128_align2 = DS_READ_B128_gfx9 %45:vgpr_32, 0, 0, implicit $exec
    %1809:vreg_128_align2 = DS_READ_B128_gfx9 %45:vgpr_32, 576, 0, implicit $exec
    %1816:vreg_128_align2 = DS_READ_B128_gfx9 %45:vgpr_32, 1152, 0, implicit $exec
    %1823:vreg_128_align2 = DS_READ_B128_gfx9 %45:vgpr_32, 1728, 0, implicit $exec
    INLINEASM &"s_waitcnt vmcnt($0)", 57 /* sideeffect mayload maystore isconvergent attdialect */, 13 /* imm */, 8, !0
    undef %3185.sub0:vreg_64_align2 = V_PERM_B32_e64 %1769.sub0:vreg_64_align2, %1767.sub0:vreg_64_align2, %1422:sreg_32, implicit $exec
    undef %3169.sub0:vreg_64_align2 = V_PERM_B32_e64 %1769.sub0:vreg_64_align2, %1767.sub0:vreg_64_align2, %1424:sreg_32, implicit $exec
    %3185.sub1:vreg_64_align2 = V_PERM_B32_e64 %1773.sub0:vreg_64_align2, %1771.sub0:vreg_64_align2, %1422:sreg_32, implicit $exec
    %3169.sub1:vreg_64_align2 = V_PERM_B32_e64 %1773.sub0:vreg_64_align2, %1771.sub0:vreg_64_align2, %1424:sreg_32, implicit $exec
    undef %3153.sub0:vreg_64_align2 = V_PERM_B32_e64 %1769.sub1:vreg_64_align2, %1767.sub1:vreg_64_align2, %1422:sreg_32, implicit $exec
    undef %3137.sub0:vreg_64_align2 = V_PERM_B32_e64 %1769.sub1:vreg_64_align2, %1767.sub1:vreg_64_align2, %1424:sreg_32, implicit $exec
    %3153.sub1:vreg_64_align2 = V_PERM_B32_e64 %1773.sub1:vreg_64_align2, %1771.sub1:vreg_64_align2, %1422:sreg_32, implicit $exec
    %3137.sub1:vreg_64_align2 = V_PERM_B32_e64 %1773.sub1:vreg_64_align2, %1771.sub1:vreg_64_align2, %1424:sreg_32, implicit $exec
    DS_WRITE_B64_gfx9 %1447:vgpr_32, %3185:vreg_64_align2, 0, 0, implicit $exec
    DS_WRITE_B64_gfx9 %1449:vgpr_32, %3169:vreg_64_align2, 0, 0, implicit $exec
    DS_WRITE_B64_gfx9 %1451:vgpr_32, %3153:vreg_64_align2, 0, 0, implicit $exec
    DS_WRITE_B64_gfx9 %1453:vgpr_32, %3137:vreg_64_align2, 0, 0, implicit $exec
    %1854:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1167:vgpr_32, implicit $mode, implicit $exec
    %1855:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1179:vgpr_32, implicit $mode, implicit $exec
    %1856:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1191:vgpr_32, implicit $mode, implicit $exec
    %1857:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1203:vgpr_32, implicit $mode, implicit $exec
    %1858:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1215:vgpr_32, implicit $mode, implicit $exec
    %1859:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1227:vgpr_32, implicit $mode, implicit $exec
    %1860:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1239:vgpr_32, implicit $mode, implicit $exec
    %1861:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1251:vgpr_32, implicit $mode, implicit $exec
    %1862:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1263:vgpr_32, implicit $mode, implicit $exec
    %1863:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1275:vgpr_32, implicit $mode, implicit $exec
    %1864:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1287:vgpr_32, implicit $mode, implicit $exec
    %1865:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1299:vgpr_32, implicit $mode, implicit $exec
    undef %3121.sub1:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1556:vgpr_32, 0, %1854:vgpr_32, 0, 0, implicit $mode, implicit $exec
    %3121.sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1554:vgpr_32, 0, %1555:vgpr_32, 0, 0, implicit $mode, implicit $exec
    undef %3105.sub1:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1559:vgpr_32, 0, %1855:vgpr_32, 0, 0, implicit $mode, implicit $exec
    %3105.sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1557:vgpr_32, 0, %1558:vgpr_32, 0, 0, implicit $mode, implicit $exec
    undef %3089.sub1:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1562:vgpr_32, 0, %1856:vgpr_32, 0, 0, implicit $mode, implicit $exec
    %3089.sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1560:vgpr_32, 0, %1561:vgpr_32, 0, 0, implicit $mode, implicit $exec
    undef %3073.sub1:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1565:vgpr_32, 0, %1857:vgpr_32, 0, 0, implicit $mode, implicit $exec
    %3073.sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1563:vgpr_32, 0, %1564:vgpr_32, 0, 0, implicit $mode, implicit $exec
    %2986:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1598.sub0_sub1:vreg_128_align2, %3121:vreg_64_align2, %2986:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
    %2986:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1598.sub2_sub3:vreg_128_align2, %3105:vreg_64_align2, %2986:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
    %3038:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1605.sub0_sub1:vreg_128_align2, %3121:vreg_64_align2, %3038:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
    %3038:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1605.sub2_sub3:vreg_128_align2, %3105:vreg_64_align2, %3038:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
    %2980:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1612.sub0_sub1:vreg_128_align2, %3121:vreg_64_align2, %2980:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
    %2980:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1612.sub2_sub3:vreg_128_align2, %3105:vreg_64_align2, %2980:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
    %3003:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1619.sub0_sub1:vreg_128_align2, %3121:vreg_64_align2, %3003:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
    %3003:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1619.sub2_sub3:vreg_128_align2, %3105:vreg_64_align2, %3003:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
    %2986:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1626.sub0_sub1:vreg_128_align2, %3089:vreg_64_align2, %2986:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
    %2986:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1626.sub2_sub3:vreg_128_align2, %3073:vreg_64_align2, %2986:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
    %3038:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1633.sub0_sub1:vreg_128_align2, %3089:vreg_64_align2, %3038:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
    %3038:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1633.sub2_sub3:vreg_128_align2, %3073:vreg_64_align2, %3038:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
    %2980:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1640.sub0_sub1:vreg_128_align2, %3089:vreg_64_align2, %2980:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
    %2980:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1640.sub2_sub3:vreg_128_align2, %3073:vreg_64_align2, %2980:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
    %3003:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1647.sub0_sub1:vreg_128_align2, %3089:vreg_64_align2, %3003:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
    %3003:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1647.sub2_sub3:vreg_128_align2, %3073:vreg_64_align2, %3003:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
    undef %2993.sub1:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1568:vgpr_32, 0, %1858:vgpr_32, 0, 0, implicit $mode, implicit $exec
    %2993.sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1566:vgpr_32, 0, %1567:vgpr_32, 0, 0, implicit $mode, implicit $exec
    undef %3195.sub1:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1571:vgpr_32, 0, %1859:vgpr_32, 0, 0, implicit $mode, implicit $exec
    %3195.sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1569:vgpr_32, 0, %1570:vgpr_32, 0, 0, implicit $mode, implicit $exec
    undef %3178.sub1:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1574:vgpr_32, 0, %1860:vgpr_32, 0, 0, implicit $mode, implicit $exec
    %3178.sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1572:vgpr_32, 0, %1573:vgpr_32, 0, 0, implicit $mode, implicit $exec
    undef %3162.sub1:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1577:vgpr_32, 0, %1861:vgpr_32, 0, 0, implicit $mode, implicit $exec
    %3162.sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1575:vgpr_32, 0, %1576:vgpr_32, 0, 0, implicit $mode, implicit $exec
    %2986:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1686.sub0_sub1:vreg_128_align2, %2993:vreg_64_align2, %2986:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
    %2986:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1686.sub2_sub3:vreg_128_align2, %3195:vreg_64_align2, %2986:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
    %3038:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1693.sub0_sub1:vreg_128_align2, %2993:vreg_64_align2, %3038:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
    %3038:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1693.sub2_sub3:vreg_128_align2, %3195:vreg_64_align2, %3038:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
    %2980:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1700.sub0_sub1:vreg_128_align2, %2993:vreg_64_align2, %2980:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
    %2980:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1700.sub2_sub3:vreg_128_align2, %3195:vreg_64_align2, %2980:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
    %3003:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1707.sub0_sub1:vreg_128_align2, %2993:vreg_64_align2, %3003:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
    %3003:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1707.sub2_sub3:vreg_128_align2, %3195:vreg_64_align2, %3003:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
    %2986:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1714.sub0_sub1:vreg_128_align2, %3178:vreg_64_align2, %2986:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
    %2986:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1714.sub2_sub3:vreg_128_align2, %3162:vreg_64_align2, %2986:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
    %3038:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1721.sub0_sub1:vreg_128_align2, %3178:vreg_64_align2, %3038:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
    %3038:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1721.sub2_sub3:vreg_128_align2, %3162:vreg_64_align2, %3038:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
    %2980:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1728.sub0_sub1:vreg_128_align2, %3178:vreg_64_align2, %2980:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
    %2980:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1728.sub2_sub3:vreg_128_align2, %3162:vreg_64_align2, %2980:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
    %3003:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1735.sub0_sub1:vreg_128_align2, %3178:vreg_64_align2, %3003:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
    %3003:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1735.sub2_sub3:vreg_128_align2, %3162:vreg_64_align2, %3003:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
    undef %3146.sub1:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1580:vgpr_32, 0, %1862:vgpr_32, 0, 0, implicit $mode, implicit $exec
    %3146.sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1578:vgpr_32, 0, %1579:vgpr_32, 0, 0, implicit $mode, implicit $exec
    undef %3130.sub1:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1583:vgpr_32, 0, %1863:vgpr_32, 0, 0, implicit $mode, implicit $exec
    %3130.sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1581:vgpr_32, 0, %1582:vgpr_32, 0, 0, implicit $mode, implicit $exec
    undef %3114.sub1:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1586:vgpr_32, 0, %1864:vgpr_32, 0, 0, implicit $mode, implicit $exec
    %3114.sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1584:vgpr_32, 0, %1585:vgpr_32, 0, 0, implicit $mode, implicit $exec
    undef %3098.sub1:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1589:vgpr_32, 0, %1865:vgpr_32, 0, 0, implicit $mode, implicit $exec
    %3098.sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1587:vgpr_32, 0, %1588:vgpr_32, 0, 0, implicit $mode, implicit $exec
    %2986:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1774.sub0_sub1:vreg_128_align2, %3146:vreg_64_align2, %2986:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
    %2986:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1774.sub2_sub3:vreg_128_align2, %3130:vreg_64_align2, %2986:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
    %3038:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1781.sub0_sub1:vreg_128_align2, %3146:vreg_64_align2, %3038:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
    %3038:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1781.sub2_sub3:vreg_128_align2, %3130:vreg_64_align2, %3038:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
    %2980:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1788.sub0_sub1:vreg_128_align2, %3146:vreg_64_align2, %2980:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
    %2980:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1788.sub2_sub3:vreg_128_align2, %3130:vreg_64_align2, %2980:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
    %3003:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1795.sub0_sub1:vreg_128_align2, %3146:vreg_64_align2, %3003:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
    %3003:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1795.sub2_sub3:vreg_128_align2, %3130:vreg_64_align2, %3003:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
    %2986:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1802.sub0_sub1:vreg_128_align2, %3114:vreg_64_align2, %2986:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
    %2986:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1802.sub2_sub3:vreg_128_align2, %3098:vreg_64_align2, %2986:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
    %3038:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1809.sub0_sub1:vreg_128_align2, %3114:vreg_64_align2, %3038:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
    %3038:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1809.sub2_sub3:vreg_128_align2, %3098:vreg_64_align2, %3038:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
    %2980:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1816.sub0_sub1:vreg_128_align2, %3114:vreg_64_align2, %2980:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
    %2980:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1816.sub2_sub3:vreg_128_align2, %3098:vreg_64_align2, %2980:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
    %3003:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1823.sub0_sub1:vreg_128_align2, %3114:vreg_64_align2, %3003:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
    %3003:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1823.sub2_sub3:vreg_128_align2, %3098:vreg_64_align2, %3003:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
    %2054:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1347:vgpr_32, implicit $mode, implicit $exec
    %2055:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1341:vgpr_32, implicit $mode, implicit $exec
    %2056:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1335:vgpr_32, implicit $mode, implicit $exec
    %2057:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1329:vgpr_32, implicit $mode, implicit $exec
    %2058:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1323:vgpr_32, implicit $mode, implicit $exec
    %2059:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1317:vgpr_32, implicit $mode, implicit $exec
    %2060:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1311:vgpr_32, implicit $mode, implicit $exec
    %2061:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1305:vgpr_32, implicit $mode, implicit $exec
    %2062:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1344:vgpr_32, implicit $mode, implicit $exec
    %2063:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1338:vgpr_32, implicit $mode, implicit $exec
    %2064:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1332:vgpr_32, implicit $mode, implicit $exec
    %2065:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1326:vgpr_32, implicit $mode, implicit $exec
    %2066:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1320:vgpr_32, implicit $mode, implicit $exec
    %2067:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1314:vgpr_32, implicit $mode, implicit $exec
    %2068:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1308:vgpr_32, implicit $mode, implicit $exec
    %2069:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1302:vgpr_32, implicit $mode, implicit $exec
    INLINEASM &"s_waitcnt vmcnt($0)", 57 /* sideeffect mayload maystore isconvergent attdialect */, 13 /* imm */, 8, !0
    undef %3082.sub1:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %2068:vgpr_32, 0, %2060:vgpr_32, 0, 0, implicit $mode, implicit $exec
    %3082.sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %2069:vgpr_32, 0, %2061:vgpr_32, 0, 0, implicit $mode, implicit $exec
    undef %3066.sub1:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %2066:vgpr_32, 0, %2058:vgpr_32, 0, 0, implicit $mode, implicit $exec
    %3066.sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %2067:vgpr_32, 0, %2059:vgpr_32, 0, 0, implicit $mode, implicit $exec
    undef %3050.sub1:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %2064:vgpr_32, 0, %2056:vgpr_32, 0, 0, implicit $mode, implicit $exec
    %3050.sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %2065:vgpr_32, 0, %2057:vgpr_32, 0, 0, implicit $mode, implicit $exec
    undef %3033.sub1:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %2062:vgpr_32, 0, %2054:vgpr_32, 0, 0, implicit $mode, implicit $exec
    %3033.sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %2063:vgpr_32, 0, %2055:vgpr_32, 0, 0, implicit $mode, implicit $exec
    %2082:vreg_128_align2 = DS_READ_B128_gfx9 %44:vgpr_32, 0, 0, implicit $exec
    %2986:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %2082.sub0_sub1:vreg_128_align2, %3082:vreg_64_align2, %2986:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
    %2986:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %2082.sub2_sub3:vreg_128_align2, %3066:vreg_64_align2, %2986:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
    %2095:vreg_128_align2 = DS_READ_B128_gfx9 %44:vgpr_32, 576, 0, implicit $exec
    %3038:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %2095.sub0_sub1:vreg_128_align2, %3082:vreg_64_align2, %3038:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
    %3038:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %2095.sub2_sub3:vreg_128_align2, %3066:vreg_64_align2, %3038:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
    %2108:vreg_128_align2 = DS_READ_B128_gfx9 %44:vgpr_32, 1152, 0, implicit $exec
    %2980:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %2108.sub0_sub1:vreg_128_align2, %3082:vreg_64_align2, %2980:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
    %2980:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %2108.sub2_sub3:vreg_128_align2, %3066:vreg_64_align2, %2980:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
    %2121:vreg_128_align2 = DS_READ_B128_gfx9 %44:vgpr_32, 1728, 0, implicit $exec
    %3003:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %2121.sub0_sub1:vreg_128_align2, %3082:vreg_64_align2, %3003:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
    %3003:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %2121.sub2_sub3:vreg_128_align2, %3066:vreg_64_align2, %3003:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
    %2134:vreg_128_align2 = DS_READ_B128_gfx9 %45:vgpr_32, 0, 0, implicit $exec
    %2986:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %2134.sub0_sub1:vreg_128_align2, %3050:vreg_64_align2, %2986:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
    %2986:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %2134.sub2_sub3:vreg_128_align2, %3033:vreg_64_align2, %2986:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
    %2146:vreg_128_align2 = DS_READ_B128_gfx9 %45:vgpr_32, 576, 0, implicit $exec
    %3038:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %2146.sub0_sub1:vreg_128_align2, %3050:vreg_64_align2, %3038:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
    %3038:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %2146.sub2_sub3:vreg_128_align2, %3033:vreg_64_align2, %3038:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
    %2158:vreg_128_align2 = DS_READ_B128_gfx9 %45:vgpr_32, 1152, 0, implicit $exec
    %2980:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %2158.sub0_sub1:vreg_128_align2, %3050:vreg_64_align2, %2980:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
    %2980:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %2158.sub2_sub3:vreg_128_align2, %3033:vreg_64_align2, %2980:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
    %2170:vreg_128_align2 = DS_READ_B128_gfx9 %45:vgpr_32, 1728, 0, implicit $exec
    %3003:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %2170.sub0_sub1:vreg_128_align2, %3050:vreg_64_align2, %3003:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
    %3003:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %2170.sub2_sub3:vreg_128_align2, %3033:vreg_64_align2, %3003:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec
    INLINEASM &"s_waitcnt vmcnt($0)", 57 /* sideeffect mayload maystore isconvergent attdialect */, 13 /* imm */, 8, !0
    %3345:vgpr_32 = V_ADD_U32_e32 %50:sreg_32, %3345:vgpr_32, implicit $exec
    %3344:vgpr_32 = V_ADD_U32_e32 %50:sreg_32, %3344:vgpr_32, implicit $exec
    %3343:vgpr_32 = V_ADD_U32_e32 %50:sreg_32, %3343:vgpr_32, implicit $exec
    %3342:vgpr_32 = V_ADD_U32_e32 %50:sreg_32, %3342:vgpr_32, implicit $exec
    %3341:vgpr_32 = V_ADD_U32_e32 %50:sreg_32, %3341:vgpr_32, implicit $exec
    %3340:vgpr_32 = V_ADD_U32_e32 %50:sreg_32, %3340:vgpr_32, implicit $exec
    %3339:vgpr_32 = V_ADD_U32_e32 %50:sreg_32, %3339:vgpr_32, implicit $exec
    %3338:vgpr_32 = V_ADD_U32_e32 %50:sreg_32, %3338:vgpr_32, implicit $exec
    %3337:vgpr_32 = V_ADD_U32_e32 %50:sreg_32, %3337:vgpr_32, implicit $exec
    %3336:vgpr_32 = V_ADD_U32_e32 %50:sreg_32, %3336:vgpr_32, implicit $exec
    %3335:vgpr_32 = V_ADD_U32_e32 %50:sreg_32, %3335:vgpr_32, implicit $exec
    %3334:vgpr_32 = V_ADD_U32_e32 %50:sreg_32, %3334:vgpr_32, implicit $exec
    %3333:vgpr_32 = V_ADD_U32_e32 %50:sreg_32, %3333:vgpr_32, implicit $exec
    %3332:vgpr_32 = V_ADD_U32_e32 %50:sreg_32, %3332:vgpr_32, implicit $exec
    %3331:vgpr_32 = V_ADD_U32_e32 %50:sreg_32, %3331:vgpr_32, implicit $exec
    %3330:vgpr_32 = V_ADD_U32_e32 %50:sreg_32, %3330:vgpr_32, implicit $exec
    %3329:vgpr_32 = nuw V_ADD_U32_e32 128, %3329:vgpr_32, implicit $exec
    S_ENDPGM 0
...