; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1031 < %s | FileCheck %s
; ModuleID = 'kernel_round1_passing.bc'
source_filename = "/tmp/comgr-295d04/input/CompileSource"
target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9"
target triple = "amdgcn-amd-amdhsa"
; Work-group scratch storage used by @kernel_round1. All three globals live in
; addrspace(3); the expected assembly accesses them with ds_* instructions.
;
; Byte scratch area; @kernel_round1 indexes it with a 14-byte stride per local id
; (896 = 64 * 14).
@kernel_round1.first_words_data = external hidden unnamed_addr addrspace(3) global [896 x i8], align 1
; Packed collision records appended by the first phase of @kernel_round1 and
; consumed by its second phase (3840 i32 entries = 15360 bytes).
@kernel_round1.collisionsData = external hidden unnamed_addr addrspace(3) global [3840 x i32], align 4
; Number of records written to collisionsData; reset to 0 at kernel entry and
; bumped through the atomic_inc builtin.
@kernel_round1.collisionsNum = external hidden addrspace(3) global i32, align 4
; External builtins called by the kernels in this file. The names are
; Itanium-mangled; the demangled form is given above each declaration.
; Attribute groups #0 and #1 are defined elsewhere in the file.
;
; get_global_id(unsigned int)
; Function Attrs: convergent mustprogress nofree nounwind willreturn memory(none)
declare hidden i64 @_Z13get_global_idj(i32 noundef) local_unnamed_addr #0
; atomic_add(volatile unsigned int addrspace(1)*, unsigned int)
; Function Attrs: convergent nounwind
declare hidden i32 @_Z10atomic_addPU3AS1Vjj(ptr addrspace(1) noundef, i32 noundef) local_unnamed_addr #1
; atomic_sub(volatile unsigned int addrspace(1)*, unsigned int)
; Function Attrs: convergent nounwind
declare hidden i32 @_Z10atomic_subPU3AS1Vjj(ptr addrspace(1) noundef, i32 noundef) local_unnamed_addr #1
; get_local_id(unsigned int)
; Function Attrs: convergent mustprogress nofree nounwind willreturn memory(none)
declare hidden i64 @_Z12get_local_idj(i32 noundef) local_unnamed_addr #0
; barrier(unsigned int)
; Function Attrs: convergent nounwind
declare hidden void @_Z7barrierj(i32 noundef) local_unnamed_addr #1
; min(unsigned int, unsigned int)
; Function Attrs: convergent mustprogress nofree nounwind willreturn memory(none)
declare hidden i32 @_Z3minjj(i32 noundef, i32 noundef) local_unnamed_addr #0
; atomic_inc(volatile unsigned int addrspace(3)*)
; Function Attrs: convergent nounwind
declare hidden i32 @_Z10atomic_incPU3AS3Vj(ptr addrspace(3) noundef) local_unnamed_addr #1
; get_local_size(unsigned int)
; Function Attrs: convergent mustprogress nofree nounwind willreturn memory(none)
declare hidden i64 @_Z14get_local_sizej(i32 noundef) local_unnamed_addr #0
; @kernel_round1: llc codegen test input. The expected-assembly lines directly
; below the define are autogenerated (see the NOTE at the top of the file) and
; must be regenerated by the script rather than edited by hand.
;
; Arguments (all addrspace(1) pointers):
;   %0  read-only byte buffer; read in the second phase in 384-byte rows of
;       32-byte slots, starting 8 bytes into each slot.
;   %1  write-only byte buffer; receives the output records of the second phase.
;   %2  read-only i32 array of packed 4-bit counters (eight per word), indexed
;       by the global id.
;   %3  i32 array of packed 4-bit counters, updated via atomic_add/atomic_sub.
;   %4  not referenced by the body (readnone).
;
; Outline of the IR body:
;   1. Reset collisionsNum, call barrier(1), fetch this work-item's 4-bit count
;      from %2 and clamp it with min(count, 12).
;   2. Zero that many bytes of this work-item's slice of first_words_data, then
;      compare the bytes pairwise; every equal pair appends a packed record to
;      collisionsData through atomic_inc on collisionsNum.
;   3. After a second barrier(1), walk the recorded collisions with a stride of
;      get_local_size(0), XOR the two referenced slots of %0 and, unless the
;      target counter in %3 is already above 11, store the result into %1.
; Function Attrs: convergent norecurse nounwind
define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture noundef readonly align 1 %0, ptr addrspace(1) nocapture noundef writeonly align 1 %1, ptr addrspace(1) nocapture noundef readonly align 4 %2, ptr addrspace(1) noundef align 4 %3, ptr addrspace(1) nocapture noundef readnone align 4 %4) local_unnamed_addr #2 !kernel_arg_addr_space !5 !kernel_arg_access_qual !6 !kernel_arg_type !7 !kernel_arg_base_type !7 !kernel_arg_type_qual !8 !kernel_arg_name !9 !reqd_work_group_size !10 {
; CHECK-LABEL: kernel_round1:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_add_u32 s10, s10, s15
; CHECK-NEXT: s_mov_b32 s32, 0
; CHECK-NEXT: s_addc_u32 s11, s11, 0
; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
; CHECK-NEXT: s_load_dwordx8 s[44:51], s[6:7], 0x0
; CHECK-NEXT: s_add_u32 s0, s0, s15
; CHECK-NEXT: s_mov_b64 s[34:35], s[6:7]
; CHECK-NEXT: s_addc_u32 s1, s1, 0
; CHECK-NEXT: v_mov_b32_e32 v40, v0
; CHECK-NEXT: s_add_u32 s42, s34, 40
; CHECK-NEXT: v_mov_b32_e32 v31, v0
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: s_mov_b64 s[36:37], s[8:9]
; CHECK-NEXT: s_addc_u32 s43, s35, 0
; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37]
; CHECK-NEXT: s_mov_b64 s[8:9], s[42:43]
; CHECK-NEXT: s_mov_b32 s33, s14
; CHECK-NEXT: s_mov_b32 s40, s13
; CHECK-NEXT: s_mov_b32 s41, s12
; CHECK-NEXT: s_mov_b64 s[38:39], s[4:5]
; CHECK-NEXT: s_getpc_b64 s[6:7]
; CHECK-NEXT: s_add_u32 s6, s6, _Z13get_global_idj@rel32@lo+4
; CHECK-NEXT: s_addc_u32 s7, s7, _Z13get_global_idj@rel32@hi+12
; CHECK-NEXT: v_mov_b32_e32 v45, 0
; CHECK-NEXT: s_swappc_b64 s[30:31], s[6:7]
; CHECK-NEXT: v_mov_b32_e32 v43, v0
; CHECK-NEXT: v_mov_b32_e32 v31, v40
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: s_mov_b64 s[4:5], s[38:39]
; CHECK-NEXT: s_mov_b64 s[8:9], s[42:43]
; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37]
; CHECK-NEXT: s_mov_b32 s12, s41
; CHECK-NEXT: s_mov_b32 s13, s40
; CHECK-NEXT: s_mov_b32 s14, s33
; CHECK-NEXT: s_getpc_b64 s[6:7]
; CHECK-NEXT: s_add_u32 s6, s6, _Z12get_local_idj@rel32@lo+4
; CHECK-NEXT: s_addc_u32 s7, s7, _Z12get_local_idj@rel32@hi+12
; CHECK-NEXT: s_swappc_b64 s[30:31], s[6:7]
; CHECK-NEXT: v_mov_b32_e32 v41, v0
; CHECK-NEXT: v_mov_b32_e32 v31, v40
; CHECK-NEXT: v_mov_b32_e32 v0, 1
; CHECK-NEXT: s_mov_b64 s[4:5], s[38:39]
; CHECK-NEXT: s_mov_b64 s[8:9], s[42:43]
; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37]
; CHECK-NEXT: s_mov_b32 s12, s41
; CHECK-NEXT: s_mov_b32 s13, s40
; CHECK-NEXT: s_mov_b32 s14, s33
; CHECK-NEXT: ds_write_b32 v45, v45 offset:15360
; CHECK-NEXT: s_getpc_b64 s[6:7]
; CHECK-NEXT: s_add_u32 s6, s6, _Z7barrierj@rel32@lo+4
; CHECK-NEXT: s_addc_u32 s7, s7, _Z7barrierj@rel32@hi+12
; CHECK-NEXT: s_swappc_b64 s[30:31], s[6:7]
; CHECK-NEXT: v_lshrrev_b32_e32 v0, 1, v43
; CHECK-NEXT: v_lshlrev_b32_e32 v1, 2, v43
; CHECK-NEXT: v_mov_b32_e32 v31, v40
; CHECK-NEXT: s_mov_b64 s[4:5], s[38:39]
; CHECK-NEXT: s_mov_b64 s[8:9], s[42:43]
; CHECK-NEXT: v_and_b32_e32 v0, 0x7ffffffc, v0
; CHECK-NEXT: v_and_b32_e32 v1, 28, v1
; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37]
; CHECK-NEXT: s_mov_b32 s12, s41
; CHECK-NEXT: s_mov_b32 s13, s40
; CHECK-NEXT: global_load_dword v0, v0, s[48:49]
; CHECK-NEXT: s_mov_b32 s14, s33
; CHECK-NEXT: s_getpc_b64 s[6:7]
; CHECK-NEXT: s_add_u32 s6, s6, _Z3minjj@rel32@lo+4
; CHECK-NEXT: s_addc_u32 s7, s7, _Z3minjj@rel32@hi+12
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_bfe_u32 v0, v0, v1, 4
; CHECK-NEXT: v_mov_b32_e32 v1, 12
; CHECK-NEXT: s_swappc_b64 s[30:31], s[6:7]
; CHECK-NEXT: v_mov_b32_e32 v42, v0
; CHECK-NEXT: s_mov_b32 s42, exec_lo
; CHECK-NEXT: v_cmpx_ne_u32_e32 0, v42
; CHECK-NEXT: s_cbranch_execz .LBB0_25
; CHECK-NEXT: ; %bb.1: ; %.preheader5
; CHECK-NEXT: v_mul_lo_u32 v0, v41, 14
; CHECK-NEXT: s_mov_b32 s4, 0
; CHECK-NEXT: s_mov_b32 s5, 0
; CHECK-NEXT: v_add_nc_u32_e32 v44, 0x3c04, v0
; CHECK-NEXT: .LBB0_2: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: v_add_nc_u32_e32 v1, s5, v44
; CHECK-NEXT: s_add_i32 s5, s5, 1
; CHECK-NEXT: v_cmp_eq_u32_e32 vcc_lo, s5, v42
; CHECK-NEXT: ds_write_b8 v1, v45
; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; CHECK-NEXT: s_cbranch_execnz .LBB0_2
; CHECK-NEXT: ; %bb.3:
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s4
; CHECK-NEXT: v_add_nc_u32_e32 v45, -1, v42
; CHECK-NEXT: s_mov_b32 s43, 0
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v45
; CHECK-NEXT: s_and_b32 exec_lo, exec_lo, vcc_lo
; CHECK-NEXT: s_cbranch_execz .LBB0_25
; CHECK-NEXT: ; %bb.4:
; CHECK-NEXT: v_lshlrev_b32_e32 v43, 10, v43
; CHECK-NEXT: v_add_nc_u32_e32 v46, 0x3c05, v0
; CHECK-NEXT: v_mov_b32_e32 v47, 0
; CHECK-NEXT: s_mov_b32 s49, 0
; CHECK-NEXT: .LBB0_5: ; =>This Loop Header: Depth=1
; CHECK-NEXT: ; Child Loop BB0_8 Depth 2
; CHECK-NEXT: ; Child Loop BB0_20 Depth 2
; CHECK-NEXT: v_add_nc_u32_e32 v0, s49, v44
; CHECK-NEXT: s_lshl_b32 s4, s49, 5
; CHECK-NEXT: s_add_i32 s48, s49, 1
; CHECK-NEXT: s_add_i32 s5, s49, 5
; CHECK-NEXT: v_or3_b32 v57, s4, v43, s48
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: ds_read_u8 v56, v0
; CHECK-NEXT: v_mov_b32_e32 v58, s48
; CHECK-NEXT: s_mov_b32 s52, exec_lo
; CHECK-NEXT: v_cmpx_lt_u32_e64 s5, v42
; CHECK-NEXT: s_cbranch_execz .LBB0_17
; CHECK-NEXT: ; %bb.6: ; %.preheader2
; CHECK-NEXT: ; in Loop: Header=BB0_5 Depth=1
; CHECK-NEXT: s_mov_b32 s53, 0
; CHECK-NEXT: s_mov_b32 s54, 0
; CHECK-NEXT: s_branch .LBB0_8
; CHECK-NEXT: .LBB0_7: ; in Loop: Header=BB0_8 Depth=2
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s55
; CHECK-NEXT: s_add_i32 s54, s54, 4
; CHECK-NEXT: s_add_i32 s4, s49, s54
; CHECK-NEXT: v_add_nc_u32_e32 v0, s54, v57
; CHECK-NEXT: s_add_i32 s5, s4, 5
; CHECK-NEXT: s_add_i32 s4, s4, 1
; CHECK-NEXT: v_cmp_ge_u32_e32 vcc_lo, s5, v42
; CHECK-NEXT: v_mov_b32_e32 v58, s4
; CHECK-NEXT: s_or_b32 s53, vcc_lo, s53
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s53
; CHECK-NEXT: s_cbranch_execz .LBB0_16
; CHECK-NEXT: .LBB0_8: ; Parent Loop BB0_5 Depth=1
; CHECK-NEXT: ; => This Inner Loop Header: Depth=2
; CHECK-NEXT: v_add_nc_u32_e32 v59, s54, v46
; CHECK-NEXT: v_add_nc_u32_e32 v58, s54, v57
; CHECK-NEXT: ds_read_u8 v0, v59
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_cmp_eq_u16_sdwa s4, v56, v0 src0_sel:BYTE_0 src1_sel:DWORD
; CHECK-NEXT: s_and_saveexec_b32 s55, s4
; CHECK-NEXT: s_cbranch_execz .LBB0_10
; CHECK-NEXT: ; %bb.9: ; in Loop: Header=BB0_8 Depth=2
; CHECK-NEXT: v_mov_b32_e32 v31, v40
; CHECK-NEXT: v_mov_b32_e32 v0, 0x3c00
; CHECK-NEXT: s_add_u32 s8, s34, 40
; CHECK-NEXT: s_addc_u32 s9, s35, 0
; CHECK-NEXT: s_mov_b64 s[4:5], s[38:39]
; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37]
; CHECK-NEXT: s_mov_b32 s12, s41
; CHECK-NEXT: s_mov_b32 s13, s40
; CHECK-NEXT: s_mov_b32 s14, s33
; CHECK-NEXT: v_add_nc_u32_e32 v47, 1, v47
; CHECK-NEXT: s_getpc_b64 s[6:7]
; CHECK-NEXT: s_add_u32 s6, s6, _Z10atomic_incPU3AS3Vj@rel32@lo+4
; CHECK-NEXT: s_addc_u32 s7, s7, _Z10atomic_incPU3AS3Vj@rel32@hi+12
; CHECK-NEXT: s_swappc_b64 s[30:31], s[6:7]
; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CHECK-NEXT: ds_write_b32 v0, v58
; CHECK-NEXT: .LBB0_10: ; in Loop: Header=BB0_8 Depth=2
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s55
; CHECK-NEXT: ds_read_u8 v0, v59 offset:1
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_cmp_eq_u16_sdwa s4, v56, v0 src0_sel:BYTE_0 src1_sel:DWORD
; CHECK-NEXT: s_and_saveexec_b32 s55, s4
; CHECK-NEXT: s_cbranch_execz .LBB0_12
; CHECK-NEXT: ; %bb.11: ; in Loop: Header=BB0_8 Depth=2
; CHECK-NEXT: v_mov_b32_e32 v31, v40
; CHECK-NEXT: v_mov_b32_e32 v0, 0x3c00
; CHECK-NEXT: s_add_u32 s8, s34, 40
; CHECK-NEXT: s_addc_u32 s9, s35, 0
; CHECK-NEXT: s_mov_b64 s[4:5], s[38:39]
; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37]
; CHECK-NEXT: s_mov_b32 s12, s41
; CHECK-NEXT: s_mov_b32 s13, s40
; CHECK-NEXT: s_mov_b32 s14, s33
; CHECK-NEXT: v_add_nc_u32_e32 v60, 1, v58
; CHECK-NEXT: v_add_nc_u32_e32 v47, 1, v47
; CHECK-NEXT: s_getpc_b64 s[6:7]
; CHECK-NEXT: s_add_u32 s6, s6, _Z10atomic_incPU3AS3Vj@rel32@lo+4
; CHECK-NEXT: s_addc_u32 s7, s7, _Z10atomic_incPU3AS3Vj@rel32@hi+12
; CHECK-NEXT: s_swappc_b64 s[30:31], s[6:7]
; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CHECK-NEXT: ds_write_b32 v0, v60
; CHECK-NEXT: .LBB0_12: ; in Loop: Header=BB0_8 Depth=2
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s55
; CHECK-NEXT: ds_read_u8 v0, v59 offset:2
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_cmp_eq_u16_sdwa s4, v56, v0 src0_sel:BYTE_0 src1_sel:DWORD
; CHECK-NEXT: s_and_saveexec_b32 s55, s4
; CHECK-NEXT: s_cbranch_execz .LBB0_14
; CHECK-NEXT: ; %bb.13: ; in Loop: Header=BB0_8 Depth=2
; CHECK-NEXT: v_mov_b32_e32 v31, v40
; CHECK-NEXT: v_mov_b32_e32 v0, 0x3c00
; CHECK-NEXT: s_add_u32 s8, s34, 40
; CHECK-NEXT: s_addc_u32 s9, s35, 0
; CHECK-NEXT: s_mov_b64 s[4:5], s[38:39]
; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37]
; CHECK-NEXT: s_mov_b32 s12, s41
; CHECK-NEXT: s_mov_b32 s13, s40
; CHECK-NEXT: s_mov_b32 s14, s33
; CHECK-NEXT: v_add_nc_u32_e32 v60, 2, v58
; CHECK-NEXT: v_add_nc_u32_e32 v47, 1, v47
; CHECK-NEXT: s_getpc_b64 s[6:7]
; CHECK-NEXT: s_add_u32 s6, s6, _Z10atomic_incPU3AS3Vj@rel32@lo+4
; CHECK-NEXT: s_addc_u32 s7, s7, _Z10atomic_incPU3AS3Vj@rel32@hi+12
; CHECK-NEXT: s_swappc_b64 s[30:31], s[6:7]
; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CHECK-NEXT: ds_write_b32 v0, v60
; CHECK-NEXT: .LBB0_14: ; in Loop: Header=BB0_8 Depth=2
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s55
; CHECK-NEXT: ds_read_u8 v0, v59 offset:3
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_cmp_eq_u16_sdwa s4, v56, v0 src0_sel:BYTE_0 src1_sel:DWORD
; CHECK-NEXT: s_and_saveexec_b32 s55, s4
; CHECK-NEXT: s_cbranch_execz .LBB0_7
; CHECK-NEXT: ; %bb.15: ; in Loop: Header=BB0_8 Depth=2
; CHECK-NEXT: v_mov_b32_e32 v31, v40
; CHECK-NEXT: v_mov_b32_e32 v0, 0x3c00
; CHECK-NEXT: s_add_u32 s8, s34, 40
; CHECK-NEXT: s_addc_u32 s9, s35, 0
; CHECK-NEXT: s_mov_b64 s[4:5], s[38:39]
; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37]
; CHECK-NEXT: s_mov_b32 s12, s41
; CHECK-NEXT: s_mov_b32 s13, s40
; CHECK-NEXT: s_mov_b32 s14, s33
; CHECK-NEXT: v_add_nc_u32_e32 v58, 3, v58
; CHECK-NEXT: v_add_nc_u32_e32 v47, 1, v47
; CHECK-NEXT: s_getpc_b64 s[6:7]
; CHECK-NEXT: s_add_u32 s6, s6, _Z10atomic_incPU3AS3Vj@rel32@lo+4
; CHECK-NEXT: s_addc_u32 s7, s7, _Z10atomic_incPU3AS3Vj@rel32@hi+12
; CHECK-NEXT: s_swappc_b64 s[30:31], s[6:7]
; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CHECK-NEXT: ds_write_b32 v0, v58
; CHECK-NEXT: s_branch .LBB0_7
; CHECK-NEXT: .LBB0_16: ; %Flow45
; CHECK-NEXT: ; in Loop: Header=BB0_5 Depth=1
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s53
; CHECK-NEXT: v_mov_b32_e32 v57, v0
; CHECK-NEXT: .LBB0_17: ; %Flow46
; CHECK-NEXT: ; in Loop: Header=BB0_5 Depth=1
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s52
; CHECK-NEXT: s_mov_b32 s49, exec_lo
; CHECK-NEXT: v_cmpx_lt_u32_e64 v58, v42
; CHECK-NEXT: s_cbranch_execz .LBB0_23
; CHECK-NEXT: ; %bb.18: ; %.preheader
; CHECK-NEXT: ; in Loop: Header=BB0_5 Depth=1
; CHECK-NEXT: s_mov_b32 s52, 0
; CHECK-NEXT: s_inst_prefetch 0x1
; CHECK-NEXT: s_branch .LBB0_20
; CHECK-NEXT: .p2align 6
; CHECK-NEXT: .LBB0_19: ; in Loop: Header=BB0_20 Depth=2
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s53
; CHECK-NEXT: v_add_nc_u32_e32 v58, 1, v58
; CHECK-NEXT: v_add_nc_u32_e32 v57, 1, v57
; CHECK-NEXT: v_cmp_ge_u32_e32 vcc_lo, v58, v42
; CHECK-NEXT: s_or_b32 s52, vcc_lo, s52
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s52
; CHECK-NEXT: s_cbranch_execz .LBB0_22
; CHECK-NEXT: .LBB0_20: ; Parent Loop BB0_5 Depth=1
; CHECK-NEXT: ; => This Inner Loop Header: Depth=2
; CHECK-NEXT: v_add_nc_u32_e32 v0, v44, v58
; CHECK-NEXT: ds_read_u8 v0, v0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_cmp_eq_u16_sdwa s4, v56, v0 src0_sel:BYTE_0 src1_sel:DWORD
; CHECK-NEXT: s_and_saveexec_b32 s53, s4
; CHECK-NEXT: s_cbranch_execz .LBB0_19
; CHECK-NEXT: ; %bb.21: ; in Loop: Header=BB0_20 Depth=2
; CHECK-NEXT: v_mov_b32_e32 v31, v40
; CHECK-NEXT: v_mov_b32_e32 v0, 0x3c00
; CHECK-NEXT: s_add_u32 s8, s34, 40
; CHECK-NEXT: s_addc_u32 s9, s35, 0
; CHECK-NEXT: s_mov_b64 s[4:5], s[38:39]
; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37]
; CHECK-NEXT: s_mov_b32 s12, s41
; CHECK-NEXT: s_mov_b32 s13, s40
; CHECK-NEXT: s_mov_b32 s14, s33
; CHECK-NEXT: v_add_nc_u32_e32 v47, 1, v47
; CHECK-NEXT: s_getpc_b64 s[6:7]
; CHECK-NEXT: s_add_u32 s6, s6, _Z10atomic_incPU3AS3Vj@rel32@lo+4
; CHECK-NEXT: s_addc_u32 s7, s7, _Z10atomic_incPU3AS3Vj@rel32@hi+12
; CHECK-NEXT: s_swappc_b64 s[30:31], s[6:7]
; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CHECK-NEXT: ds_write_b32 v0, v57
; CHECK-NEXT: s_branch .LBB0_19
; CHECK-NEXT: .LBB0_22: ; %Flow43
; CHECK-NEXT: ; in Loop: Header=BB0_5 Depth=1
; CHECK-NEXT: s_inst_prefetch 0x2
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s52
; CHECK-NEXT: .LBB0_23: ; %Flow44
; CHECK-NEXT: ; in Loop: Header=BB0_5 Depth=1
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s49
; CHECK-NEXT: ; %bb.24: ; in Loop: Header=BB0_5 Depth=1
; CHECK-NEXT: v_cmp_ge_u32_e32 vcc_lo, s48, v45
; CHECK-NEXT: v_cmp_lt_u32_e64 s4, 59, v47
; CHECK-NEXT: v_add_nc_u32_e32 v46, 1, v46
; CHECK-NEXT: s_mov_b32 s49, s48
; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4
; CHECK-NEXT: s_and_b32 s4, exec_lo, s4
; CHECK-NEXT: s_or_b32 s43, s4, s43
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s43
; CHECK-NEXT: s_cbranch_execnz .LBB0_5
; CHECK-NEXT: .LBB0_25: ; %Flow51
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s42
; CHECK-NEXT: v_mov_b32_e32 v31, v40
; CHECK-NEXT: v_mov_b32_e32 v0, 1
; CHECK-NEXT: s_add_u32 s8, s34, 40
; CHECK-NEXT: s_addc_u32 s9, s35, 0
; CHECK-NEXT: s_mov_b64 s[4:5], s[38:39]
; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37]
; CHECK-NEXT: s_mov_b32 s12, s41
; CHECK-NEXT: s_mov_b32 s13, s40
; CHECK-NEXT: s_mov_b32 s14, s33
; CHECK-NEXT: s_getpc_b64 s[6:7]
; CHECK-NEXT: s_add_u32 s6, s6, _Z7barrierj@rel32@lo+4
; CHECK-NEXT: s_addc_u32 s7, s7, _Z7barrierj@rel32@hi+12
; CHECK-NEXT: s_swappc_b64 s[30:31], s[6:7]
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: s_mov_b32 s4, exec_lo
; CHECK-NEXT: ds_read_b32 v47, v0 offset:15360
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_cmpx_gt_u32_e64 v47, v41
; CHECK-NEXT: s_cbranch_execz .LBB0_33
; CHECK-NEXT: ; %bb.26:
; CHECK-NEXT: s_mov_b32 s42, 0
; CHECK-NEXT: s_branch .LBB0_28
; CHECK-NEXT: .LBB0_27: ; in Loop: Header=BB0_28 Depth=1
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s43
; CHECK-NEXT: v_mov_b32_e32 v31, v40
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: s_add_u32 s8, s34, 40
; CHECK-NEXT: s_addc_u32 s9, s35, 0
; CHECK-NEXT: s_mov_b64 s[4:5], s[38:39]
; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37]
; CHECK-NEXT: s_mov_b32 s12, s41
; CHECK-NEXT: s_mov_b32 s13, s40
; CHECK-NEXT: s_mov_b32 s14, s33
; CHECK-NEXT: s_getpc_b64 s[6:7]
; CHECK-NEXT: s_add_u32 s6, s6, _Z14get_local_sizej@rel32@lo+4
; CHECK-NEXT: s_addc_u32 s7, s7, _Z14get_local_sizej@rel32@hi+12
; CHECK-NEXT: s_swappc_b64 s[30:31], s[6:7]
; CHECK-NEXT: v_add_co_u32 v41, vcc_lo, v0, v41
; CHECK-NEXT: v_cmp_le_u32_e32 vcc_lo, v47, v41
; CHECK-NEXT: s_or_b32 s42, vcc_lo, s42
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s42
; CHECK-NEXT: s_cbranch_execz .LBB0_33
; CHECK-NEXT: .LBB0_28: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v41
; CHECK-NEXT: s_mov_b32 s43, exec_lo
; CHECK-NEXT: ds_read_b32 v0, v0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_lshrrev_b32_e32 v63, 10, v0
; CHECK-NEXT: v_bfe_u32 v62, v0, 5, 5
; CHECK-NEXT: v_and_b32_e32 v72, 31, v0
; CHECK-NEXT: v_mul_u32_u24_e32 v1, 0x180, v63
; CHECK-NEXT: v_lshlrev_b32_e32 v0, 5, v62
; CHECK-NEXT: v_lshlrev_b32_e32 v4, 5, v72
; CHECK-NEXT: v_add_co_u32 v2, s4, s44, v1
; CHECK-NEXT: v_add_co_ci_u32_e64 v3, null, s45, 0, s4
; CHECK-NEXT: v_add_co_u32 v0, vcc_lo, v2, v0
; CHECK-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v3, vcc_lo
; CHECK-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4
; CHECK-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
; CHECK-NEXT: s_clause 0x1
; CHECK-NEXT: global_load_dwordx4 v[4:7], v[0:1], off offset:8
; CHECK-NEXT: global_load_dwordx4 v[8:11], v[2:3], off offset:8
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_xor_b32_e32 v46, v9, v5
; CHECK-NEXT: v_xor_b32_e32 v45, v8, v4
; CHECK-NEXT: v_xor_b32_e32 v57, v11, v7
; CHECK-NEXT: v_xor_b32_e32 v56, v10, v6
; CHECK-NEXT: v_or_b32_e32 v5, v46, v57
; CHECK-NEXT: v_or_b32_e32 v4, v45, v56
; CHECK-NEXT: v_cmpx_ne_u64_e32 0, v[4:5]
; CHECK-NEXT: s_cbranch_execz .LBB0_27
; CHECK-NEXT: ; %bb.29: ; in Loop: Header=BB0_28 Depth=1
; CHECK-NEXT: s_clause 0x1
; CHECK-NEXT: global_load_dwordx2 v[58:59], v[2:3], off offset:24
; CHECK-NEXT: global_load_dwordx2 v[60:61], v[0:1], off offset:24
; CHECK-NEXT: v_lshlrev_b32_e32 v0, 4, v45
; CHECK-NEXT: v_alignbit_b32 v1, v46, v45, 12
; CHECK-NEXT: v_and_b32_e32 v2, 0xf0000, v45
; CHECK-NEXT: v_mov_b32_e32 v31, v40
; CHECK-NEXT: s_add_u32 s8, s34, 40
; CHECK-NEXT: v_and_b32_e32 v3, 0xf000, v0
; CHECK-NEXT: v_and_b32_e32 v4, 0xf00, v1
; CHECK-NEXT: v_and_b32_e32 v0, 0xf0, v0
; CHECK-NEXT: v_and_b32_e32 v1, 15, v1
; CHECK-NEXT: s_addc_u32 s9, s35, 0
; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37]
; CHECK-NEXT: v_or3_b32 v2, v3, v2, v4
; CHECK-NEXT: s_mov_b32 s12, s41
; CHECK-NEXT: s_mov_b32 s13, s40
; CHECK-NEXT: s_mov_b32 s14, s33
; CHECK-NEXT: s_getpc_b64 s[6:7]
; CHECK-NEXT: s_add_u32 s6, s6, _Z10atomic_addPU3AS1Vjj@rel32@lo+4
; CHECK-NEXT: s_addc_u32 s7, s7, _Z10atomic_addPU3AS1Vjj@rel32@hi+12
; CHECK-NEXT: v_or3_b32 v73, v2, v0, v1
; CHECK-NEXT: v_lshrrev_b32_e32 v0, 1, v73
; CHECK-NEXT: v_lshlrev_b32_e32 v1, 2, v73
; CHECK-NEXT: v_and_b32_e32 v0, 0x7fffc, v0
; CHECK-NEXT: v_lshlrev_b32_e64 v44, v1, 1
; CHECK-NEXT: v_and_b32_e32 v74, 28, v1
; CHECK-NEXT: v_add_co_u32 v42, s4, s50, v0
; CHECK-NEXT: v_add_co_ci_u32_e64 v43, null, s51, 0, s4
; CHECK-NEXT: v_mov_b32_e32 v2, v44
; CHECK-NEXT: v_mov_b32_e32 v0, v42
; CHECK-NEXT: s_mov_b64 s[4:5], s[38:39]
; CHECK-NEXT: v_mov_b32_e32 v1, v43
; CHECK-NEXT: s_swappc_b64 s[30:31], s[6:7]
; CHECK-NEXT: v_bfe_u32 v0, v0, v74, 4
; CHECK-NEXT: s_mov_b32 s4, exec_lo
; CHECK-NEXT: v_cmpx_gt_u32_e32 12, v0
; CHECK-NEXT: s_xor_b32 s4, exec_lo, s4
; CHECK-NEXT: s_cbranch_execz .LBB0_31
; CHECK-NEXT: ; %bb.30: ; in Loop: Header=BB0_28 Depth=1
; CHECK-NEXT: v_xor_b32_e32 v4, v60, v58
; CHECK-NEXT: v_lshrrev_b64 v[2:3], 16, v[56:57]
; CHECK-NEXT: v_mad_u64_u32 v[6:7], null, 0x180, v73, s[46:47]
; CHECK-NEXT: v_lshlrev_b32_e32 v10, 5, v0
; CHECK-NEXT: v_lshlrev_b32_e32 v1, 16, v4
; CHECK-NEXT: v_lshlrev_b32_e32 v8, 6, v72
; CHECK-NEXT: v_lshlrev_b32_e32 v9, 12, v63
; CHECK-NEXT: v_xor_b32_e32 v5, v61, v59
; CHECK-NEXT: v_lshlrev_b32_e32 v11, 16, v56
; CHECK-NEXT: v_or_b32_e32 v3, v1, v3
; CHECK-NEXT: v_lshrrev_b64 v[0:1], 16, v[45:46]
; CHECK-NEXT: v_add_co_u32 v6, vcc_lo, v6, v10
; CHECK-NEXT: v_or3_b32 v8, v8, v9, v62
; CHECK-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v7, vcc_lo
; CHECK-NEXT: v_lshrrev_b64 v[4:5], 16, v[4:5]
; CHECK-NEXT: v_or_b32_e32 v1, v11, v1
; CHECK-NEXT: ; implicit-def: $vgpr42
; CHECK-NEXT: ; implicit-def: $vgpr43
; CHECK-NEXT: ; implicit-def: $vgpr44
; CHECK-NEXT: global_store_dword v[6:7], v8, off offset:4
; CHECK-NEXT: global_store_dwordx4 v[6:7], v[0:3], off offset:8
; CHECK-NEXT: global_store_dwordx2 v[6:7], v[4:5], off offset:24
; CHECK-NEXT: .LBB0_31: ; %Flow
; CHECK-NEXT: ; in Loop: Header=BB0_28 Depth=1
; CHECK-NEXT: s_andn2_saveexec_b32 s4, s4
; CHECK-NEXT: s_cbranch_execz .LBB0_27
; CHECK-NEXT: ; %bb.32: ; in Loop: Header=BB0_28 Depth=1
; CHECK-NEXT: v_mov_b32_e32 v31, v40
; CHECK-NEXT: v_mov_b32_e32 v0, v42
; CHECK-NEXT: v_mov_b32_e32 v1, v43
; CHECK-NEXT: v_mov_b32_e32 v2, v44
; CHECK-NEXT: s_add_u32 s8, s34, 40
; CHECK-NEXT: s_addc_u32 s9, s35, 0
; CHECK-NEXT: s_mov_b64 s[4:5], s[38:39]
; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37]
; CHECK-NEXT: s_mov_b32 s12, s41
; CHECK-NEXT: s_mov_b32 s13, s40
; CHECK-NEXT: s_mov_b32 s14, s33
; CHECK-NEXT: s_getpc_b64 s[6:7]
; CHECK-NEXT: s_add_u32 s6, s6, _Z10atomic_subPU3AS1Vjj@rel32@lo+4
; CHECK-NEXT: s_addc_u32 s7, s7, _Z10atomic_subPU3AS1Vjj@rel32@hi+12
; CHECK-NEXT: s_swappc_b64 s[30:31], s[6:7]
; CHECK-NEXT: s_branch .LBB0_27
; CHECK-NEXT: .LBB0_33:
; CHECK-NEXT: s_endpgm
; Entry block: fetch ids; %11 points at this work-item's 14-byte slice of
; first_words_data (local id * 14).
%6 = tail call i64 @_Z13get_global_idj(i32 noundef 0) #4
%7 = trunc i64 %6 to i32
%8 = tail call i64 @_Z12get_local_idj(i32 noundef 0) #4
%9 = trunc i64 %8 to i32
%10 = mul i32 %9, 14
%11 = getelementptr inbounds i8, ptr addrspace(3) @kernel_round1.first_words_data, i32 %10
; Every work-item stores 0 to collisionsNum before the first barrier(1).
store i32 0, ptr addrspace(3) @kernel_round1.collisionsNum, align 4, !tbaa !11
tail call void @_Z7barrierj(i32 noundef 1) #5
; Read the 4-bit counter for this global id from %2: word index = gid >> 3,
; bit offset = (gid << 2) & 28, i.e. eight nibbles per i32.
%12 = lshr i64 %6, 3
%13 = shl i32 %7, 2
%14 = and i32 %13, 28
%15 = and i64 %12, 536870911
%16 = getelementptr inbounds i32, ptr addrspace(1) %2, i64 %15
%17 = load i32, ptr addrspace(1) %16, align 4, !tbaa !11
%18 = lshr i32 %17, %14
%19 = and i32 %18, 15
; %20 = min(counter, 12); nothing to do in the first phase when it is zero.
%20 = tail call i32 @_Z3minjj(i32 noundef %19, i32 noundef 12) #4
%21 = icmp eq i32 %20, 0
br i1 %21, label %119, label %27
; Exit of the zeroing loop: %23 = %20 - 1 bounds the outer compare loop; with a
; single element there are no pairs to compare.
22: ; preds = %27
%23 = add i32 %20, -1
%24 = icmp eq i32 %23, 0
br i1 %24, label %119, label %25
; Preheader of the outer compare loop: global id shifted into bits 10 and up of
; the packed collision record.
25: ; preds = %22
%26 = shl i32 %7, 10
br label %37
; Zeroing loop: store a 0 byte to %11[0 .. %20).
27: ; preds = %5, %27
%28 = phi i32 [ %30, %27 ], [ 0, %5 ]
%29 = getelementptr inbounds i8, ptr addrspace(3) %11, i32 %28
store i8 0, ptr addrspace(3) %29, align 1, !tbaa !15
%30 = add nuw i32 %28, 1
%31 = icmp eq i32 %30, %20
br i1 %31, label %22, label %27
; Outer-loop latch: continue while the next i (%44) is below %20 - 1 and fewer
; than 60 matches (%33) have been counted by this work-item.
32: ; preds = %114, %48
%33 = phi i32 [ %50, %48 ], [ %115, %114 ]
%34 = icmp ult i32 %44, %23
%35 = icmp ult i32 %33, 60
%36 = select i1 %34, i1 %35, i1 false
br i1 %36, label %37, label %119
; Outer-loop header: i = %38, running match count = %39. %41 is byte i of the
; slice; %45 = (i << 5) | (gid << 10) | (i + 1) is the record for the pair
; (i, i + 1). Take the 4x-unrolled inner loop only if i + 5 < %20.
37: ; preds = %32, %25
%38 = phi i32 [ 0, %25 ], [ %44, %32 ]
%39 = phi i32 [ 0, %25 ], [ %33, %32 ]
%40 = getelementptr inbounds i8, ptr addrspace(3) %11, i32 %38
%41 = load i8, ptr addrspace(3) %40, align 1, !tbaa !15
%42 = shl i32 %38, 5
%43 = or i32 %42, %26
%44 = add nuw i32 %38, 1
%45 = or i32 %43, %44
%46 = add i32 %38, 5
%47 = icmp ult i32 %46, %20
br i1 %47, label %53, label %48
; Join after the unrolled loop (or directly from the header): %51 is the next j
; still to compare, %49 its record, %50 the match count. Enter the remainder
; loop if any j is left.
48: ; preds = %98, %37
%49 = phi i32 [ %45, %37 ], [ %100, %98 ]
%50 = phi i32 [ %39, %37 ], [ %99, %98 ]
%51 = phi i32 [ %44, %37 ], [ %54, %98 ]
%52 = icmp ult i32 %51, %20
br i1 %52, label %103, label %32
; Inner loop, unrolled by four: j = %55, record for j = %57, %54 = j + 4.
; Each of the four steps compares byte j + k with %41 and, on equality, bumps
; the match count, reserves an index with atomic_inc on collisionsNum and
; stores the record (%57 + k) to collisionsData at that index.
53: ; preds = %37, %98
%54 = phi i32 [ %101, %98 ], [ %46, %37 ]
%55 = phi i32 [ %54, %98 ], [ %44, %37 ]
%56 = phi i32 [ %99, %98 ], [ %39, %37 ]
%57 = phi i32 [ %100, %98 ], [ %45, %37 ]
%58 = getelementptr inbounds i8, ptr addrspace(3) %11, i32 %55
%59 = load i8, ptr addrspace(3) %58, align 1, !tbaa !15
%60 = icmp eq i8 %41, %59
br i1 %60, label %61, label %65
61: ; preds = %53
%62 = add i32 %56, 1
%63 = tail call i32 @_Z10atomic_incPU3AS3Vj(ptr addrspace(3) noundef @kernel_round1.collisionsNum) #5
%64 = getelementptr inbounds i32, ptr addrspace(3) @kernel_round1.collisionsData, i32 %63
store i32 %57, ptr addrspace(3) %64, align 4, !tbaa !11
br label %65
; Unrolled step k = 1.
65: ; preds = %61, %53
%66 = phi i32 [ %62, %61 ], [ %56, %53 ]
%67 = add i32 %55, 1
%68 = getelementptr inbounds i8, ptr addrspace(3) %11, i32 %67
%69 = load i8, ptr addrspace(3) %68, align 1, !tbaa !15
%70 = icmp eq i8 %41, %69
br i1 %70, label %71, label %76
71: ; preds = %65
%72 = add i32 %57, 1
%73 = add i32 %66, 1
%74 = tail call i32 @_Z10atomic_incPU3AS3Vj(ptr addrspace(3) noundef @kernel_round1.collisionsNum) #5
%75 = getelementptr inbounds i32, ptr addrspace(3) @kernel_round1.collisionsData, i32 %74
store i32 %72, ptr addrspace(3) %75, align 4, !tbaa !11
br label %76
; Unrolled step k = 2.
76: ; preds = %71, %65
%77 = phi i32 [ %73, %71 ], [ %66, %65 ]
%78 = add i32 %55, 2
%79 = getelementptr inbounds i8, ptr addrspace(3) %11, i32 %78
%80 = load i8, ptr addrspace(3) %79, align 1, !tbaa !15
%81 = icmp eq i8 %41, %80
br i1 %81, label %82, label %87
82: ; preds = %76
%83 = add i32 %57, 2
%84 = add i32 %77, 1
%85 = tail call i32 @_Z10atomic_incPU3AS3Vj(ptr addrspace(3) noundef @kernel_round1.collisionsNum) #5
%86 = getelementptr inbounds i32, ptr addrspace(3) @kernel_round1.collisionsData, i32 %85
store i32 %83, ptr addrspace(3) %86, align 4, !tbaa !11
br label %87
; Unrolled step k = 3.
87: ; preds = %82, %76
%88 = phi i32 [ %84, %82 ], [ %77, %76 ]
%89 = add i32 %55, 3
%90 = getelementptr inbounds i8, ptr addrspace(3) %11, i32 %89
%91 = load i8, ptr addrspace(3) %90, align 1, !tbaa !15
%92 = icmp eq i8 %41, %91
br i1 %92, label %93, label %98
93: ; preds = %87
%94 = add i32 %57, 3
%95 = add i32 %88, 1
%96 = tail call i32 @_Z10atomic_incPU3AS3Vj(ptr addrspace(3) noundef @kernel_round1.collisionsNum) #5
%97 = getelementptr inbounds i32, ptr addrspace(3) @kernel_round1.collisionsData, i32 %96
store i32 %94, ptr addrspace(3) %97, align 4, !tbaa !11
br label %98
; Unrolled-loop latch: advance record and look-ahead index by 4; stay in the
; unrolled loop while the look-ahead index is still below %20.
98: ; preds = %93, %87
%99 = phi i32 [ %95, %93 ], [ %88, %87 ]
%100 = add i32 %57, 4
%101 = add i32 %54, 4
%102 = icmp ult i32 %101, %20
br i1 %102, label %53, label %48
; Remainder loop: one j (%104) per iteration, same compare-and-record step as
; the unrolled loop; %106 is the record for j, %105 the match count.
103: ; preds = %48, %114
%104 = phi i32 [ %117, %114 ], [ %51, %48 ]
%105 = phi i32 [ %115, %114 ], [ %50, %48 ]
%106 = phi i32 [ %116, %114 ], [ %49, %48 ]
%107 = getelementptr inbounds i8, ptr addrspace(3) %11, i32 %104
%108 = load i8, ptr addrspace(3) %107, align 1, !tbaa !15
%109 = icmp eq i8 %41, %108
br i1 %109, label %110, label %114
110: ; preds = %103
%111 = add i32 %105, 1
%112 = tail call i32 @_Z10atomic_incPU3AS3Vj(ptr addrspace(3) noundef @kernel_round1.collisionsNum) #5
%113 = getelementptr inbounds i32, ptr addrspace(3) @kernel_round1.collisionsData, i32 %112
store i32 %106, ptr addrspace(3) %113, align 4, !tbaa !11
br label %114
114: ; preds = %110, %103
%115 = phi i32 [ %111, %110 ], [ %105, %103 ]
%116 = add i32 %106, 1
%117 = add nuw i32 %104, 1
%118 = icmp ult i32 %117, %20
br i1 %118, label %103, label %32
; Second phase: after barrier(1), read the total number of records (%120) and
; process entries local id, local id + local size, ... while below %120.
119: ; preds = %32, %22, %5
tail call void @_Z7barrierj(i32 noundef 1) #5
%120 = load i32, ptr addrspace(3) @kernel_round1.collisionsNum, align 4, !tbaa !11
%121 = icmp ugt i32 %120, %9
br i1 %121, label %122, label %206
; Loop preheader: all reads from %0 are relative to byte offset 8.
122: ; preds = %119
%123 = getelementptr inbounds i8, ptr addrspace(1) %0, i64 8
br label %124
; Loop header: %125 is the record index. Unpack the record %129 into
; %130 = bits 10 and up (row), %132 = bits 5..9, %133 = bits 0..4 (two slots).
; Row stride is 384 bytes, slot stride 32 bytes. Load the first two i64 words
; of both slots and XOR them; if both XORs are zero the entry is skipped.
124: ; preds = %201, %122
%125 = phi i32 [ %9, %122 ], [ %204, %201 ]
%126 = phi i64 [ %8, %122 ], [ %203, %201 ]
%127 = and i64 %126, 4294967295
%128 = getelementptr inbounds i32, ptr addrspace(3) @kernel_round1.collisionsData, i32 %125
%129 = load i32, ptr addrspace(3) %128, align 4, !tbaa !11
%130 = lshr i32 %129, 10
%131 = lshr i32 %129, 5
%132 = and i32 %131, 31
%133 = and i32 %129, 31
%134 = mul nuw nsw i32 %130, 384
%135 = zext i32 %134 to i64
%136 = getelementptr inbounds i8, ptr addrspace(1) %123, i64 %135
%137 = shl nuw nsw i32 %132, 5
%138 = zext i32 %137 to i64
%139 = getelementptr inbounds i8, ptr addrspace(1) %136, i64 %138
%140 = shl nuw nsw i32 %133, 5
%141 = zext i32 %140 to i64
%142 = getelementptr inbounds i8, ptr addrspace(1) %136, i64 %141
%143 = getelementptr inbounds i64, ptr addrspace(1) %139, i64 1
%144 = load i64, ptr addrspace(1) %139, align 8, !tbaa !16
%145 = getelementptr inbounds i64, ptr addrspace(1) %142, i64 1
%146 = load i64, ptr addrspace(1) %142, align 8, !tbaa !16
%147 = xor i64 %146, %144
%148 = load i64, ptr addrspace(1) %143, align 8, !tbaa !16
%149 = load i64, ptr addrspace(1) %145, align 8, !tbaa !16
%150 = xor i64 %149, %148
%151 = icmp ne i64 %147, 0
%152 = icmp ne i64 %150, 0
%153 = select i1 %151, i1 true, i1 %152
br i1 %153, label %154, label %201
; Non-zero XOR: load the third i64 word of both slots, then build a 20-bit
; index %169 by rearranging nibbles of the low 24 bits of %147:
;   bits 16..19 <- bits 16..19, bits 12..15 <- bits 8..11,
;   bits 8..11  <- bits 20..23, bits 4..7   <- bits 0..3,
;   bits 0..3   <- bits 12..15.
; %169 selects a 4-bit counter in %3 (word %169 >> 3, bit offset
; (%169 << 2) & 28); atomic_add increments it and %178 is the previous value
; of that nibble.
154: ; preds = %124
%155 = getelementptr inbounds i64, ptr addrspace(1) %142, i64 2
%156 = load i64, ptr addrspace(1) %155, align 8, !tbaa !16
%157 = getelementptr inbounds i64, ptr addrspace(1) %139, i64 2
%158 = load i64, ptr addrspace(1) %157, align 8, !tbaa !16
%159 = and i64 %147, 983040
%160 = shl i64 %147, 4
%161 = and i64 %160, 61440
%162 = or i64 %161, %159
%163 = lshr i64 %147, 12
%164 = and i64 %163, 3840
%165 = or i64 %162, %164
%166 = and i64 %160, 240
%167 = or i64 %165, %166
%168 = and i64 %163, 15
%169 = or i64 %167, %168
%170 = trunc i64 %169 to i32
%171 = lshr i64 %169, 3
%172 = shl nuw nsw i32 %170, 2
%173 = and i32 %172, 28
%174 = getelementptr inbounds i32, ptr addrspace(1) %3, i64 %171
%175 = shl nuw nsw i32 1, %173
%176 = tail call i32 @_Z10atomic_addPU3AS1Vjj(ptr addrspace(1) noundef %174, i32 noundef %175) #5
%177 = lshr i32 %176, %173
%178 = and i32 %177, 15
%179 = icmp ugt i32 %178, 11
br i1 %179, label %180, label %182
; Previous counter value was above 11: undo the increment with atomic_sub and
; write nothing.
180: ; preds = %154
%181 = tail call i32 @_Z10atomic_subPU3AS1Vjj(ptr addrspace(1) noundef %174, i32 noundef %175) #5
br label %201
; Otherwise emit an output record. The three XOR words are shifted right by 16
; bits as one 192-bit value (fshl by 48 on adjacent word pairs, plain lshr on
; the top word). %190 repacks the record as (%133 << 6) | (%130 << 12) | %132.
; Destination: %1 + ((%169 * 384) & 0xFFFFFF80) + ((%178 << 5) | 8); the i32
; goes at offset -4 and the three i64 words at offsets 0, 8 and 16 from there.
182: ; preds = %154
%183 = xor i64 %158, %156
%184 = lshr i64 %183, 16
%185 = tail call i64 @llvm.fshl.i64(i64 %183, i64 %150, i64 48)
%186 = tail call i64 @llvm.fshl.i64(i64 %150, i64 %147, i64 48)
%187 = shl nuw nsw i32 %133, 6
%188 = shl i32 %130, 12
%189 = or i32 %187, %188
%190 = or i32 %189, %132
%191 = mul nuw nsw i64 %169, 384
%192 = and i64 %191, 4294967168
%193 = getelementptr inbounds i8, ptr addrspace(1) %1, i64 %192
%194 = shl nuw nsw i32 %178, 5
%195 = or disjoint i32 %194, 8
%196 = zext i32 %195 to i64
%197 = getelementptr inbounds i8, ptr addrspace(1) %193, i64 %196
%198 = getelementptr inbounds i8, ptr addrspace(1) %197, i64 -4
store i32 %190, ptr addrspace(1) %198, align 4, !tbaa !11
store i64 %186, ptr addrspace(1) %197, align 8, !tbaa !16
%199 = getelementptr inbounds i8, ptr addrspace(1) %197, i64 8
store i64 %185, ptr addrspace(1) %199, align 8, !tbaa !16
%200 = getelementptr inbounds i8, ptr addrspace(1) %197, i64 16
store i64 %184, ptr addrspace(1) %200, align 8, !tbaa !16
br label %201
; Loop latch: advance the record index by get_local_size(0) and continue while
; it is still below the record count %120.
201: ; preds = %182, %180, %124
%202 = tail call i64 @_Z14get_local_sizej(i32 noundef 0) #4
%203 = add i64 %202, %127
%204 = trunc i64 %203 to i32
%205 = icmp ugt i32 %120, %204
br i1 %205, label %124, label %206
206: ; preds = %201, %119
ret void
}
; @kernel_round1_short: reduced variant of @kernel_round1 with most of the if-else blocks removed.
; @kernel_round1_short: codegen test input for a doubly nested loop over LDS bytes.
;
; Outline of the IR body (the autogenerated assertions for the expected
; assembly sit between the define line and the entry block):
;   * entry (.5): zero @kernel_round1.collisionsNum, call the barrier builtin,
;     read a 4-bit field for this work-item from the i32 array %.2, pass it
;     through @_Z3minjj with 12, and zero the first byte of this work-item's
;     14-byte slice of @kernel_round1.first_words_data.
;   * outer loop (.37 -> .32): index %.38 counts up by 1; it repeats while
;     %.38 + 1 < %.20 - 1 and the running counter stays below 60.
;   * first inner loop (.53 -> .98): steps its index by 4; its load/compare
;     results are unused, only the counters it advances are live.
;   * second inner loop (.103 -> .114): steps by 1, compares each byte with
;     the byte loaded in .37 and, on equality, appends a packed value to
;     @kernel_round1.collisionsData at the index returned by the atomic_inc
;     builtin on @kernel_round1.collisionsNum.
;   * exit (.119 -> .206): barrier, then return.
;
; Arguments %.0, %.1, %.3 and %.4 are never referenced in the body; only %.2
; is read.
define protected amdgpu_kernel void @kernel_round1_short(ptr addrspace(1) nocapture noundef readonly align 1 %.0, ptr addrspace(1) nocapture noundef writeonly align 1 %.1, ptr addrspace(1) nocapture noundef readonly align 4 %.2, ptr addrspace(1) noundef align 4 %.3, ptr addrspace(1) nocapture noundef readnone align 4 %.4) local_unnamed_addr #2 !kernel_arg_addr_space !5 !kernel_arg_access_qual !6 !kernel_arg_type !7 !kernel_arg_base_type !7 !kernel_arg_type_qual !8 !kernel_arg_name !9 !reqd_work_group_size !10 {
; CHECK-LABEL: kernel_round1_short:
; CHECK: ; %bb.0: ; %.5
; CHECK-NEXT: s_add_u32 s10, s10, s15
; CHECK-NEXT: s_mov_b32 s32, 0
; CHECK-NEXT: s_addc_u32 s11, s11, 0
; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
; CHECK-NEXT: s_load_dwordx2 s[44:45], s[6:7], 0x10
; CHECK-NEXT: s_add_u32 s0, s0, s15
; CHECK-NEXT: s_mov_b64 s[36:37], s[6:7]
; CHECK-NEXT: s_addc_u32 s1, s1, 0
; CHECK-NEXT: v_mov_b32_e32 v40, v0
; CHECK-NEXT: s_add_u32 s42, s36, 40
; CHECK-NEXT: v_mov_b32_e32 v31, v0
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: s_mov_b64 s[34:35], s[8:9]
; CHECK-NEXT: s_addc_u32 s43, s37, 0
; CHECK-NEXT: s_mov_b64 s[10:11], s[34:35]
; CHECK-NEXT: s_mov_b64 s[8:9], s[42:43]
; CHECK-NEXT: s_mov_b32 s33, s14
; CHECK-NEXT: s_mov_b32 s40, s13
; CHECK-NEXT: s_mov_b32 s41, s12
; CHECK-NEXT: s_mov_b64 s[38:39], s[4:5]
; CHECK-NEXT: s_getpc_b64 s[6:7]
; CHECK-NEXT: s_add_u32 s6, s6, _Z13get_global_idj@rel32@lo+4
; CHECK-NEXT: s_addc_u32 s7, s7, _Z13get_global_idj@rel32@hi+12
; CHECK-NEXT: v_mov_b32_e32 v43, 0
; CHECK-NEXT: s_swappc_b64 s[30:31], s[6:7]
; CHECK-NEXT: v_mov_b32_e32 v42, v0
; CHECK-NEXT: v_mov_b32_e32 v31, v40
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: s_mov_b64 s[4:5], s[38:39]
; CHECK-NEXT: s_mov_b64 s[8:9], s[42:43]
; CHECK-NEXT: s_mov_b64 s[10:11], s[34:35]
; CHECK-NEXT: s_mov_b32 s12, s41
; CHECK-NEXT: s_mov_b32 s13, s40
; CHECK-NEXT: s_mov_b32 s14, s33
; CHECK-NEXT: s_getpc_b64 s[6:7]
; CHECK-NEXT: s_add_u32 s6, s6, _Z12get_local_idj@rel32@lo+4
; CHECK-NEXT: s_addc_u32 s7, s7, _Z12get_local_idj@rel32@hi+12
; CHECK-NEXT: s_swappc_b64 s[30:31], s[6:7]
; CHECK-NEXT: v_mul_lo_u32 v46, v0, 14
; CHECK-NEXT: v_mov_b32_e32 v31, v40
; CHECK-NEXT: v_mov_b32_e32 v0, 1
; CHECK-NEXT: s_mov_b64 s[4:5], s[38:39]
; CHECK-NEXT: s_mov_b64 s[8:9], s[42:43]
; CHECK-NEXT: s_mov_b64 s[10:11], s[34:35]
; CHECK-NEXT: s_mov_b32 s12, s41
; CHECK-NEXT: s_mov_b32 s13, s40
; CHECK-NEXT: s_mov_b32 s14, s33
; CHECK-NEXT: ds_write_b32 v43, v43 offset:15360
; CHECK-NEXT: s_getpc_b64 s[6:7]
; CHECK-NEXT: s_add_u32 s6, s6, _Z7barrierj@rel32@lo+4
; CHECK-NEXT: s_addc_u32 s7, s7, _Z7barrierj@rel32@hi+12
; CHECK-NEXT: v_add_nc_u32_e32 v44, 0x3c04, v46
; CHECK-NEXT: s_swappc_b64 s[30:31], s[6:7]
; CHECK-NEXT: v_lshrrev_b32_e32 v0, 1, v42
; CHECK-NEXT: v_lshlrev_b32_e32 v1, 2, v42
; CHECK-NEXT: v_mov_b32_e32 v31, v40
; CHECK-NEXT: s_mov_b64 s[4:5], s[38:39]
; CHECK-NEXT: s_mov_b64 s[8:9], s[42:43]
; CHECK-NEXT: v_and_b32_e32 v0, 0x7ffffffc, v0
; CHECK-NEXT: v_and_b32_e32 v1, 28, v1
; CHECK-NEXT: s_mov_b64 s[10:11], s[34:35]
; CHECK-NEXT: s_mov_b32 s12, s41
; CHECK-NEXT: s_mov_b32 s13, s40
; CHECK-NEXT: global_load_dword v0, v0, s[44:45]
; CHECK-NEXT: s_mov_b32 s14, s33
; CHECK-NEXT: s_getpc_b64 s[6:7]
; CHECK-NEXT: s_add_u32 s6, s6, _Z3minjj@rel32@lo+4
; CHECK-NEXT: s_addc_u32 s7, s7, _Z3minjj@rel32@hi+12
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_bfe_u32 v0, v0, v1, 4
; CHECK-NEXT: v_mov_b32_e32 v1, 12
; CHECK-NEXT: s_swappc_b64 s[30:31], s[6:7]
; CHECK-NEXT: v_mov_b32_e32 v41, v0
; CHECK-NEXT: v_lshlrev_b32_e32 v42, 10, v42
; CHECK-NEXT: s_mov_b32 s42, 0
; CHECK-NEXT: s_mov_b32 s4, 0
; CHECK-NEXT: ds_write_b8 v46, v43 offset:15364
; CHECK-NEXT: v_add_nc_u32_e32 v45, -1, v41
; CHECK-NEXT: .LBB1_1: ; %.37
; CHECK-NEXT: ; =>This Loop Header: Depth=1
; CHECK-NEXT: ; Child Loop BB1_3 Depth 2
; CHECK-NEXT: ; Child Loop BB1_8 Depth 2
; CHECK-NEXT: v_add_nc_u32_e32 v0, s4, v44
; CHECK-NEXT: s_lshl_b32 s5, s4, 5
; CHECK-NEXT: s_add_i32 s43, s4, 1
; CHECK-NEXT: s_add_i32 s6, s4, 5
; CHECK-NEXT: v_or3_b32 v47, s5, v42, s43
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: ds_read_u8 v46, v0
; CHECK-NEXT: v_mov_b32_e32 v56, s43
; CHECK-NEXT: s_mov_b32 s5, exec_lo
; CHECK-NEXT: v_cmpx_lt_u32_e64 s6, v41
; CHECK-NEXT: s_cbranch_execz .LBB1_5
; CHECK-NEXT: ; %bb.2: ; %.53.preheader
; CHECK-NEXT: ; in Loop: Header=BB1_1 Depth=1
; CHECK-NEXT: s_mov_b32 s6, 0
; CHECK-NEXT: s_mov_b32 s7, 0
; CHECK-NEXT: .LBB1_3: ; %.53
; CHECK-NEXT: ; Parent Loop BB1_1 Depth=1
; CHECK-NEXT: ; => This Inner Loop Header: Depth=2
; CHECK-NEXT: s_add_i32 s7, s7, 4
; CHECK-NEXT: v_add_nc_u32_e32 v43, 1, v43
; CHECK-NEXT: s_add_i32 s8, s4, s7
; CHECK-NEXT: v_add_nc_u32_e32 v0, s7, v47
; CHECK-NEXT: s_add_i32 s9, s8, 5
; CHECK-NEXT: s_add_i32 s8, s8, 1
; CHECK-NEXT: v_cmp_ge_u32_e32 vcc_lo, s9, v41
; CHECK-NEXT: v_mov_b32_e32 v56, s8
; CHECK-NEXT: s_or_b32 s6, vcc_lo, s6
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s6
; CHECK-NEXT: s_cbranch_execnz .LBB1_3
; CHECK-NEXT: ; %bb.4: ; %Flow3
; CHECK-NEXT: ; in Loop: Header=BB1_1 Depth=1
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6
; CHECK-NEXT: v_mov_b32_e32 v47, v0
; CHECK-NEXT: .LBB1_5: ; %Flow4
; CHECK-NEXT: ; in Loop: Header=BB1_1 Depth=1
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s5
; CHECK-NEXT: s_mov_b32 s44, exec_lo
; CHECK-NEXT: v_cmpx_lt_u32_e64 v56, v41
; CHECK-NEXT: s_cbranch_execz .LBB1_11
; CHECK-NEXT: ; %bb.6: ; %.103.preheader
; CHECK-NEXT: ; in Loop: Header=BB1_1 Depth=1
; CHECK-NEXT: s_mov_b32 s45, 0
; CHECK-NEXT: s_inst_prefetch 0x1
; CHECK-NEXT: s_branch .LBB1_8
; CHECK-NEXT: .p2align 6
; CHECK-NEXT: .LBB1_7: ; %.114
; CHECK-NEXT: ; in Loop: Header=BB1_8 Depth=2
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s46
; CHECK-NEXT: v_add_nc_u32_e32 v56, 1, v56
; CHECK-NEXT: v_add_nc_u32_e32 v47, 1, v47
; CHECK-NEXT: v_cmp_ge_u32_e32 vcc_lo, v56, v41
; CHECK-NEXT: s_or_b32 s45, vcc_lo, s45
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s45
; CHECK-NEXT: s_cbranch_execz .LBB1_10
; CHECK-NEXT: .LBB1_8: ; %.103
; CHECK-NEXT: ; Parent Loop BB1_1 Depth=1
; CHECK-NEXT: ; => This Inner Loop Header: Depth=2
; CHECK-NEXT: v_add_nc_u32_e32 v0, v44, v56
; CHECK-NEXT: ds_read_u8 v0, v0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_cmp_eq_u16_sdwa s4, v46, v0 src0_sel:BYTE_0 src1_sel:DWORD
; CHECK-NEXT: s_and_saveexec_b32 s46, s4
; CHECK-NEXT: s_cbranch_execz .LBB1_7
; CHECK-NEXT: ; %bb.9: ; %.110
; CHECK-NEXT: ; in Loop: Header=BB1_8 Depth=2
; CHECK-NEXT: v_mov_b32_e32 v31, v40
; CHECK-NEXT: v_mov_b32_e32 v0, 0x3c00
; CHECK-NEXT: s_add_u32 s8, s36, 40
; CHECK-NEXT: s_addc_u32 s9, s37, 0
; CHECK-NEXT: s_mov_b64 s[4:5], s[38:39]
; CHECK-NEXT: s_mov_b64 s[10:11], s[34:35]
; CHECK-NEXT: s_mov_b32 s12, s41
; CHECK-NEXT: s_mov_b32 s13, s40
; CHECK-NEXT: s_mov_b32 s14, s33
; CHECK-NEXT: v_add_nc_u32_e32 v43, 1, v43
; CHECK-NEXT: s_getpc_b64 s[6:7]
; CHECK-NEXT: s_add_u32 s6, s6, _Z10atomic_incPU3AS3Vj@rel32@lo+4
; CHECK-NEXT: s_addc_u32 s7, s7, _Z10atomic_incPU3AS3Vj@rel32@hi+12
; CHECK-NEXT: s_swappc_b64 s[30:31], s[6:7]
; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CHECK-NEXT: ds_write_b32 v0, v47
; CHECK-NEXT: s_branch .LBB1_7
; CHECK-NEXT: .LBB1_10: ; %Flow
; CHECK-NEXT: ; in Loop: Header=BB1_1 Depth=1
; CHECK-NEXT: s_inst_prefetch 0x2
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s45
; CHECK-NEXT: .LBB1_11: ; %Flow2
; CHECK-NEXT: ; in Loop: Header=BB1_1 Depth=1
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s44
; CHECK-NEXT: ; %bb.12: ; %.32
; CHECK-NEXT: ; in Loop: Header=BB1_1 Depth=1
; CHECK-NEXT: v_cmp_ge_u32_e32 vcc_lo, s43, v45
; CHECK-NEXT: v_cmp_lt_u32_e64 s4, 59, v43
; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4
; CHECK-NEXT: s_and_b32 s4, exec_lo, s4
; CHECK-NEXT: s_or_b32 s42, s4, s42
; CHECK-NEXT: s_mov_b32 s4, s43
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s42
; CHECK-NEXT: s_cbranch_execnz .LBB1_1
; CHECK-NEXT: ; %bb.13: ; %.119
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s42
; CHECK-NEXT: v_mov_b32_e32 v31, v40
; CHECK-NEXT: v_mov_b32_e32 v0, 1
; CHECK-NEXT: s_add_u32 s8, s36, 40
; CHECK-NEXT: s_addc_u32 s9, s37, 0
; CHECK-NEXT: s_mov_b64 s[4:5], s[38:39]
; CHECK-NEXT: s_mov_b64 s[10:11], s[34:35]
; CHECK-NEXT: s_mov_b32 s12, s41
; CHECK-NEXT: s_mov_b32 s13, s40
; CHECK-NEXT: s_mov_b32 s14, s33
; CHECK-NEXT: s_getpc_b64 s[6:7]
; CHECK-NEXT: s_add_u32 s6, s6, _Z7barrierj@rel32@lo+4
; CHECK-NEXT: s_addc_u32 s7, s7, _Z7barrierj@rel32@hi+12
; CHECK-NEXT: s_swappc_b64 s[30:31], s[6:7]
; CHECK-NEXT: s_endpgm
; Entry block: set-up before the outer loop.
.5:
; Global and local work-item ids for dimension 0, each also truncated to i32.
%.6 = tail call i64 @_Z13get_global_idj(i32 noundef 0) #4
%.7 = trunc i64 %.6 to i32
%.8 = tail call i64 @_Z12get_local_idj(i32 noundef 0) #4
%.9 = trunc i64 %.8 to i32
; %.11 points at byte offset local_id * 14 inside first_words_data.
%.10 = mul i32 %.9, 14
%.11 = getelementptr inbounds i8, ptr addrspace(3) @kernel_round1.first_words_data, i32 %.10
; Reset the collision counter before the barrier call.
store i32 0, ptr addrspace(3) @kernel_round1.collisionsNum, align 4, !tbaa !11
tail call void @_Z7barrierj(i32 noundef 1) #5
; Extract a 4-bit field from the i32 array %.2:
;   word index = (global_id >> 3) masked to 29 bits,
;   bit offset = (global_id << 2) & 28, i.e. (global_id & 7) * 4.
%.12 = lshr i64 %.6, 3
%.13 = shl i32 %.7, 2
%.14 = and i32 %.13, 28
%.15 = and i64 %.12, 536870911
%.16 = getelementptr inbounds i32, ptr addrspace(1) %.2, i64 %.15
%.17 = load i32, ptr addrspace(1) %.16, align 4, !tbaa !11
%.18 = lshr i32 %.17, %.14
%.19 = and i32 %.18, 15
; The field and the constant 12 go through @_Z3minjj; the result %.20 is the
; upper bound used by every loop below.
%.20 = tail call i32 @_Z3minjj(i32 noundef %.19, i32 noundef 12) #4
; %.21 and %.24 have no users in this function; %.23 (= %.20 - 1) is the
; outer-loop bound tested in .32.
%.21 = icmp eq i32 %.20, 0
%.23 = add i32 %.20, -1
%.24 = icmp eq i32 %.23, 0
; Zero the first byte of this work-item's slice, then enter the outer loop
; unconditionally.
store i8 0, ptr addrspace(3) %.11, align 1, !tbaa !15
br label %.37
; Outer-loop latch: continue while (%.38 + 1) < (%.20 - 1) and the running
; counter %.33 is below 60.
.32: ; preds = %.114, %.48
%.33 = phi i32 [ %.50, %.48 ], [ %.115, %.114 ]
%.34 = icmp ult i32 %.44, %.23
%.35 = icmp ult i32 %.33, 60
%.36 = select i1 %.34, i1 %.35, i1 false
br i1 %.36, label %.37, label %.119
; Outer-loop header: %.38 is the outer index, %.39 the running counter.
.37: ; preds = %.32, %.5
%.38 = phi i32 [ 0, %.5 ], [ %.44, %.32 ]
%.39 = phi i32 [ 0, %.5 ], [ %.33, %.32 ]
; %.26 does not depend on any loop value (global_id << 10).
%.26 = shl i32 %.7, 10
; %.41 is the byte at slice[%.38]; the second inner loop compares against it.
%.40 = getelementptr inbounds i8, ptr addrspace(3) %.11, i32 %.38
%.41 = load i8, ptr addrspace(3) %.40, align 1, !tbaa !15
; %.45 = (%.38 << 5) | (global_id << 10) | (%.38 + 1): initial value of the
; word that .110 may store into collisionsData.
%.42 = shl i32 %.38, 5
%.43 = or i32 %.42, %.26
%.44 = add nuw i32 %.38, 1
%.45 = or i32 %.43, %.44
; Take the step-by-4 loop only if index %.38 + 5 is still below %.20.
%.46 = add i32 %.38, 5
%.47 = icmp ult i32 %.46, %.20
br i1 %.47, label %.53, label %.48
; Join point after the step-by-4 loop (or its bypass): %.51 is the index at
; which the step-by-1 loop starts.
.48: ; preds = %.98, %.37
%.49 = phi i32 [ %.45, %.37 ], [ %.100, %.98 ]
%.50 = phi i32 [ %.39, %.37 ], [ %.99, %.98 ]
%.51 = phi i32 [ %.44, %.37 ], [ %.54, %.98 ]
%.52 = icmp ult i32 %.51, %.20
br i1 %.52, label %.103, label %.32
; Step-by-4 loop header. %.59 is only used by %.60, and %.60 has no users, so
; the load and compare in this block are dead.
.53: ; preds = %.37, %.98
%.54 = phi i32 [ %.101, %.98 ], [ %.46, %.37 ]
%.55 = phi i32 [ %.54, %.98 ], [ %.44, %.37 ]
%.56 = phi i32 [ %.99, %.98 ], [ %.39, %.37 ]
%.57 = phi i32 [ %.100, %.98 ], [ %.45, %.37 ]
%.58 = getelementptr inbounds i8, ptr addrspace(3) %.11, i32 %.55
%.59 = load i8, ptr addrspace(3) %.58, align 1, !tbaa !15
%.60 = icmp eq i8 %.41, %.59
br label %.98
; Step-by-4 loop latch: counter +1, packed word +4, index +4 per iteration.
.98: ; preds = %.53
%.99 = add i32 %.56, 1
%.100 = add i32 %.57, 4
%.101 = add i32 %.54, 4
%.102 = icmp ult i32 %.101, %.20
br i1 %.102, label %.53, label %.48
; Step-by-1 loop header: compare slice[%.104] with %.41.
.103: ; preds = %.48, %.114
%.104 = phi i32 [ %.117, %.114 ], [ %.51, %.48 ]
%.105 = phi i32 [ %.115, %.114 ], [ %.50, %.48 ]
%.106 = phi i32 [ %.116, %.114 ], [ %.49, %.48 ]
%.107 = getelementptr inbounds i8, ptr addrspace(3) %.11, i32 %.104
%.108 = load i8, ptr addrspace(3) %.107, align 1, !tbaa !15
%.109 = icmp eq i8 %.41, %.108
br i1 %.109, label %.110, label %.114
; Bytes matched: bump the counter and store the packed word %.106 into
; collisionsData at the index returned by the atomic_inc builtin.
.110: ; preds = %.103
%.111 = add i32 %.105, 1
%.112 = tail call i32 @_Z10atomic_incPU3AS3Vj(ptr addrspace(3) noundef @kernel_round1.collisionsNum) #5
%.113 = getelementptr inbounds i32, ptr addrspace(3) @kernel_round1.collisionsData, i32 %.112
store i32 %.106, ptr addrspace(3) %.113, align 4, !tbaa !11
br label %.114
; Step-by-1 loop latch: packed word +1, index +1, loop while index < %.20.
.114: ; preds = %.110, %.103
%.115 = phi i32 [ %.111, %.110 ], [ %.105, %.103 ]
%.116 = add i32 %.106, 1
%.117 = add nuw i32 %.104, 1
%.118 = icmp ult i32 %.117, %.20
br i1 %.118, label %.103, label %.32
; Loop exit: second barrier call. %.120 is only used by %.121, and %.121 has
; no users, so the load and compare are dead; the branch is unconditional.
.119: ; preds = %.32
tail call void @_Z7barrierj(i32 noundef 1) #5
%.120 = load i32, ptr addrspace(3) @kernel_round1.collisionsNum, align 4, !tbaa !11
%.121 = icmp ugt i32 %.120, %.9
br label %.206
.206: ; preds = %.119
ret void
}
; Funnel-shift-left intrinsic. It is called by the kernel that precedes
; @kernel_round1_short (two calls with a shift amount of 48) and is not
; referenced by @kernel_round1_short itself.
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare i64 @llvm.fshl.i64(i64, i64, i64) #3
; Attribute groups.
; NOTE(review): groups #0-#2 pin "target-cpu"="gfx1030" while the RUN line
; passes -mcpu=gfx1031; confirm which of the two is meant to take effect.
; #0: declarations of the memory(none) builtins (@_Z13get_global_idj,
;     @_Z12get_local_idj, @_Z3minjj, @_Z14get_local_sizej).
attributes #0 = { convergent mustprogress nofree nounwind willreturn memory(none) "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx1030" "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32" }
; #1: declarations of the atomic and barrier builtins (@_Z10atomic_addPU3AS1Vjj,
;     @_Z10atomic_subPU3AS1Vjj, @_Z7barrierj, @_Z10atomic_incPU3AS3Vj).
attributes #1 = { convergent nounwind "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx1030" "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32" }
; #2: kernel definitions (used by @kernel_round1_short); fixes the flat
;     work-group size to exactly 64.
attributes #2 = { convergent norecurse nounwind "amdgpu-flat-work-group-size"="64,64" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx1030" "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32" "uniform-work-group-size"="true" }
; #3: the @llvm.fshl.i64 intrinsic declaration.
attributes #3 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
; #4: call sites of the memory(none) builtins (get_global_id, get_local_id,
;     min, get_local_size).
attributes #4 = { convergent nounwind willreturn memory(none) }
; #5: call sites of the barrier and atomic_inc builtins.
attributes #5 = { convergent nounwind }
; Named module-level metadata.
!llvm.module.flags = !{!0, !1, !2}
!opencl.ocl.version = !{!3}
!llvm.ident = !{!4}
; Module flags: code object version 500, wchar_size 4, PIC level 2.
!0 = !{i32 1, !"amdhsa_code_object_version", i32 500}
!1 = !{i32 1, !"wchar_size", i32 4}
!2 = !{i32 8, !"PIC Level", i32 2}
; Operand of !opencl.ocl.version (the pair 1, 2).
!3 = !{i32 1, i32 2}
; Producer string referenced by !llvm.ident.
!4 = !{!"clang version 17.0.0 (ssh://[email protected]:29418/lightning/ec/llvm-project 06ead8cf696777b9f17876b60707ba9de4d0606f)"}
; Kernel-argument metadata attached to the kernel define line; each node has
; one entry per kernel parameter (five): address space, access qualifier,
; type / base type, type qualifier and source-level name.
!5 = !{i32 1, i32 1, i32 1, i32 1, i32 1}
!6 = !{!"none", !"none", !"none", !"none", !"none"}
!7 = !{!"char*", !"char*", !"uint*", !"uint*", !"uint*"}
!8 = !{!"", !"", !"", !"", !""}
!9 = !{!"ht_src", !"ht_dst", !"rowCountersSrc", !"rowCountersDst", !"debug"}
; !reqd_work_group_size operand: 64 x 1 x 1.
!10 = !{i32 64, i32 1, i32 1}
; TBAA nodes: !11 is the access tag for "int", !15 for "omnipotent char" and
; !16 for "long"; !14 is the root of the type tree.
!11 = !{!12, !12, i64 0}
!12 = !{!"int", !13, i64 0}
!13 = !{!"omnipotent char", !14, i64 0}
!14 = !{!"Simple C/C++ TBAA"}
!15 = !{!13, !13, i64 0}
!16 = !{!17, !17, i64 0}
!17 = !{!"long", !13, i64 0}