; llvm/llvm/test/CodeGen/AMDGPU/extract-subvector.ll

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc -mtriple=amdgcn-- -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s

; Extracts the low <2 x i16> subvector (lanes 0 and 1) of an <8 x i16> value
; that reaches %exit through a phi of two volatile loads: from %p0 when %c0 is
; true, from %p1 otherwise. Each extracted lane is mapped to -32768 when it is
; signed-greater-than -1 and to -1 otherwise.
; The shuffle mask only selects lanes of %m, so the second shuffle operand is a
; pure placeholder; it is spelled poison rather than the deprecated undef.
define <2 x i16> @extract_2xi16(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i1 %c0) {
; GCN-LABEL: extract_2xi16:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_and_b32_e32 v4, 1, v4
; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v4
; GCN-NEXT:    s_xor_b64 s[4:5], vcc, -1
; GCN-NEXT:    ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7
; GCN-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
; GCN-NEXT:    s_xor_b64 s[4:5], exec, s[6:7]
; GCN-NEXT:    s_cbranch_execz .LBB0_2
; GCN-NEXT:  ; %bb.1: ; %F
; GCN-NEXT:    s_mov_b32 s10, 0
; GCN-NEXT:    s_mov_b32 s11, 0xf000
; GCN-NEXT:    s_mov_b32 s8, s10
; GCN-NEXT:    s_mov_b32 s9, s10
; GCN-NEXT:    buffer_load_ushort v0, v[2:3], s[8:11], 0 addr64 glc
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    buffer_load_ushort v1, v[2:3], s[8:11], 0 addr64 offset:2 glc
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    buffer_load_ushort v4, v[2:3], s[8:11], 0 addr64 offset:4 glc
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    buffer_load_ushort v4, v[2:3], s[8:11], 0 addr64 offset:6 glc
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    buffer_load_ushort v4, v[2:3], s[8:11], 0 addr64 offset:8 glc
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    buffer_load_ushort v4, v[2:3], s[8:11], 0 addr64 offset:10 glc
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    buffer_load_ushort v4, v[2:3], s[8:11], 0 addr64 offset:12 glc
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    buffer_load_ushort v2, v[2:3], s[8:11], 0 addr64 offset:14 glc
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; GCN-NEXT:    v_or_b32_e32 v4, v0, v1
; GCN-NEXT:    ; implicit-def: $vgpr0
; GCN-NEXT:  .LBB0_2: ; %Flow
; GCN-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
; GCN-NEXT:    s_cbranch_execz .LBB0_4
; GCN-NEXT:  ; %bb.3: ; %T
; GCN-NEXT:    s_mov_b32 s10, 0
; GCN-NEXT:    s_mov_b32 s11, 0xf000
; GCN-NEXT:    s_mov_b32 s8, s10
; GCN-NEXT:    s_mov_b32 s9, s10
; GCN-NEXT:    buffer_load_ushort v2, v[0:1], s[8:11], 0 addr64 glc
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    buffer_load_ushort v3, v[0:1], s[8:11], 0 addr64 offset:2 glc
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    buffer_load_ushort v4, v[0:1], s[8:11], 0 addr64 offset:4 glc
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    buffer_load_ushort v4, v[0:1], s[8:11], 0 addr64 offset:6 glc
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    buffer_load_ushort v4, v[0:1], s[8:11], 0 addr64 offset:8 glc
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    buffer_load_ushort v4, v[0:1], s[8:11], 0 addr64 offset:10 glc
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    buffer_load_ushort v4, v[0:1], s[8:11], 0 addr64 offset:12 glc
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    buffer_load_ushort v0, v[0:1], s[8:11], 0 addr64 offset:14 glc
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    v_lshlrev_b32_e32 v0, 16, v3
; GCN-NEXT:    v_or_b32_e32 v4, v2, v0
; GCN-NEXT:  .LBB0_4: ; %exit
; GCN-NEXT:    s_or_b64 exec, exec, s[4:5]
; GCN-NEXT:    v_ashrrev_i32_e32 v0, 16, v4
; GCN-NEXT:    v_bfe_i32 v1, v4, 0, 16
; GCN-NEXT:    v_mov_b32_e32 v2, 0xffff
; GCN-NEXT:    v_mov_b32_e32 v3, 0x8000
; GCN-NEXT:    v_mov_b32_e32 v4, 0xffff8000
; GCN-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v1
; GCN-NEXT:    v_cndmask_b32_e32 v1, v2, v3, vcc
; GCN-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v0
; GCN-NEXT:    v_cndmask_b32_e32 v2, -1, v4, vcc
; GCN-NEXT:    v_lshlrev_b32_e32 v0, 16, v2
; GCN-NEXT:    v_or_b32_e32 v0, v1, v0
; GCN-NEXT:    v_and_b32_e32 v1, 0xffff, v2
; GCN-NEXT:    s_setpc_b64 s[30:31]
  br i1 %c0, label %T, label %F

T:
  %t = load volatile <8 x i16>, ptr addrspace(1) %p0
  br label %exit

F:
  %f = load volatile <8 x i16>, ptr addrspace(1) %p1
  br label %exit

exit:
  %m = phi <8 x i16> [ %t, %T ], [ %f, %F ]
  %v2 = shufflevector <8 x i16> %m, <8 x i16> poison, <2 x i32> <i32 0, i32 1>
  %b2 = icmp sgt <2 x i16> %v2, <i16 -1, i16 -1>
  %r2 = select <2 x i1> %b2, <2 x i16> <i16 -32768, i16 -32768>, <2 x i16> <i16 -1, i16 -1>
  ret <2 x i16> %r2
}

; Extracts the low <2 x i64> subvector (lanes 0 and 1) of an <8 x i64> value
; that reaches %exit through a phi of two volatile loads: from %p0 when %c0 is
; true, from %p1 otherwise. Each extracted lane is mapped to -32768 when it is
; signed-greater-than -1 and to -1 otherwise.
; The shuffle mask only selects lanes of %m, so the second shuffle operand is a
; pure placeholder; it is spelled poison rather than the deprecated undef.
define <2 x i64> @extract_2xi64(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i1 %c0) {
; GCN-LABEL: extract_2xi64:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_and_b32_e32 v4, 1, v4
; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v4
; GCN-NEXT:    s_xor_b64 s[4:5], vcc, -1
; GCN-NEXT:    ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19
; GCN-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
; GCN-NEXT:    s_xor_b64 s[4:5], exec, s[6:7]
; GCN-NEXT:    s_cbranch_execz .LBB1_2
; GCN-NEXT:  ; %bb.1: ; %F
; GCN-NEXT:    s_mov_b32 s10, 0
; GCN-NEXT:    s_mov_b32 s11, 0xf000
; GCN-NEXT:    s_mov_b32 s8, s10
; GCN-NEXT:    s_mov_b32 s9, s10
; GCN-NEXT:    buffer_load_dwordx4 v[4:7], v[2:3], s[8:11], 0 addr64 glc
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    buffer_load_dwordx4 v[8:11], v[2:3], s[8:11], 0 addr64 offset:16 glc
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    buffer_load_dwordx4 v[8:11], v[2:3], s[8:11], 0 addr64 offset:32 glc
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    buffer_load_dwordx4 v[0:3], v[2:3], s[8:11], 0 addr64 offset:48 glc
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; implicit-def: $vgpr0
; GCN-NEXT:  .LBB1_2: ; %Flow
; GCN-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
; GCN-NEXT:    s_cbranch_execz .LBB1_4
; GCN-NEXT:  ; %bb.3: ; %T
; GCN-NEXT:    s_mov_b32 s10, 0
; GCN-NEXT:    s_mov_b32 s11, 0xf000
; GCN-NEXT:    s_mov_b32 s8, s10
; GCN-NEXT:    s_mov_b32 s9, s10
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[8:11], 0 addr64 glc
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[8:11], 0 addr64 offset:16 glc
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[8:11], 0 addr64 offset:32 glc
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64 offset:48 glc
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:  .LBB1_4: ; %exit
; GCN-NEXT:    s_or_b64 exec, exec, s[4:5]
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    v_mov_b32_e32 v1, 0xffff8000
; GCN-NEXT:    v_cmp_lt_i64_e32 vcc, -1, v[4:5]
; GCN-NEXT:    v_cndmask_b32_e32 v0, -1, v1, vcc
; GCN-NEXT:    v_cmp_lt_i64_e32 vcc, -1, v[6:7]
; GCN-NEXT:    v_cndmask_b32_e32 v2, -1, v1, vcc
; GCN-NEXT:    v_mov_b32_e32 v1, -1
; GCN-NEXT:    v_mov_b32_e32 v3, -1
; GCN-NEXT:    s_setpc_b64 s[30:31]
  br i1 %c0, label %T, label %F

T:
  %t = load volatile <8 x i64>, ptr addrspace(1) %p0
  br label %exit

F:
  %f = load volatile <8 x i64>, ptr addrspace(1) %p1
  br label %exit

exit:
  %m = phi <8 x i64> [ %t, %T ], [ %f, %F ]
  %v2 = shufflevector <8 x i64> %m, <8 x i64> poison, <2 x i32> <i32 0, i32 1>
  %b2 = icmp sgt <2 x i64> %v2, <i64 -1, i64 -1>
  %r2 = select <2 x i1> %b2, <2 x i64> <i64 -32768, i64 -32768>, <2 x i64> <i64 -1, i64 -1>
  ret <2 x i64> %r2
}

; Extracts the low <4 x i64> subvector (lanes 0 through 3) of an <8 x i64>
; value that reaches %exit through a phi of two volatile loads: from %p0 when
; %c0 is true, from %p1 otherwise. Each extracted lane is mapped to -32768 when
; it is signed-greater-than -1 and to -1 otherwise.
; The shuffle mask only selects lanes of %m, so the second shuffle operand is a
; pure placeholder; it is spelled poison rather than the deprecated undef.
define <4 x i64> @extract_4xi64(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i1 %c0) {
; GCN-LABEL: extract_4xi64:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_and_b32_e32 v4, 1, v4
; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v4
; GCN-NEXT:    s_xor_b64 s[4:5], vcc, -1
; GCN-NEXT:    ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19
; GCN-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
; GCN-NEXT:    s_xor_b64 s[4:5], exec, s[6:7]
; GCN-NEXT:    s_cbranch_execz .LBB2_2
; GCN-NEXT:  ; %bb.1: ; %F
; GCN-NEXT:    s_mov_b32 s10, 0
; GCN-NEXT:    s_mov_b32 s11, 0xf000
; GCN-NEXT:    s_mov_b32 s8, s10
; GCN-NEXT:    s_mov_b32 s9, s10
; GCN-NEXT:    buffer_load_dwordx4 v[4:7], v[2:3], s[8:11], 0 addr64 glc
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    buffer_load_dwordx4 v[8:11], v[2:3], s[8:11], 0 addr64 offset:16 glc
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    buffer_load_dwordx4 v[12:15], v[2:3], s[8:11], 0 addr64 offset:32 glc
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    buffer_load_dwordx4 v[0:3], v[2:3], s[8:11], 0 addr64 offset:48 glc
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; implicit-def: $vgpr0
; GCN-NEXT:  .LBB2_2: ; %Flow
; GCN-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
; GCN-NEXT:    s_cbranch_execz .LBB2_4
; GCN-NEXT:  ; %bb.3: ; %T
; GCN-NEXT:    s_mov_b32 s10, 0
; GCN-NEXT:    s_mov_b32 s11, 0xf000
; GCN-NEXT:    s_mov_b32 s8, s10
; GCN-NEXT:    s_mov_b32 s9, s10
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[8:11], 0 addr64 glc
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[8:11], 0 addr64 offset:16 glc
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    buffer_load_dwordx4 v[12:15], v[0:1], s[8:11], 0 addr64 offset:32 glc
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64 offset:48 glc
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:  .LBB2_4: ; %exit
; GCN-NEXT:    s_or_b64 exec, exec, s[4:5]
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    v_mov_b32_e32 v1, 0xffff8000
; GCN-NEXT:    v_cmp_gt_i64_e32 vcc, 0, v[4:5]
; GCN-NEXT:    v_cndmask_b32_e64 v0, v1, -1, vcc
; GCN-NEXT:    v_cmp_gt_i64_e32 vcc, 0, v[6:7]
; GCN-NEXT:    v_cndmask_b32_e64 v2, v1, -1, vcc
; GCN-NEXT:    v_cmp_gt_i64_e32 vcc, 0, v[8:9]
; GCN-NEXT:    v_cndmask_b32_e64 v4, v1, -1, vcc
; GCN-NEXT:    v_cmp_gt_i64_e32 vcc, 0, v[10:11]
; GCN-NEXT:    v_cndmask_b32_e64 v6, v1, -1, vcc
; GCN-NEXT:    v_mov_b32_e32 v1, -1
; GCN-NEXT:    v_mov_b32_e32 v3, -1
; GCN-NEXT:    v_mov_b32_e32 v5, -1
; GCN-NEXT:    v_mov_b32_e32 v7, -1
; GCN-NEXT:    s_setpc_b64 s[30:31]
  br i1 %c0, label %T, label %F

T:
  %t = load volatile <8 x i64>, ptr addrspace(1) %p0
  br label %exit

F:
  %f = load volatile <8 x i64>, ptr addrspace(1) %p1
  br label %exit

exit:
  %m = phi <8 x i64> [ %t, %T ], [ %f, %F ]
  %v2 = shufflevector <8 x i64> %m, <8 x i64> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %b2 = icmp sgt <4 x i64> %v2, <i64 -1, i64 -1, i64 -1, i64 -1>
  %r2 = select <4 x i1> %b2, <4 x i64> <i64 -32768, i64 -32768, i64 -32768, i64 -32768>, <4 x i64> <i64 -1, i64 -1, i64 -1, i64 -1>
  ret <4 x i64> %r2
}

; Extracts the low <8 x i64> subvector (lanes 0 through 7) of a <16 x i64>
; value that reaches %exit through a phi of two volatile loads: from %p0 when
; %c0 is true, from %p1 otherwise. Each extracted lane is mapped to -32768 when
; it is signed-greater-than -1 and to -1 otherwise.
; The shuffle mask only selects lanes of %m, so the second shuffle operand is a
; pure placeholder; it is spelled poison rather than the deprecated undef.
define <8 x i64> @extract_8xi64(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i1 %c0) {
; GCN-LABEL: extract_8xi64:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_and_b32_e32 v4, 1, v4
; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v4
; GCN-NEXT:    s_xor_b64 s[4:5], vcc, -1
; GCN-NEXT:    ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35
; GCN-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
; GCN-NEXT:    s_xor_b64 s[4:5], exec, s[6:7]
; GCN-NEXT:    s_cbranch_execz .LBB3_2
; GCN-NEXT:  ; %bb.1: ; %F
; GCN-NEXT:    s_mov_b32 s10, 0
; GCN-NEXT:    s_mov_b32 s11, 0xf000
; GCN-NEXT:    s_mov_b32 s8, s10
; GCN-NEXT:    s_mov_b32 s9, s10
; GCN-NEXT:    buffer_load_dwordx4 v[4:7], v[2:3], s[8:11], 0 addr64 offset:112 glc
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    buffer_load_dwordx4 v[4:7], v[2:3], s[8:11], 0 addr64 offset:96 glc
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    buffer_load_dwordx4 v[4:7], v[2:3], s[8:11], 0 addr64 offset:80 glc
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    buffer_load_dwordx4 v[4:7], v[2:3], s[8:11], 0 addr64 offset:64 glc
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    buffer_load_dwordx4 v[4:7], v[2:3], s[8:11], 0 addr64 glc
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    buffer_load_dwordx4 v[8:11], v[2:3], s[8:11], 0 addr64 offset:16 glc
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    buffer_load_dwordx4 v[12:15], v[2:3], s[8:11], 0 addr64 offset:32 glc
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    buffer_load_dwordx4 v[16:19], v[2:3], s[8:11], 0 addr64 offset:48 glc
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; implicit-def: $vgpr0
; GCN-NEXT:  .LBB3_2: ; %Flow
; GCN-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
; GCN-NEXT:    s_cbranch_execz .LBB3_4
; GCN-NEXT:  ; %bb.3: ; %T
; GCN-NEXT:    s_mov_b32 s10, 0
; GCN-NEXT:    s_mov_b32 s11, 0xf000
; GCN-NEXT:    s_mov_b32 s8, s10
; GCN-NEXT:    s_mov_b32 s9, s10
; GCN-NEXT:    buffer_load_dwordx4 v[2:5], v[0:1], s[8:11], 0 addr64 offset:112 glc
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    buffer_load_dwordx4 v[2:5], v[0:1], s[8:11], 0 addr64 offset:96 glc
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    buffer_load_dwordx4 v[2:5], v[0:1], s[8:11], 0 addr64 offset:80 glc
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    buffer_load_dwordx4 v[2:5], v[0:1], s[8:11], 0 addr64 offset:64 glc
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[8:11], 0 addr64 glc
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[8:11], 0 addr64 offset:16 glc
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    buffer_load_dwordx4 v[12:15], v[0:1], s[8:11], 0 addr64 offset:32 glc
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    buffer_load_dwordx4 v[16:19], v[0:1], s[8:11], 0 addr64 offset:48 glc
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:  .LBB3_4: ; %exit
; GCN-NEXT:    s_or_b64 exec, exec, s[4:5]
; GCN-NEXT:    v_mov_b32_e32 v1, 0xffff8000
; GCN-NEXT:    v_cmp_gt_i64_e32 vcc, 0, v[6:7]
; GCN-NEXT:    v_cmp_gt_i64_e64 s[4:5], 0, v[8:9]
; GCN-NEXT:    v_cmp_gt_i64_e64 s[6:7], 0, v[10:11]
; GCN-NEXT:    v_cmp_gt_i64_e64 s[8:9], 0, v[12:13]
; GCN-NEXT:    v_cmp_gt_i64_e64 s[10:11], 0, v[14:15]
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    v_cmp_gt_i64_e64 s[12:13], 0, v[16:17]
; GCN-NEXT:    v_cmp_gt_i64_e64 s[14:15], 0, v[18:19]
; GCN-NEXT:    v_cmp_gt_i64_e64 s[16:17], 0, v[4:5]
; GCN-NEXT:    v_cndmask_b32_e64 v0, v1, -1, s[16:17]
; GCN-NEXT:    v_cndmask_b32_e64 v2, v1, -1, vcc
; GCN-NEXT:    v_cndmask_b32_e64 v4, v1, -1, s[4:5]
; GCN-NEXT:    v_cndmask_b32_e64 v6, v1, -1, s[6:7]
; GCN-NEXT:    v_cndmask_b32_e64 v8, v1, -1, s[8:9]
; GCN-NEXT:    v_cndmask_b32_e64 v10, v1, -1, s[10:11]
; GCN-NEXT:    v_cndmask_b32_e64 v12, v1, -1, s[12:13]
; GCN-NEXT:    v_cndmask_b32_e64 v14, v1, -1, s[14:15]
; GCN-NEXT:    v_mov_b32_e32 v1, -1
; GCN-NEXT:    v_mov_b32_e32 v3, -1
; GCN-NEXT:    v_mov_b32_e32 v5, -1
; GCN-NEXT:    v_mov_b32_e32 v7, -1
; GCN-NEXT:    v_mov_b32_e32 v9, -1
; GCN-NEXT:    v_mov_b32_e32 v11, -1
; GCN-NEXT:    v_mov_b32_e32 v13, -1
; GCN-NEXT:    v_mov_b32_e32 v15, -1
; GCN-NEXT:    s_setpc_b64 s[30:31]
  br i1 %c0, label %T, label %F

T:
  %t = load volatile <16 x i64>, ptr addrspace(1) %p0
  br label %exit

F:
  %f = load volatile <16 x i64>, ptr addrspace(1) %p1
  br label %exit

exit:
  %m = phi <16 x i64> [ %t, %T ], [ %f, %F ]
  %v2 = shufflevector <16 x i64> %m, <16 x i64> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %b2 = icmp sgt <8 x i64> %v2, <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1>
  %r2 = select <8 x i1> %b2, <8 x i64> <i64 -32768, i64 -32768, i64 -32768, i64 -32768, i64 -32768, i64 -32768, i64 -32768, i64 -32768>, <8 x i64> <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1>
  ret <8 x i64> %r2
}

; Extracts the low <2 x double> subvector (lanes 0 and 1) of an <8 x double>
; value that reaches %exit through a phi of two volatile loads: from %p0 when
; %c0 is true, from %p1 otherwise. Each extracted lane is mapped to -2.0 when
; it compares ordered-greater-than -1.0 and to -1.0 otherwise.
; The shuffle mask only selects lanes of %m, so the second shuffle operand is a
; pure placeholder; it is spelled poison rather than the deprecated undef.
define <2 x double> @extract_2xf64(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i1 %c0) {
; GCN-LABEL: extract_2xf64:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_and_b32_e32 v4, 1, v4
; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v4
; GCN-NEXT:    s_xor_b64 s[4:5], vcc, -1
; GCN-NEXT:    ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19
; GCN-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
; GCN-NEXT:    s_xor_b64 s[4:5], exec, s[6:7]
; GCN-NEXT:    s_cbranch_execz .LBB4_2
; GCN-NEXT:  ; %bb.1: ; %F
; GCN-NEXT:    s_mov_b32 s10, 0
; GCN-NEXT:    s_mov_b32 s11, 0xf000
; GCN-NEXT:    s_mov_b32 s8, s10
; GCN-NEXT:    s_mov_b32 s9, s10
; GCN-NEXT:    buffer_load_dwordx4 v[4:7], v[2:3], s[8:11], 0 addr64 glc
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    buffer_load_dwordx4 v[8:11], v[2:3], s[8:11], 0 addr64 offset:16 glc
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    buffer_load_dwordx4 v[8:11], v[2:3], s[8:11], 0 addr64 offset:32 glc
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    buffer_load_dwordx4 v[0:3], v[2:3], s[8:11], 0 addr64 offset:48 glc
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; implicit-def: $vgpr0
; GCN-NEXT:  .LBB4_2: ; %Flow
; GCN-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
; GCN-NEXT:    s_cbranch_execz .LBB4_4
; GCN-NEXT:  ; %bb.3: ; %T
; GCN-NEXT:    s_mov_b32 s10, 0
; GCN-NEXT:    s_mov_b32 s11, 0xf000
; GCN-NEXT:    s_mov_b32 s8, s10
; GCN-NEXT:    s_mov_b32 s9, s10
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[8:11], 0 addr64 glc
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[8:11], 0 addr64 offset:16 glc
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[8:11], 0 addr64 offset:32 glc
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64 offset:48 glc
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:  .LBB4_4: ; %exit
; GCN-NEXT:    s_or_b64 exec, exec, s[4:5]
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    v_mov_b32_e32 v0, 0xbff00000
; GCN-NEXT:    v_cmp_lt_f64_e32 vcc, -1.0, v[4:5]
; GCN-NEXT:    v_cndmask_b32_e64 v1, v0, -2.0, vcc
; GCN-NEXT:    v_cmp_lt_f64_e32 vcc, -1.0, v[6:7]
; GCN-NEXT:    v_cndmask_b32_e64 v3, v0, -2.0, vcc
; GCN-NEXT:    v_mov_b32_e32 v0, 0
; GCN-NEXT:    v_mov_b32_e32 v2, 0
; GCN-NEXT:    s_setpc_b64 s[30:31]
  br i1 %c0, label %T, label %F

T:
  %t = load volatile <8 x double>, ptr addrspace(1) %p0
  br label %exit

F:
  %f = load volatile <8 x double>, ptr addrspace(1) %p1
  br label %exit

exit:
  %m = phi <8 x double> [ %t, %T ], [ %f, %F ]
  %v2 = shufflevector <8 x double> %m, <8 x double> poison, <2 x i32> <i32 0, i32 1>
  %b2 = fcmp ogt <2 x double> %v2, <double -1.0, double -1.0>
  %r2 = select <2 x i1> %b2, <2 x double> <double -2.0, double -2.0>, <2 x double> <double -1.0, double -1.0>
  ret <2 x double> %r2
}

; Extracts the low <4 x double> subvector (lanes 0 through 3) of an
; <8 x double> value that reaches %exit through a phi of two volatile loads:
; from %p0 when %c0 is true, from %p1 otherwise. Each extracted lane is mapped
; to -2.0 when it compares ordered-greater-than -1.0 and to -1.0 otherwise.
; The shuffle mask only selects lanes of %m, so the second shuffle operand is a
; pure placeholder; it is spelled poison rather than the deprecated undef.
define <4 x double> @extract_4xf64(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i1 %c0) {
; GCN-LABEL: extract_4xf64:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_and_b32_e32 v4, 1, v4
; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v4
; GCN-NEXT:    s_xor_b64 s[4:5], vcc, -1
; GCN-NEXT:    ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19
; GCN-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
; GCN-NEXT:    s_xor_b64 s[4:5], exec, s[6:7]
; GCN-NEXT:    s_cbranch_execz .LBB5_2
; GCN-NEXT:  ; %bb.1: ; %F
; GCN-NEXT:    s_mov_b32 s10, 0
; GCN-NEXT:    s_mov_b32 s11, 0xf000
; GCN-NEXT:    s_mov_b32 s8, s10
; GCN-NEXT:    s_mov_b32 s9, s10
; GCN-NEXT:    buffer_load_dwordx4 v[4:7], v[2:3], s[8:11], 0 addr64 glc
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    buffer_load_dwordx4 v[8:11], v[2:3], s[8:11], 0 addr64 offset:16 glc
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    buffer_load_dwordx4 v[12:15], v[2:3], s[8:11], 0 addr64 offset:32 glc
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    buffer_load_dwordx4 v[0:3], v[2:3], s[8:11], 0 addr64 offset:48 glc
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; implicit-def: $vgpr0
; GCN-NEXT:  .LBB5_2: ; %Flow
; GCN-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
; GCN-NEXT:    s_cbranch_execz .LBB5_4
; GCN-NEXT:  ; %bb.3: ; %T
; GCN-NEXT:    s_mov_b32 s10, 0
; GCN-NEXT:    s_mov_b32 s11, 0xf000
; GCN-NEXT:    s_mov_b32 s8, s10
; GCN-NEXT:    s_mov_b32 s9, s10
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[8:11], 0 addr64 glc
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[8:11], 0 addr64 offset:16 glc
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    buffer_load_dwordx4 v[12:15], v[0:1], s[8:11], 0 addr64 offset:32 glc
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64 offset:48 glc
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:  .LBB5_4: ; %exit
; GCN-NEXT:    s_or_b64 exec, exec, s[4:5]
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    v_mov_b32_e32 v0, 0xbff00000
; GCN-NEXT:    v_cmp_nlt_f64_e32 vcc, -1.0, v[4:5]
; GCN-NEXT:    v_cndmask_b32_e32 v1, -2.0, v0, vcc
; GCN-NEXT:    v_cmp_nlt_f64_e32 vcc, -1.0, v[6:7]
; GCN-NEXT:    v_cndmask_b32_e32 v3, -2.0, v0, vcc
; GCN-NEXT:    v_cmp_nlt_f64_e32 vcc, -1.0, v[8:9]
; GCN-NEXT:    v_cndmask_b32_e32 v5, -2.0, v0, vcc
; GCN-NEXT:    v_cmp_nlt_f64_e32 vcc, -1.0, v[10:11]
; GCN-NEXT:    v_cndmask_b32_e32 v7, -2.0, v0, vcc
; GCN-NEXT:    v_mov_b32_e32 v0, 0
; GCN-NEXT:    v_mov_b32_e32 v2, 0
; GCN-NEXT:    v_mov_b32_e32 v4, 0
; GCN-NEXT:    v_mov_b32_e32 v6, 0
; GCN-NEXT:    s_setpc_b64 s[30:31]
  br i1 %c0, label %T, label %F

T:
  %t = load volatile <8 x double>, ptr addrspace(1) %p0
  br label %exit

F:
  %f = load volatile <8 x double>, ptr addrspace(1) %p1
  br label %exit

exit:
  %m = phi <8 x double> [ %t, %T ], [ %f, %F ]
  %v2 = shufflevector <8 x double> %m, <8 x double> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %b2 = fcmp ogt <4 x double> %v2, <double -1.0, double -1.0, double -1.0, double -1.0>
  %r2 = select <4 x i1> %b2, <4 x double> <double -2.0, double -2.0, double -2.0, double -2.0>, <4 x double> <double -1.0, double -1.0, double -1.0, double -1.0>
  ret <4 x double> %r2
}

; Extracts the low <8 x double> subvector (lanes 0 through 7) of a
; <16 x double> value that reaches %exit through a phi of two volatile loads:
; from %p0 when %c0 is true, from %p1 otherwise. Each extracted lane is mapped
; to -2.0 when it compares ordered-greater-than -1.0 and to -1.0 otherwise.
; The shuffle mask only selects lanes of %m, so the second shuffle operand is a
; pure placeholder; it is spelled poison rather than the deprecated undef.
define <8 x double> @extract_8xf64(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i1 %c0) {
; GCN-LABEL: extract_8xf64:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_and_b32_e32 v4, 1, v4
; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v4
; GCN-NEXT:    s_xor_b64 s[4:5], vcc, -1
; GCN-NEXT:    ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35
; GCN-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
; GCN-NEXT:    s_xor_b64 s[4:5], exec, s[6:7]
; GCN-NEXT:    s_cbranch_execz .LBB6_2
; GCN-NEXT:  ; %bb.1: ; %F
; GCN-NEXT:    s_mov_b32 s10, 0
; GCN-NEXT:    s_mov_b32 s11, 0xf000
; GCN-NEXT:    s_mov_b32 s8, s10
; GCN-NEXT:    s_mov_b32 s9, s10
; GCN-NEXT:    buffer_load_dwordx4 v[4:7], v[2:3], s[8:11], 0 addr64 offset:112 glc
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    buffer_load_dwordx4 v[4:7], v[2:3], s[8:11], 0 addr64 offset:96 glc
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    buffer_load_dwordx4 v[4:7], v[2:3], s[8:11], 0 addr64 offset:80 glc
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    buffer_load_dwordx4 v[4:7], v[2:3], s[8:11], 0 addr64 offset:64 glc
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    buffer_load_dwordx4 v[4:7], v[2:3], s[8:11], 0 addr64 glc
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    buffer_load_dwordx4 v[8:11], v[2:3], s[8:11], 0 addr64 offset:16 glc
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    buffer_load_dwordx4 v[12:15], v[2:3], s[8:11], 0 addr64 offset:32 glc
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    buffer_load_dwordx4 v[16:19], v[2:3], s[8:11], 0 addr64 offset:48 glc
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; implicit-def: $vgpr0
; GCN-NEXT:  .LBB6_2: ; %Flow
; GCN-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
; GCN-NEXT:    s_cbranch_execz .LBB6_4
; GCN-NEXT:  ; %bb.3: ; %T
; GCN-NEXT:    s_mov_b32 s10, 0
; GCN-NEXT:    s_mov_b32 s11, 0xf000
; GCN-NEXT:    s_mov_b32 s8, s10
; GCN-NEXT:    s_mov_b32 s9, s10
; GCN-NEXT:    buffer_load_dwordx4 v[2:5], v[0:1], s[8:11], 0 addr64 offset:112 glc
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    buffer_load_dwordx4 v[2:5], v[0:1], s[8:11], 0 addr64 offset:96 glc
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    buffer_load_dwordx4 v[2:5], v[0:1], s[8:11], 0 addr64 offset:80 glc
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    buffer_load_dwordx4 v[2:5], v[0:1], s[8:11], 0 addr64 offset:64 glc
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[8:11], 0 addr64 glc
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[8:11], 0 addr64 offset:16 glc
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    buffer_load_dwordx4 v[12:15], v[0:1], s[8:11], 0 addr64 offset:32 glc
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    buffer_load_dwordx4 v[16:19], v[0:1], s[8:11], 0 addr64 offset:48 glc
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:  .LBB6_4: ; %exit
; GCN-NEXT:    s_or_b64 exec, exec, s[4:5]
; GCN-NEXT:    v_mov_b32_e32 v0, 0xbff00000
; GCN-NEXT:    v_cmp_nlt_f64_e32 vcc, -1.0, v[6:7]
; GCN-NEXT:    v_cmp_nlt_f64_e64 s[4:5], -1.0, v[8:9]
; GCN-NEXT:    v_cmp_nlt_f64_e64 s[6:7], -1.0, v[10:11]
; GCN-NEXT:    v_cmp_nlt_f64_e64 s[8:9], -1.0, v[12:13]
; GCN-NEXT:    v_cmp_nlt_f64_e64 s[10:11], -1.0, v[14:15]
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    v_cmp_nlt_f64_e64 s[12:13], -1.0, v[16:17]
; GCN-NEXT:    v_cmp_nlt_f64_e64 s[14:15], -1.0, v[18:19]
; GCN-NEXT:    v_cmp_nlt_f64_e64 s[16:17], -1.0, v[4:5]
; GCN-NEXT:    v_cndmask_b32_e64 v1, -2.0, v0, s[16:17]
; GCN-NEXT:    v_cndmask_b32_e32 v3, -2.0, v0, vcc
; GCN-NEXT:    v_cndmask_b32_e64 v5, -2.0, v0, s[4:5]
; GCN-NEXT:    v_cndmask_b32_e64 v7, -2.0, v0, s[6:7]
; GCN-NEXT:    v_cndmask_b32_e64 v9, -2.0, v0, s[8:9]
; GCN-NEXT:    v_cndmask_b32_e64 v11, -2.0, v0, s[10:11]
; GCN-NEXT:    v_cndmask_b32_e64 v13, -2.0, v0, s[12:13]
; GCN-NEXT:    v_cndmask_b32_e64 v15, -2.0, v0, s[14:15]
; GCN-NEXT:    v_mov_b32_e32 v0, 0
; GCN-NEXT:    v_mov_b32_e32 v2, 0
; GCN-NEXT:    v_mov_b32_e32 v4, 0
; GCN-NEXT:    v_mov_b32_e32 v6, 0
; GCN-NEXT:    v_mov_b32_e32 v8, 0
; GCN-NEXT:    v_mov_b32_e32 v10, 0
; GCN-NEXT:    v_mov_b32_e32 v12, 0
; GCN-NEXT:    v_mov_b32_e32 v14, 0
; GCN-NEXT:    s_setpc_b64 s[30:31]
  br i1 %c0, label %T, label %F

T:
  %t = load volatile <16 x double>, ptr addrspace(1) %p0
  br label %exit

F:
  %f = load volatile <16 x double>, ptr addrspace(1) %p1
  br label %exit

exit:
  %m = phi <16 x double> [ %t, %T ], [ %f, %F ]
  %v2 = shufflevector <16 x double> %m, <16 x double> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %b2 = fcmp ogt <8 x double> %v2, <double -1.0, double -1.0, double -1.0, double -1.0, double -1.0, double -1.0, double -1.0, double -1.0>
  %r2 = select <8 x i1> %b2, <8 x double> <double -2.0, double -2.0, double -2.0, double -2.0, double -2.0, double -2.0, double -2.0, double -2.0>, <8 x double> <double -1.0, double -1.0, double -1.0, double -1.0, double -1.0, double -1.0, double -1.0, double -1.0>
  ret <8 x double> %r2
}