; llvm/llvm/test/CodeGen/AMDGPU/GlobalISel/add.vni16.ll

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=fiji < %s | FileCheck -check-prefix=GFX8 %s
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s

; <3 x i16> add where both operands are loaded from, and the sum is stored to,
; addrspace(1) pointers with align 4.
; Expected fiji output: three 16-bit loads per operand, three v_add_u16 and
; three 16-bit stores.
; Expected gfx900 output: 16-bit loads, elements 0-1 packed into one register
; (v_and_b32 + v_lshl_or_b32) and added with v_pk_add_u16; element 2 gets its
; own v_pk_add_u16.
define void @add_v3i16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb, ptr addrspace(1) %ptr2) {
; GFX8-LABEL: add_v3i16:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_add_u32_e32 v6, vcc, 2, v0
; GFX8-NEXT:    v_addc_u32_e32 v7, vcc, 0, v1, vcc
; GFX8-NEXT:    flat_load_ushort v8, v[0:1]
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 4, v0
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    flat_load_ushort v9, v[6:7]
; GFX8-NEXT:    flat_load_ushort v10, v[0:1]
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 2, v2
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v3, vcc
; GFX8-NEXT:    v_add_u32_e32 v6, vcc, 4, v2
; GFX8-NEXT:    v_addc_u32_e32 v7, vcc, 0, v3, vcc
; GFX8-NEXT:    flat_load_ushort v11, v[2:3]
; GFX8-NEXT:    flat_load_ushort v12, v[0:1]
; GFX8-NEXT:    flat_load_ushort v6, v[6:7]
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 2, v4
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v5, vcc
; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 4, v4
; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v5, vcc
; GFX8-NEXT:    s_waitcnt vmcnt(2)
; GFX8-NEXT:    v_add_u16_e32 v7, v8, v11
; GFX8-NEXT:    s_waitcnt vmcnt(1)
; GFX8-NEXT:    v_add_u16_e32 v8, v9, v12
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_add_u16_e32 v6, v10, v6
; GFX8-NEXT:    flat_store_short v[4:5], v7
; GFX8-NEXT:    flat_store_short v[0:1], v8
; GFX8-NEXT:    flat_store_short v[2:3], v6
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: add_v3i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_ushort v6, v[0:1], off
; GFX9-NEXT:    global_load_ushort v7, v[0:1], off offset:4
; GFX9-NEXT:    global_load_ushort v8, v[2:3], off
; GFX9-NEXT:    global_load_ushort v9, v[2:3], off offset:4
; GFX9-NEXT:    global_load_ushort v10, v[0:1], off offset:2
; GFX9-NEXT:    global_load_ushort v11, v[2:3], off offset:2
; GFX9-NEXT:    s_waitcnt vmcnt(5)
; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v6
; GFX9-NEXT:    s_waitcnt vmcnt(3)
; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v8
; GFX9-NEXT:    s_waitcnt vmcnt(2)
; GFX9-NEXT:    v_pk_add_u16 v2, v7, v9
; GFX9-NEXT:    s_waitcnt vmcnt(1)
; GFX9-NEXT:    v_lshl_or_b32 v0, v10, 16, v0
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_lshl_or_b32 v1, v11, 16, v1
; GFX9-NEXT:    v_pk_add_u16 v0, v0, v1
; GFX9-NEXT:    global_store_short v[4:5], v0, off
; GFX9-NEXT:    global_store_short_d16_hi v[4:5], v0, off offset:2
; GFX9-NEXT:    global_store_short v[4:5], v2, off offset:4
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %a = load <3 x i16>, ptr addrspace(1) %ptra, align 4
  %b = load <3 x i16>, ptr addrspace(1) %ptrb, align 4
  %add = add <3 x i16> %a, %b
  store <3 x i16> %add, ptr addrspace(1) %ptr2, align 4
  ret void
}

; <3 x i16> add on by-value arguments with the sum returned by value; the
; expected output contains no loads or stores.
; Expected fiji output: elements 0-1 are added as a low/high pair (plain
; v_add_u16 plus an SDWA add on WORD_1, merged with v_or_b32); element 2 uses
; one plain v_add_u16.
; Expected gfx900 output: two v_pk_add_u16.
define <3 x i16> @add_v3i16_arg(<3 x i16> %a, <3 x i16> %b) {
; GFX8-LABEL: add_v3i16_arg:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_add_u16_e32 v4, v0, v2
; GFX8-NEXT:    v_add_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT:    v_or_b32_e32 v0, v4, v0
; GFX8-NEXT:    v_add_u16_e32 v1, v1, v3
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: add_v3i16_arg:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_pk_add_u16 v0, v0, v2
; GFX9-NEXT:    v_pk_add_u16 v1, v1, v3
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %add = add <3 x i16> %a, %b
  ret <3 x i16> %add
}

; <4 x i16> add where both operands are loaded from, and the sum is stored to,
; addrspace(1) pointers with align 4.
; Both expected outputs use one dwordx2 load per operand and one dwordx2 store.
; Expected fiji output: four 16-bit adds (the high halves via SDWA) merged
; with two v_or_b32.
; Expected gfx900 output: two v_pk_add_u16.
define void @add_v4i16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb, ptr addrspace(1) %ptr2) {
; GFX8-LABEL: add_v4i16:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
; GFX8-NEXT:    flat_load_dwordx2 v[2:3], v[2:3]
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_add_u16_e32 v6, v0, v2
; GFX8-NEXT:    v_add_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT:    v_add_u16_e32 v2, v1, v3
; GFX8-NEXT:    v_add_u16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT:    v_or_b32_e32 v0, v6, v0
; GFX8-NEXT:    v_or_b32_e32 v1, v2, v1
; GFX8-NEXT:    flat_store_dwordx2 v[4:5], v[0:1]
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: add_v4i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dwordx2 v[6:7], v[0:1], off
; GFX9-NEXT:    global_load_dwordx2 v[8:9], v[2:3], off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_pk_add_u16 v0, v6, v8
; GFX9-NEXT:    v_pk_add_u16 v1, v7, v9
; GFX9-NEXT:    global_store_dwordx2 v[4:5], v[0:1], off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %a = load <4 x i16>, ptr addrspace(1) %ptra, align 4
  %b = load <4 x i16>, ptr addrspace(1) %ptrb, align 4
  %add = add <4 x i16> %a, %b
  store <4 x i16> %add, ptr addrspace(1) %ptr2, align 4
  ret void
}

; <4 x i16> add on by-value arguments with the sum returned by value; the
; expected output contains no loads or stores.
; Expected fiji output: two low/high pairs, each a plain v_add_u16 plus an
; SDWA add on WORD_1, merged with v_or_b32.
; Expected gfx900 output: two v_pk_add_u16.
define <4 x i16> @add_v4i16_arg(<4 x i16> %a, <4 x i16> %b) {
; GFX8-LABEL: add_v4i16_arg:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_add_u16_e32 v4, v0, v2
; GFX8-NEXT:    v_add_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT:    v_add_u16_e32 v2, v1, v3
; GFX8-NEXT:    v_add_u16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT:    v_or_b32_e32 v0, v4, v0
; GFX8-NEXT:    v_or_b32_e32 v1, v2, v1
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: add_v4i16_arg:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_pk_add_u16 v0, v0, v2
; GFX9-NEXT:    v_pk_add_u16 v1, v1, v3
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %add = add <4 x i16> %a, %b
  ret <4 x i16> %add
}

; <5 x i16> add where both operands are loaded from, and the sum is stored to,
; addrspace(1) pointers with align 4.
; Expected fiji output: five 16-bit loads per operand, five v_add_u16 and five
; 16-bit stores.
; Expected gfx900 output: five 16-bit loads per operand, elements 0-1 and 2-3
; packed into registers (v_and_b32 + v_lshl_or_b32), three v_pk_add_u16, and
; five 16-bit stores (two of them the d16_hi form).
define void @add_v5i16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb, ptr addrspace(1) %ptr2) {
; GFX8-LABEL: add_v5i16:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_add_u32_e32 v6, vcc, 2, v0
; GFX8-NEXT:    v_addc_u32_e32 v7, vcc, 0, v1, vcc
; GFX8-NEXT:    v_add_u32_e32 v8, vcc, 4, v0
; GFX8-NEXT:    v_addc_u32_e32 v9, vcc, 0, v1, vcc
; GFX8-NEXT:    v_add_u32_e32 v10, vcc, 6, v0
; GFX8-NEXT:    v_addc_u32_e32 v11, vcc, 0, v1, vcc
; GFX8-NEXT:    flat_load_ushort v12, v[0:1]
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 8, v0
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    flat_load_ushort v13, v[6:7]
; GFX8-NEXT:    flat_load_ushort v14, v[8:9]
; GFX8-NEXT:    flat_load_ushort v15, v[10:11]
; GFX8-NEXT:    flat_load_ushort v16, v[0:1]
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 2, v2
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v3, vcc
; GFX8-NEXT:    v_add_u32_e32 v6, vcc, 4, v2
; GFX8-NEXT:    v_addc_u32_e32 v7, vcc, 0, v3, vcc
; GFX8-NEXT:    v_add_u32_e32 v8, vcc, 6, v2
; GFX8-NEXT:    v_addc_u32_e32 v9, vcc, 0, v3, vcc
; GFX8-NEXT:    v_add_u32_e32 v10, vcc, 8, v2
; GFX8-NEXT:    v_addc_u32_e32 v11, vcc, 0, v3, vcc
; GFX8-NEXT:    flat_load_ushort v17, v[2:3]
; GFX8-NEXT:    flat_load_ushort v18, v[0:1]
; GFX8-NEXT:    flat_load_ushort v19, v[6:7]
; GFX8-NEXT:    flat_load_ushort v20, v[8:9]
; GFX8-NEXT:    flat_load_ushort v10, v[10:11]
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 2, v4
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v5, vcc
; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 4, v4
; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v5, vcc
; GFX8-NEXT:    v_add_u32_e32 v6, vcc, 6, v4
; GFX8-NEXT:    v_addc_u32_e32 v7, vcc, 0, v5, vcc
; GFX8-NEXT:    v_add_u32_e32 v8, vcc, 8, v4
; GFX8-NEXT:    v_addc_u32_e32 v9, vcc, 0, v5, vcc
; GFX8-NEXT:    s_waitcnt vmcnt(4)
; GFX8-NEXT:    v_add_u16_e32 v11, v12, v17
; GFX8-NEXT:    s_waitcnt vmcnt(3)
; GFX8-NEXT:    v_add_u16_e32 v12, v13, v18
; GFX8-NEXT:    s_waitcnt vmcnt(2)
; GFX8-NEXT:    v_add_u16_e32 v13, v14, v19
; GFX8-NEXT:    s_waitcnt vmcnt(1)
; GFX8-NEXT:    v_add_u16_e32 v14, v15, v20
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_add_u16_e32 v10, v16, v10
; GFX8-NEXT:    flat_store_short v[4:5], v11
; GFX8-NEXT:    flat_store_short v[0:1], v12
; GFX8-NEXT:    flat_store_short v[2:3], v13
; GFX8-NEXT:    flat_store_short v[6:7], v14
; GFX8-NEXT:    flat_store_short v[8:9], v10
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: add_v5i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_ushort v6, v[0:1], off
; GFX9-NEXT:    global_load_ushort v7, v[0:1], off offset:4
; GFX9-NEXT:    global_load_ushort v8, v[0:1], off offset:8
; GFX9-NEXT:    global_load_ushort v9, v[2:3], off
; GFX9-NEXT:    global_load_ushort v10, v[2:3], off offset:4
; GFX9-NEXT:    global_load_ushort v11, v[2:3], off offset:8
; GFX9-NEXT:    global_load_ushort v12, v[0:1], off offset:2
; GFX9-NEXT:    global_load_ushort v13, v[0:1], off offset:6
; GFX9-NEXT:    global_load_ushort v14, v[2:3], off offset:2
; GFX9-NEXT:    global_load_ushort v15, v[2:3], off offset:6
; GFX9-NEXT:    s_waitcnt vmcnt(9)
; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v6
; GFX9-NEXT:    s_waitcnt vmcnt(8)
; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v7
; GFX9-NEXT:    s_waitcnt vmcnt(6)
; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff, v9
; GFX9-NEXT:    s_waitcnt vmcnt(5)
; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff, v10
; GFX9-NEXT:    s_waitcnt vmcnt(4)
; GFX9-NEXT:    v_pk_add_u16 v6, v8, v11
; GFX9-NEXT:    s_waitcnt vmcnt(3)
; GFX9-NEXT:    v_lshl_or_b32 v0, v12, 16, v0
; GFX9-NEXT:    s_waitcnt vmcnt(2)
; GFX9-NEXT:    v_lshl_or_b32 v1, v13, 16, v1
; GFX9-NEXT:    s_waitcnt vmcnt(1)
; GFX9-NEXT:    v_lshl_or_b32 v2, v14, 16, v2
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_lshl_or_b32 v3, v15, 16, v3
; GFX9-NEXT:    v_pk_add_u16 v0, v0, v2
; GFX9-NEXT:    v_pk_add_u16 v1, v1, v3
; GFX9-NEXT:    global_store_short v[4:5], v0, off
; GFX9-NEXT:    global_store_short_d16_hi v[4:5], v0, off offset:2
; GFX9-NEXT:    global_store_short v[4:5], v1, off offset:4
; GFX9-NEXT:    global_store_short_d16_hi v[4:5], v1, off offset:6
; GFX9-NEXT:    global_store_short v[4:5], v6, off offset:8
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %a = load <5 x i16>, ptr addrspace(1) %ptra, align 4
  %b = load <5 x i16>, ptr addrspace(1) %ptrb, align 4
  %add = add <5 x i16> %a, %b
  store <5 x i16> %add, ptr addrspace(1) %ptr2, align 4
  ret void
}

; <5 x i16> add on by-value arguments with the sum returned by value; the
; expected output contains no loads or stores.
; Expected fiji output: two low/high pairs (plain v_add_u16 plus an SDWA add
; on WORD_1, merged with v_or_b32) and one plain v_add_u16 for element 4.
; Expected gfx900 output: three v_pk_add_u16.
define <5 x i16> @add_v5i16_arg(<5 x i16> %a, <5 x i16> %b) {
; GFX8-LABEL: add_v5i16_arg:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_add_u16_e32 v6, v0, v3
; GFX8-NEXT:    v_add_u16_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT:    v_add_u16_e32 v3, v1, v4
; GFX8-NEXT:    v_add_u16_sdwa v1, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT:    v_or_b32_e32 v0, v6, v0
; GFX8-NEXT:    v_or_b32_e32 v1, v3, v1
; GFX8-NEXT:    v_add_u16_e32 v2, v2, v5
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: add_v5i16_arg:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_pk_add_u16 v0, v0, v3
; GFX9-NEXT:    v_pk_add_u16 v1, v1, v4
; GFX9-NEXT:    v_pk_add_u16 v2, v2, v5
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %add = add <5 x i16> %a, %b
  ret <5 x i16> %add
}

; <6 x i16> add where both operands are loaded from, and the sum is stored to,
; addrspace(1) pointers with align 4.
; Both expected outputs use one dwordx3 load per operand and one dwordx3 store.
; Expected fiji output: six 16-bit adds (the high halves via SDWA) merged with
; three v_or_b32.
; Expected gfx900 output: three v_pk_add_u16.
define void @add_v6i16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb, ptr addrspace(1) %ptr2) {
; GFX8-LABEL: add_v6i16:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    flat_load_dwordx3 v[6:8], v[0:1]
; GFX8-NEXT:    flat_load_dwordx3 v[0:2], v[2:3]
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_add_u16_e32 v3, v6, v0
; GFX8-NEXT:    v_add_u16_sdwa v0, v6, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT:    v_add_u16_e32 v6, v7, v1
; GFX8-NEXT:    v_add_u16_sdwa v1, v7, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT:    v_add_u16_e32 v7, v8, v2
; GFX8-NEXT:    v_add_u16_sdwa v2, v8, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT:    v_or_b32_e32 v0, v3, v0
; GFX8-NEXT:    v_or_b32_e32 v1, v6, v1
; GFX8-NEXT:    v_or_b32_e32 v2, v7, v2
; GFX8-NEXT:    flat_store_dwordx3 v[4:5], v[0:2]
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: add_v6i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dwordx3 v[6:8], v[0:1], off
; GFX9-NEXT:    global_load_dwordx3 v[9:11], v[2:3], off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_pk_add_u16 v0, v6, v9
; GFX9-NEXT:    v_pk_add_u16 v1, v7, v10
; GFX9-NEXT:    v_pk_add_u16 v2, v8, v11
; GFX9-NEXT:    global_store_dwordx3 v[4:5], v[0:2], off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %a = load <6 x i16>, ptr addrspace(1) %ptra, align 4
  %b = load <6 x i16>, ptr addrspace(1) %ptrb, align 4
  %add = add <6 x i16> %a, %b
  store <6 x i16> %add, ptr addrspace(1) %ptr2, align 4
  ret void
}

; <6 x i16> add on by-value arguments with the sum returned by value; the
; expected output contains no loads or stores.
; Expected fiji output: three low/high pairs, each a plain v_add_u16 plus an
; SDWA add on WORD_1, merged with v_or_b32.
; Expected gfx900 output: three v_pk_add_u16.
define <6 x i16> @add_v6i16_arg(<6 x i16> %a, <6 x i16> %b) {
; GFX8-LABEL: add_v6i16_arg:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_add_u16_e32 v6, v0, v3
; GFX8-NEXT:    v_add_u16_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT:    v_add_u16_e32 v3, v1, v4
; GFX8-NEXT:    v_add_u16_sdwa v1, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT:    v_or_b32_e32 v1, v3, v1
; GFX8-NEXT:    v_add_u16_e32 v3, v2, v5
; GFX8-NEXT:    v_add_u16_sdwa v2, v2, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT:    v_or_b32_e32 v0, v6, v0
; GFX8-NEXT:    v_or_b32_e32 v2, v3, v2
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: add_v6i16_arg:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_pk_add_u16 v0, v0, v3
; GFX9-NEXT:    v_pk_add_u16 v1, v1, v4
; GFX9-NEXT:    v_pk_add_u16 v2, v2, v5
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %add = add <6 x i16> %a, %b
  ret <6 x i16> %add
}

; <7 x i16> add where both operands are loaded from, and the sum is stored to,
; addrspace(1) pointers with align 4.
; Expected fiji output: seven 16-bit loads per operand, seven v_add_u16 and
; seven 16-bit stores.
; Expected gfx900 output: seven 16-bit loads per operand, elements 0-1, 2-3
; and 4-5 packed into registers (v_and_b32 + v_lshl_or_b32), four
; v_pk_add_u16, and seven 16-bit stores (three of them the d16_hi form).
; NOTE(review): the name "addv_7i16" departs from the "add_v<N>i16" spelling
; used by the other functions in this file. Renaming it would also require
; updating both -LABEL check lines below.
define void @addv_7i16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb, ptr addrspace(1) %ptr2) {
; GFX8-LABEL: addv_7i16:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_add_u32_e32 v6, vcc, 2, v0
; GFX8-NEXT:    v_addc_u32_e32 v7, vcc, 0, v1, vcc
; GFX8-NEXT:    v_add_u32_e32 v8, vcc, 4, v0
; GFX8-NEXT:    v_addc_u32_e32 v9, vcc, 0, v1, vcc
; GFX8-NEXT:    v_add_u32_e32 v10, vcc, 6, v0
; GFX8-NEXT:    v_addc_u32_e32 v11, vcc, 0, v1, vcc
; GFX8-NEXT:    v_add_u32_e32 v12, vcc, 8, v0
; GFX8-NEXT:    v_addc_u32_e32 v13, vcc, 0, v1, vcc
; GFX8-NEXT:    v_add_u32_e32 v14, vcc, 10, v0
; GFX8-NEXT:    v_addc_u32_e32 v15, vcc, 0, v1, vcc
; GFX8-NEXT:    flat_load_ushort v16, v[0:1]
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 12, v0
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    flat_load_ushort v17, v[6:7]
; GFX8-NEXT:    flat_load_ushort v18, v[8:9]
; GFX8-NEXT:    flat_load_ushort v19, v[10:11]
; GFX8-NEXT:    flat_load_ushort v20, v[12:13]
; GFX8-NEXT:    flat_load_ushort v21, v[14:15]
; GFX8-NEXT:    flat_load_ushort v22, v[0:1]
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 2, v2
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v3, vcc
; GFX8-NEXT:    v_add_u32_e32 v6, vcc, 4, v2
; GFX8-NEXT:    v_addc_u32_e32 v7, vcc, 0, v3, vcc
; GFX8-NEXT:    v_add_u32_e32 v8, vcc, 6, v2
; GFX8-NEXT:    v_addc_u32_e32 v9, vcc, 0, v3, vcc
; GFX8-NEXT:    v_add_u32_e32 v10, vcc, 8, v2
; GFX8-NEXT:    v_addc_u32_e32 v11, vcc, 0, v3, vcc
; GFX8-NEXT:    v_add_u32_e32 v12, vcc, 10, v2
; GFX8-NEXT:    v_addc_u32_e32 v13, vcc, 0, v3, vcc
; GFX8-NEXT:    v_add_u32_e32 v14, vcc, 12, v2
; GFX8-NEXT:    v_addc_u32_e32 v15, vcc, 0, v3, vcc
; GFX8-NEXT:    flat_load_ushort v2, v[2:3]
; GFX8-NEXT:    flat_load_ushort v3, v[0:1]
; GFX8-NEXT:    flat_load_ushort v6, v[6:7]
; GFX8-NEXT:    flat_load_ushort v7, v[8:9]
; GFX8-NEXT:    flat_load_ushort v8, v[10:11]
; GFX8-NEXT:    flat_load_ushort v9, v[12:13]
; GFX8-NEXT:    flat_load_ushort v10, v[14:15]
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 2, v4
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v5, vcc
; GFX8-NEXT:    s_waitcnt vmcnt(6)
; GFX8-NEXT:    v_add_u16_e32 v2, v16, v2
; GFX8-NEXT:    s_waitcnt vmcnt(5)
; GFX8-NEXT:    v_add_u16_e32 v3, v17, v3
; GFX8-NEXT:    flat_store_short v[4:5], v2
; GFX8-NEXT:    flat_store_short v[0:1], v3
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 4, v4
; GFX8-NEXT:    s_waitcnt vmcnt(6)
; GFX8-NEXT:    v_add_u16_e32 v6, v18, v6
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v5, vcc
; GFX8-NEXT:    flat_store_short v[0:1], v6
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 6, v4
; GFX8-NEXT:    s_waitcnt vmcnt(6)
; GFX8-NEXT:    v_add_u16_e32 v7, v19, v7
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v5, vcc
; GFX8-NEXT:    flat_store_short v[0:1], v7
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 8, v4
; GFX8-NEXT:    s_waitcnt vmcnt(6)
; GFX8-NEXT:    v_add_u16_e32 v8, v20, v8
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v5, vcc
; GFX8-NEXT:    flat_store_short v[0:1], v8
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 10, v4
; GFX8-NEXT:    s_waitcnt vmcnt(6)
; GFX8-NEXT:    v_add_u16_e32 v9, v21, v9
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v5, vcc
; GFX8-NEXT:    flat_store_short v[0:1], v9
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 12, v4
; GFX8-NEXT:    s_waitcnt vmcnt(6)
; GFX8-NEXT:    v_add_u16_e32 v10, v22, v10
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v5, vcc
; GFX8-NEXT:    flat_store_short v[0:1], v10
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: addv_7i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_ushort v6, v[0:1], off
; GFX9-NEXT:    global_load_ushort v7, v[0:1], off offset:4
; GFX9-NEXT:    global_load_ushort v8, v[0:1], off offset:8
; GFX9-NEXT:    global_load_ushort v9, v[0:1], off offset:12
; GFX9-NEXT:    global_load_ushort v10, v[2:3], off
; GFX9-NEXT:    global_load_ushort v11, v[2:3], off offset:4
; GFX9-NEXT:    global_load_ushort v12, v[2:3], off offset:8
; GFX9-NEXT:    global_load_ushort v13, v[2:3], off offset:12
; GFX9-NEXT:    global_load_ushort v14, v[0:1], off offset:2
; GFX9-NEXT:    global_load_ushort v15, v[0:1], off offset:6
; GFX9-NEXT:    global_load_ushort v16, v[0:1], off offset:10
; GFX9-NEXT:    global_load_ushort v17, v[2:3], off offset:2
; GFX9-NEXT:    global_load_ushort v18, v[2:3], off offset:6
; GFX9-NEXT:    global_load_ushort v19, v[2:3], off offset:10
; GFX9-NEXT:    s_waitcnt vmcnt(13)
; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v6
; GFX9-NEXT:    s_waitcnt vmcnt(12)
; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v7
; GFX9-NEXT:    s_waitcnt vmcnt(11)
; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff, v8
; GFX9-NEXT:    s_waitcnt vmcnt(9)
; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff, v10
; GFX9-NEXT:    s_waitcnt vmcnt(8)
; GFX9-NEXT:    v_and_b32_e32 v6, 0xffff, v11
; GFX9-NEXT:    s_waitcnt vmcnt(7)
; GFX9-NEXT:    v_and_b32_e32 v7, 0xffff, v12
; GFX9-NEXT:    s_waitcnt vmcnt(6)
; GFX9-NEXT:    v_pk_add_u16 v8, v9, v13
; GFX9-NEXT:    s_waitcnt vmcnt(5)
; GFX9-NEXT:    v_lshl_or_b32 v0, v14, 16, v0
; GFX9-NEXT:    s_waitcnt vmcnt(4)
; GFX9-NEXT:    v_lshl_or_b32 v1, v15, 16, v1
; GFX9-NEXT:    s_waitcnt vmcnt(3)
; GFX9-NEXT:    v_lshl_or_b32 v2, v16, 16, v2
; GFX9-NEXT:    s_waitcnt vmcnt(2)
; GFX9-NEXT:    v_lshl_or_b32 v3, v17, 16, v3
; GFX9-NEXT:    s_waitcnt vmcnt(1)
; GFX9-NEXT:    v_lshl_or_b32 v6, v18, 16, v6
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_lshl_or_b32 v7, v19, 16, v7
; GFX9-NEXT:    v_pk_add_u16 v0, v0, v3
; GFX9-NEXT:    v_pk_add_u16 v1, v1, v6
; GFX9-NEXT:    v_pk_add_u16 v2, v2, v7
; GFX9-NEXT:    global_store_short v[4:5], v0, off
; GFX9-NEXT:    global_store_short_d16_hi v[4:5], v0, off offset:2
; GFX9-NEXT:    global_store_short v[4:5], v1, off offset:4
; GFX9-NEXT:    global_store_short_d16_hi v[4:5], v1, off offset:6
; GFX9-NEXT:    global_store_short v[4:5], v2, off offset:8
; GFX9-NEXT:    global_store_short_d16_hi v[4:5], v2, off offset:10
; GFX9-NEXT:    global_store_short v[4:5], v8, off offset:12
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %a = load <7 x i16>, ptr addrspace(1) %ptra, align 4
  %b = load <7 x i16>, ptr addrspace(1) %ptrb, align 4
  %add = add <7 x i16> %a, %b
  store <7 x i16> %add, ptr addrspace(1) %ptr2, align 4
  ret void
}

; <7 x i16> add on by-value arguments with the sum returned by value; the
; expected output contains no loads or stores.
; Expected fiji output: three low/high pairs (plain v_add_u16 plus an SDWA add
; on WORD_1, merged with v_or_b32) and one plain v_add_u16 for element 6.
; Expected gfx900 output: four v_pk_add_u16.
define <7 x i16> @add_v7i16_arg(<7 x i16> %a, <7 x i16> %b) {
; GFX8-LABEL: add_v7i16_arg:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_add_u16_e32 v8, v0, v4
; GFX8-NEXT:    v_add_u16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT:    v_add_u16_e32 v4, v1, v5
; GFX8-NEXT:    v_add_u16_sdwa v1, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT:    v_or_b32_e32 v1, v4, v1
; GFX8-NEXT:    v_add_u16_e32 v4, v2, v6
; GFX8-NEXT:    v_add_u16_sdwa v2, v2, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT:    v_or_b32_e32 v0, v8, v0
; GFX8-NEXT:    v_or_b32_e32 v2, v4, v2
; GFX8-NEXT:    v_add_u16_e32 v3, v3, v7
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: add_v7i16_arg:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_pk_add_u16 v0, v0, v4
; GFX9-NEXT:    v_pk_add_u16 v1, v1, v5
; GFX9-NEXT:    v_pk_add_u16 v2, v2, v6
; GFX9-NEXT:    v_pk_add_u16 v3, v3, v7
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %add = add <7 x i16> %a, %b
  ret <7 x i16> %add
}

; <9 x i16> add where both operands are loaded from, and the sum is stored to,
; addrspace(1) pointers with align 4.
; Both expected outputs split each access into a dwordx4 part (elements 0-7)
; and a 16-bit part at byte offset 16 (element 8).
; Expected fiji output: eight 16-bit adds (the high halves via SDWA) merged
; with four v_or_b32, plus one v_add_u16 for element 8.
; Expected gfx900 output: five v_pk_add_u16.
define void @add_v9i16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb, ptr addrspace(1) %ptr2) {
; GFX8-LABEL: add_v9i16:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    flat_load_dwordx4 v[6:9], v[0:1]
; GFX8-NEXT:    flat_load_dwordx4 v[10:13], v[2:3]
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    flat_load_ushort v16, v[0:1]
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 16, v2
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v3, vcc
; GFX8-NEXT:    flat_load_ushort v0, v[0:1]
; GFX8-NEXT:    v_add_u32_e32 v14, vcc, 16, v4
; GFX8-NEXT:    v_addc_u32_e32 v15, vcc, 0, v5, vcc
; GFX8-NEXT:    s_waitcnt vmcnt(2)
; GFX8-NEXT:    v_add_u16_e32 v1, v6, v10
; GFX8-NEXT:    v_add_u16_sdwa v2, v6, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT:    v_add_u16_e32 v3, v7, v11
; GFX8-NEXT:    v_add_u16_sdwa v6, v7, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT:    v_add_u16_e32 v7, v8, v12
; GFX8-NEXT:    v_add_u16_sdwa v8, v8, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT:    v_add_u16_e32 v10, v9, v13
; GFX8-NEXT:    v_add_u16_sdwa v9, v9, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_add_u16_e32 v11, v16, v0
; GFX8-NEXT:    v_or_b32_e32 v0, v1, v2
; GFX8-NEXT:    v_or_b32_e32 v1, v3, v6
; GFX8-NEXT:    v_or_b32_e32 v2, v7, v8
; GFX8-NEXT:    v_or_b32_e32 v3, v10, v9
; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT:    flat_store_short v[14:15], v11
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: add_v9i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dwordx4 v[6:9], v[2:3], off
; GFX9-NEXT:    global_load_dwordx4 v[10:13], v[0:1], off
; GFX9-NEXT:    global_load_ushort v14, v[0:1], off offset:16
; GFX9-NEXT:    global_load_ushort v15, v[2:3], off offset:16
; GFX9-NEXT:    s_waitcnt vmcnt(2)
; GFX9-NEXT:    v_pk_add_u16 v0, v10, v6
; GFX9-NEXT:    v_pk_add_u16 v1, v11, v7
; GFX9-NEXT:    v_pk_add_u16 v2, v12, v8
; GFX9-NEXT:    v_pk_add_u16 v3, v13, v9
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_pk_add_u16 v6, v14, v15
; GFX9-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off
; GFX9-NEXT:    global_store_short v[4:5], v6, off offset:16
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %a = load <9 x i16>, ptr addrspace(1) %ptra, align 4
  %b = load <9 x i16>, ptr addrspace(1) %ptrb, align 4
  %add = add <9 x i16> %a, %b
  store <9 x i16> %add, ptr addrspace(1) %ptr2, align 4
  ret void
}

; <9 x i16> add on by-value arguments with the sum returned by value; the
; expected output contains no loads or stores.
; Expected fiji output: four low/high pairs (plain v_add_u16 plus an SDWA add
; on WORD_1, merged with v_or_b32) and one plain v_add_u16 for element 8.
; Expected gfx900 output: five v_pk_add_u16.
define <9 x i16> @add_v9i16_arg(<9 x i16> %a, <9 x i16> %b) {
; GFX8-LABEL: add_v9i16_arg:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_add_u16_e32 v10, v0, v5
; GFX8-NEXT:    v_add_u16_sdwa v0, v0, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT:    v_add_u16_e32 v5, v1, v6
; GFX8-NEXT:    v_add_u16_sdwa v1, v1, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT:    v_or_b32_e32 v1, v5, v1
; GFX8-NEXT:    v_add_u16_e32 v5, v2, v7
; GFX8-NEXT:    v_add_u16_sdwa v2, v2, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT:    v_or_b32_e32 v2, v5, v2
; GFX8-NEXT:    v_add_u16_e32 v5, v3, v8
; GFX8-NEXT:    v_add_u16_sdwa v3, v3, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT:    v_or_b32_e32 v0, v10, v0
; GFX8-NEXT:    v_or_b32_e32 v3, v5, v3
; GFX8-NEXT:    v_add_u16_e32 v4, v4, v9
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: add_v9i16_arg:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_pk_add_u16 v0, v0, v5
; GFX9-NEXT:    v_pk_add_u16 v1, v1, v6
; GFX9-NEXT:    v_pk_add_u16 v2, v2, v7
; GFX9-NEXT:    v_pk_add_u16 v3, v3, v8
; GFX9-NEXT:    v_pk_add_u16 v4, v4, v9
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %add = add <9 x i16> %a, %b
  ret <9 x i16> %add
}

; <10 x i16> add where both operands are loaded from, and the sum is stored
; to, addrspace(1) pointers with align 4.
; Both expected outputs split each access into a dwordx4 part (elements 0-7)
; and a dword part at byte offset 16 (elements 8-9).
; Expected fiji output: ten 16-bit adds (the high halves via SDWA) merged with
; five v_or_b32.
; Expected gfx900 output: five v_pk_add_u16.
define void @add_v10i16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb, ptr addrspace(1) %ptr2) {
; GFX8-LABEL: add_v10i16:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    flat_load_dwordx4 v[6:9], v[0:1]
; GFX8-NEXT:    flat_load_dwordx4 v[10:13], v[2:3]
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    flat_load_dword v14, v[0:1]
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 16, v2
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v3, vcc
; GFX8-NEXT:    flat_load_dword v15, v[0:1]
; GFX8-NEXT:    s_waitcnt vmcnt(2)
; GFX8-NEXT:    v_add_u16_e32 v0, v6, v10
; GFX8-NEXT:    v_add_u16_sdwa v1, v6, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT:    v_add_u16_e32 v2, v7, v11
; GFX8-NEXT:    v_add_u16_sdwa v3, v7, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT:    v_add_u16_e32 v6, v8, v12
; GFX8-NEXT:    v_add_u16_sdwa v7, v8, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT:    v_add_u16_e32 v8, v9, v13
; GFX8-NEXT:    v_add_u16_sdwa v9, v9, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
; GFX8-NEXT:    v_or_b32_e32 v1, v2, v3
; GFX8-NEXT:    v_or_b32_e32 v2, v6, v7
; GFX8-NEXT:    v_or_b32_e32 v3, v8, v9
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_add_u16_e32 v6, v14, v15
; GFX8-NEXT:    v_add_u16_sdwa v7, v14, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT:    v_or_b32_e32 v6, v6, v7
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 16, v4
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v5, vcc
; GFX8-NEXT:    flat_store_dword v[0:1], v6
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: add_v10i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dwordx4 v[6:9], v[2:3], off
; GFX9-NEXT:    global_load_dwordx4 v[10:13], v[0:1], off
; GFX9-NEXT:    global_load_dword v14, v[0:1], off offset:16
; GFX9-NEXT:    global_load_dword v15, v[2:3], off offset:16
; GFX9-NEXT:    s_waitcnt vmcnt(2)
; GFX9-NEXT:    v_pk_add_u16 v0, v10, v6
; GFX9-NEXT:    v_pk_add_u16 v1, v11, v7
; GFX9-NEXT:    v_pk_add_u16 v2, v12, v8
; GFX9-NEXT:    v_pk_add_u16 v3, v13, v9
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_pk_add_u16 v6, v14, v15
; GFX9-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off
; GFX9-NEXT:    global_store_dword v[4:5], v6, off offset:16
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %a = load <10 x i16>, ptr addrspace(1) %ptra, align 4
  %b = load <10 x i16>, ptr addrspace(1) %ptrb, align 4
  %add = add <10 x i16> %a, %b
  store <10 x i16> %add, ptr addrspace(1) %ptr2, align 4
  ret void
}

; <11 x i16> add where both operands are loaded from, and the sum is stored
; to, addrspace(1) pointers with align 4.
; Both expected outputs split each access into a dwordx4 part (elements 0-7)
; and three 16-bit parts at byte offsets 16, 18 and 20 (elements 8-10).
; Expected fiji output: eight 16-bit adds (the high halves via SDWA) merged
; with four v_or_b32, plus three v_add_u16 for the tail elements.
; Expected gfx900 output: four v_pk_add_u16 for elements 0-7; elements 8-9 are
; packed (v_and_b32 + v_lshl_or_b32) and added with one v_pk_add_u16, and
; element 10 gets its own v_pk_add_u16.
define void @add_v11i16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb, ptr addrspace(1) %ptr2) {
; GFX8-LABEL: add_v11i16:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_add_u32_e32 v10, vcc, 16, v0
; GFX8-NEXT:    v_addc_u32_e32 v11, vcc, 0, v1, vcc
; GFX8-NEXT:    v_add_u32_e32 v12, vcc, 18, v0
; GFX8-NEXT:    v_addc_u32_e32 v13, vcc, 0, v1, vcc
; GFX8-NEXT:    flat_load_dwordx4 v[6:9], v[0:1]
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 20, v0
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    flat_load_ushort v18, v[10:11]
; GFX8-NEXT:    flat_load_ushort v19, v[12:13]
; GFX8-NEXT:    flat_load_ushort v20, v[0:1]
; GFX8-NEXT:    flat_load_dwordx4 v[10:13], v[2:3]
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 16, v2
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v3, vcc
; GFX8-NEXT:    v_add_u32_e32 v14, vcc, 18, v2
; GFX8-NEXT:    v_addc_u32_e32 v15, vcc, 0, v3, vcc
; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 20, v2
; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
; GFX8-NEXT:    flat_load_ushort v0, v[0:1]
; GFX8-NEXT:    flat_load_ushort v1, v[14:15]
; GFX8-NEXT:    flat_load_ushort v2, v[2:3]
; GFX8-NEXT:    v_add_u32_e32 v14, vcc, 16, v4
; GFX8-NEXT:    v_addc_u32_e32 v15, vcc, 0, v5, vcc
; GFX8-NEXT:    v_add_u32_e32 v16, vcc, 18, v4
; GFX8-NEXT:    v_addc_u32_e32 v17, vcc, 0, v5, vcc
; GFX8-NEXT:    s_waitcnt vmcnt(3)
; GFX8-NEXT:    v_add_u16_e32 v3, v6, v10
; GFX8-NEXT:    v_add_u16_sdwa v10, v6, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT:    v_add_u16_e32 v21, v7, v11
; GFX8-NEXT:    v_add_u16_sdwa v11, v7, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT:    v_add_u16_e32 v22, v8, v12
; GFX8-NEXT:    v_add_u16_sdwa v8, v8, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT:    v_add_u16_e32 v12, v9, v13
; GFX8-NEXT:    v_add_u16_sdwa v9, v9, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT:    v_add_u32_e32 v6, vcc, 20, v4
; GFX8-NEXT:    v_addc_u32_e32 v7, vcc, 0, v5, vcc
; GFX8-NEXT:    s_waitcnt vmcnt(2)
; GFX8-NEXT:    v_add_u16_e32 v13, v18, v0
; GFX8-NEXT:    s_waitcnt vmcnt(1)
; GFX8-NEXT:    v_add_u16_e32 v18, v19, v1
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_add_u16_e32 v19, v20, v2
; GFX8-NEXT:    v_or_b32_e32 v0, v3, v10
; GFX8-NEXT:    v_or_b32_e32 v1, v21, v11
; GFX8-NEXT:    v_or_b32_e32 v2, v22, v8
; GFX8-NEXT:    v_or_b32_e32 v3, v12, v9
; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT:    flat_store_short v[14:15], v13
; GFX8-NEXT:    flat_store_short v[16:17], v18
; GFX8-NEXT:    flat_store_short v[6:7], v19
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: add_v11i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off
; GFX9-NEXT:    global_load_ushort v14, v[0:1], off offset:16
; GFX9-NEXT:    global_load_ushort v15, v[2:3], off offset:16
; GFX9-NEXT:    global_load_dwordx4 v[10:13], v[2:3], off
; GFX9-NEXT:    global_load_ushort v16, v[0:1], off offset:20
; GFX9-NEXT:    global_load_ushort v17, v[2:3], off offset:20
; GFX9-NEXT:    global_load_ushort v18, v[0:1], off offset:18
; GFX9-NEXT:    global_load_ushort v19, v[2:3], off offset:18
; GFX9-NEXT:    s_waitcnt vmcnt(6)
; GFX9-NEXT:    v_and_b32_e32 v14, 0xffff, v14
; GFX9-NEXT:    s_waitcnt vmcnt(5)
; GFX9-NEXT:    v_and_b32_e32 v15, 0xffff, v15
; GFX9-NEXT:    s_waitcnt vmcnt(4)
; GFX9-NEXT:    v_pk_add_u16 v0, v6, v10
; GFX9-NEXT:    v_pk_add_u16 v1, v7, v11
; GFX9-NEXT:    v_pk_add_u16 v2, v8, v12
; GFX9-NEXT:    v_pk_add_u16 v3, v9, v13
; GFX9-NEXT:    s_waitcnt vmcnt(1)
; GFX9-NEXT:    v_lshl_or_b32 v7, v18, 16, v14
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_lshl_or_b32 v8, v19, 16, v15
; GFX9-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off
; GFX9-NEXT:    v_pk_add_u16 v6, v16, v17
; GFX9-NEXT:    v_pk_add_u16 v0, v7, v8
; GFX9-NEXT:    global_store_short v[4:5], v0, off offset:16
; GFX9-NEXT:    global_store_short_d16_hi v[4:5], v0, off offset:18
; GFX9-NEXT:    global_store_short v[4:5], v6, off offset:20
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %a = load <11 x i16>, ptr addrspace(1) %ptra, align 4
  %b = load <11 x i16>, ptr addrspace(1) %ptrb, align 4
  %add = add <11 x i16> %a, %b
  store <11 x i16> %add, ptr addrspace(1) %ptr2, align 4
  ret void
}

; Element-wise add of two <11 x i16> values that are passed as arguments and
; returned by value (no memory traffic in the IR).
; Expected output for the fiji run: each of the first five register pairs is
; summed with a v_add_u16_e32 plus a WORD_1 v_add_u16_sdwa, and the two halves
; are merged with v_or_b32; the last pair (v5, v11) gets only a plain
; v_add_u16_e32 with no SDWA counterpart.
; Expected output for the gfx900 run: six v_pk_add_u16 instructions, one per
; register pair.
; The assertions below are autogenerated (see the note at the top of the
; file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
define <11 x i16> @add_v11i16_arg(<11 x i16> %a, <11 x i16> %b) {
; GFX8-LABEL: add_v11i16_arg:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_add_u16_e32 v12, v0, v6
; GFX8-NEXT:    v_add_u16_sdwa v0, v0, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT:    v_add_u16_e32 v6, v1, v7
; GFX8-NEXT:    v_add_u16_sdwa v1, v1, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT:    v_or_b32_e32 v1, v6, v1
; GFX8-NEXT:    v_add_u16_e32 v6, v2, v8
; GFX8-NEXT:    v_add_u16_sdwa v2, v2, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT:    v_or_b32_e32 v2, v6, v2
; GFX8-NEXT:    v_add_u16_e32 v6, v3, v9
; GFX8-NEXT:    v_add_u16_sdwa v3, v3, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT:    v_or_b32_e32 v3, v6, v3
; GFX8-NEXT:    v_add_u16_e32 v6, v4, v10
; GFX8-NEXT:    v_add_u16_sdwa v4, v4, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT:    v_or_b32_e32 v0, v12, v0
; GFX8-NEXT:    v_or_b32_e32 v4, v6, v4
; GFX8-NEXT:    v_add_u16_e32 v5, v5, v11
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: add_v11i16_arg:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_pk_add_u16 v0, v0, v6
; GFX9-NEXT:    v_pk_add_u16 v1, v1, v7
; GFX9-NEXT:    v_pk_add_u16 v2, v2, v8
; GFX9-NEXT:    v_pk_add_u16 v3, v3, v9
; GFX9-NEXT:    v_pk_add_u16 v4, v4, v10
; GFX9-NEXT:    v_pk_add_u16 v5, v5, v11
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %add = add <11 x i16> %a, %b
  ret <11 x i16> %add
}

; Loads two <12 x i16> vectors from the addrspace(1) pointers %ptra and %ptrb
; (both with align 4), adds them element-wise, and stores the result through
; %ptr2.
; Expected output for the fiji run: each operand is fetched with a
; flat_load_dwordx4 plus a flat_load_dwordx2 whose address is formed by an
; explicit v_add_u32/v_addc_u32 of 16; the halves of every dword are summed
; separately (v_add_u16_e32 and a WORD_1 v_add_u16_sdwa) and merged with
; v_or_b32 before the dwordx4/dwordx2 stores.
; Expected output for the gfx900 run: global loads and stores that use an
; immediate offset of 16 for the trailing two dwords, and six v_pk_add_u16
; instructions.
; The assertions below are autogenerated (see the note at the top of the
; file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
define void @add_v12i16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb, ptr addrspace(1) %ptr2) {
; GFX8-LABEL: add_v12i16:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    flat_load_dwordx4 v[6:9], v[0:1]
; GFX8-NEXT:    flat_load_dwordx4 v[10:13], v[2:3]
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    flat_load_dwordx2 v[14:15], v[0:1]
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 16, v2
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v3, vcc
; GFX8-NEXT:    flat_load_dwordx2 v[16:17], v[0:1]
; GFX8-NEXT:    s_waitcnt vmcnt(2)
; GFX8-NEXT:    v_add_u16_e32 v0, v6, v10
; GFX8-NEXT:    v_add_u16_sdwa v1, v6, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT:    v_add_u16_e32 v2, v7, v11
; GFX8-NEXT:    v_add_u16_sdwa v3, v7, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT:    v_add_u16_e32 v6, v8, v12
; GFX8-NEXT:    v_add_u16_sdwa v7, v8, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT:    v_add_u16_e32 v8, v9, v13
; GFX8-NEXT:    v_add_u16_sdwa v9, v9, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
; GFX8-NEXT:    v_or_b32_e32 v1, v2, v3
; GFX8-NEXT:    v_or_b32_e32 v2, v6, v7
; GFX8-NEXT:    v_or_b32_e32 v3, v8, v9
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_add_u16_e32 v6, v14, v16
; GFX8-NEXT:    v_add_u16_sdwa v7, v14, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT:    v_add_u16_e32 v8, v15, v17
; GFX8-NEXT:    v_add_u16_sdwa v9, v15, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT:    v_or_b32_e32 v6, v6, v7
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 16, v4
; GFX8-NEXT:    v_or_b32_e32 v7, v8, v9
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v5, vcc
; GFX8-NEXT:    flat_store_dwordx2 v[0:1], v[6:7]
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: add_v12i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dwordx4 v[6:9], v[2:3], off
; GFX9-NEXT:    global_load_dwordx4 v[10:13], v[0:1], off
; GFX9-NEXT:    global_load_dwordx2 v[14:15], v[0:1], off offset:16
; GFX9-NEXT:    global_load_dwordx2 v[16:17], v[2:3], off offset:16
; GFX9-NEXT:    s_waitcnt vmcnt(2)
; GFX9-NEXT:    v_pk_add_u16 v0, v10, v6
; GFX9-NEXT:    v_pk_add_u16 v1, v11, v7
; GFX9-NEXT:    v_pk_add_u16 v2, v12, v8
; GFX9-NEXT:    v_pk_add_u16 v3, v13, v9
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_pk_add_u16 v6, v14, v16
; GFX9-NEXT:    v_pk_add_u16 v7, v15, v17
; GFX9-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off
; GFX9-NEXT:    global_store_dwordx2 v[4:5], v[6:7], off offset:16
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %a = load <12 x i16>, ptr addrspace(1) %ptra, align 4
  %b = load <12 x i16>, ptr addrspace(1) %ptrb, align 4
  %add = add <12 x i16> %a, %b
  store <12 x i16> %add, ptr addrspace(1) %ptr2, align 4
  ret void
}

; Element-wise add of two <12 x i16> values that are passed as arguments and
; returned by value (no memory traffic in the IR).
; Expected output for the fiji run: all six register pairs are summed with a
; v_add_u16_e32 plus a WORD_1 v_add_u16_sdwa, and the two halves are merged
; with v_or_b32.
; Expected output for the gfx900 run: six v_pk_add_u16 instructions, one per
; register pair.
; The assertions below are autogenerated (see the note at the top of the
; file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
define <12 x i16> @add_v12i16_arg(<12 x i16> %a, <12 x i16> %b) {
; GFX8-LABEL: add_v12i16_arg:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_add_u16_e32 v12, v0, v6
; GFX8-NEXT:    v_add_u16_sdwa v0, v0, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT:    v_add_u16_e32 v6, v1, v7
; GFX8-NEXT:    v_add_u16_sdwa v1, v1, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT:    v_or_b32_e32 v1, v6, v1
; GFX8-NEXT:    v_add_u16_e32 v6, v2, v8
; GFX8-NEXT:    v_add_u16_sdwa v2, v2, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT:    v_or_b32_e32 v2, v6, v2
; GFX8-NEXT:    v_add_u16_e32 v6, v3, v9
; GFX8-NEXT:    v_add_u16_sdwa v3, v3, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT:    v_or_b32_e32 v3, v6, v3
; GFX8-NEXT:    v_add_u16_e32 v6, v4, v10
; GFX8-NEXT:    v_add_u16_sdwa v4, v4, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT:    v_or_b32_e32 v4, v6, v4
; GFX8-NEXT:    v_add_u16_e32 v6, v5, v11
; GFX8-NEXT:    v_add_u16_sdwa v5, v5, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT:    v_or_b32_e32 v0, v12, v0
; GFX8-NEXT:    v_or_b32_e32 v5, v6, v5
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: add_v12i16_arg:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_pk_add_u16 v0, v0, v6
; GFX9-NEXT:    v_pk_add_u16 v1, v1, v7
; GFX9-NEXT:    v_pk_add_u16 v2, v2, v8
; GFX9-NEXT:    v_pk_add_u16 v3, v3, v9
; GFX9-NEXT:    v_pk_add_u16 v4, v4, v10
; GFX9-NEXT:    v_pk_add_u16 v5, v5, v11
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %add = add <12 x i16> %a, %b
  ret <12 x i16> %add
}