llvm/test/CodeGen/Hexagon/autohvx/int-to-fp.ll

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=hexagon -hexagon-hvx-widen=32 < %s -verify-machineinstrs | FileCheck %s
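
; HVX has no single int-to-fp instruction for these type combinations, so the
; expected code below constructs each IEEE value manually: vabs takes the
; magnitude, vcl0 counts leading zeros to find the normalizing shift, vasl
; positions the significand, and the vlsr/vadd/vand sequence rounds it to
; nearest-even (e.g. for f16 a total right shift of 6 with increment #31 and
; guard mask #64; for f32 a shift of 9 with #255 and #512). The biased
; exponent is derived from the shift amount, and for signed inputs the sign
; bit (0x8000 for f16, 0x80000000 for f32) is merged back in through vmux.
; Inputs or results narrower than a 128-byte HVX register are widened first;
; -hexagon-hvx-widen=32 sets the widening threshold to 32 bytes.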

target datalayout = "e-m:e-p:32:32:32-a:0-n16:32-i64:64:64-i32:32:32-i16:16:16-i1:8:8-f32:32:32-f64:64:64-v32:32:32-v64:64:64-v512:512:512-v1024:1024:1024-v2048:2048:2048"
target triple = "hexagon"

; s8 -> f16
; No widening
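; <128 x i8> fills a native 128-byte HVX register and the <128 x half> result
; occupies a register pair, so the conversion runs on full-size vectors only.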
define void @s8f16_0(ptr %a0, ptr %a1) #0 {
; CHECK-LABEL: s8f16_0:
; CHECK:         .cfi_startproc
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    {
; CHECK-NEXT:     v0 = vmem(r0+#0)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     r7 = #1
; CHECK-NEXT:     r6 = #64
; CHECK-NEXT:     v1:0.h = vunpack(v0.b)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v2.h = vsplat(r7)
; CHECK-NEXT:     r3:2 = combine(#31,#5)
; CHECK-NEXT:     v3.h = vabs(v0.h)
; CHECK-NEXT:     v4.h = vabs(v1.h)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v8.h = vsplat(r6)
; CHECK-NEXT:     v7.h = vsplat(r3)
; CHECK-NEXT:     v9 = vxor(v9,v9)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     r5 = ##32768
; CHECK-NEXT:     v5.uh = vcl0(v3.uh)
; CHECK-NEXT:     q0 = vcmp.gt(v9.h,v0.h)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v10.h = vsplat(r5)
; CHECK-NEXT:     r4 = #10
; CHECK-NEXT:     v6.uh = vcl0(v4.uh)
; CHECK-NEXT:     v5.h = vadd(v5.h,v2.h)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v27 = vmux(q0,v10,v9)
; CHECK-NEXT:     v6.h = vadd(v6.h,v2.h)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v3.h = vasl(v3.h,v5.h)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v4.h = vasl(v4.h,v6.h)
; CHECK-NEXT:     v13 = vand(v3,v8)
; CHECK-NEXT:     v11.h = vadd(v3.h,v7.h)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v14.h = vadd(v4.h,v7.h)
; CHECK-NEXT:     q2 = vcmp.eq(v13.h,v9.h)
; CHECK-NEXT:     v8 = vand(v4,v8)
; CHECK-NEXT:     q1 = vcmp.gt(v3.uh,v11.uh)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v11.uh = vlsr(v11.uh,r2)
; CHECK-NEXT:     v13 = vmux(q2,v9,v2)
; CHECK-NEXT:     q2 = vcmp.eq(v8.h,v9.h)
; CHECK-NEXT:     q3 = vcmp.gt(v4.uh,v14.uh)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v20.uh = vlsr(v14.uh,r2)
; CHECK-NEXT:     v22 = vmux(q2,v9,v2)
; CHECK-NEXT:     v21 = vmux(q1,v2,v9)
; CHECK-NEXT:     v2 = vmux(q3,v2,v9)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v19.uh = vlsr(v4.uh,r2)
; CHECK-NEXT:     v13.h = vadd(v11.h,v13.h)
; CHECK-NEXT:     v24.h = vadd(v20.h,v22.h)
; CHECK-NEXT:     v2.h = vadd(v2.h,v7.h)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v12.uh = vlsr(v3.uh,r2)
; CHECK-NEXT:     v23.h = vadd(v21.h,v7.h)
; CHECK-NEXT:     v2.h = vsub(v2.h,v6.h)
; CHECK-NEXT:     q3 = vcmp.gt(v9.h,v1.h)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v11.uh = vlsr(v11.uh,r7)
; CHECK-NEXT:     v3.h = vsub(v23.h,v5.h)
; CHECK-NEXT:     q2 = vcmp.eq(v12.h,v11.h)
; CHECK-NEXT:     q1 = vcmp.eq(v19.h,v20.h)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v25.uh = vlsr(v13.uh,r7)
; CHECK-NEXT:     v28 = vmux(q3,v10,v9)
; CHECK-NEXT:     q3 = vcmp.eq(v0.h,v9.h)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v26.uh = vlsr(v24.uh,r7)
; CHECK-NEXT:     v5 = vmux(q2,v25,v11)
; CHECK-NEXT:     q2 = vcmp.eq(v1.h,v9.h)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v4.uh = vlsr(v20.uh,r7)
; CHECK-NEXT:     v5 = vor(v27,v5)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v3.h = vasl(v3.h,r4)
; CHECK-NEXT:     v4 = vmux(q1,v26,v4)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v2.h = vasl(v2.h,r4)
; CHECK-NEXT:     v4 = vor(v28,v4)
; CHECK-NEXT:     v29 = vor(v5,v3)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v2 = vor(v4,v2)
; CHECK-NEXT:     v31 = vmux(q3,v9,v29)
; CHECK-NEXT:     vmem(r1+#0) = v31.new
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v30 = vmux(q2,v9,v2)
; CHECK-NEXT:     jumpr r31
; CHECK-NEXT:     vmem(r1+#1) = v30.new
; CHECK-NEXT:    }
  %v0 = load <128 x i8>, ptr %a0, align 128
  %v1 = sitofp <128 x i8> %v0 to <128 x half>
  store <128 x half> %v1, ptr %a1, align 128
  ret void
}

; Widen input
define void @s8f16_1(ptr %a0, ptr %a1) #0 {
; CHECK-LABEL: s8f16_1:
; CHECK:         .cfi_startproc
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    {
; CHECK-NEXT:     v0 = vmem(r0+#0)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     r6 = #1
; CHECK-NEXT:     r3:2 = combine(#64,#31)
; CHECK-NEXT:     v1:0.h = vunpack(v0.b)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v3.h = vsplat(r6)
; CHECK-NEXT:     v4.h = vsplat(r2)
; CHECK-NEXT:     v2.h = vabs(v0.h)
; CHECK-NEXT:     v1 = vxor(v1,v1)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v6.h = vsplat(r3)
; CHECK-NEXT:     r5:4 = combine(##32768,#5)
; CHECK-NEXT:     r2 = #10
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v8.h = vsplat(r5)
; CHECK-NEXT:     v5.uh = vcl0(v2.uh)
; CHECK-NEXT:     q3 = vcmp.eq(v0.h,v1.h)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v5.h = vadd(v5.h,v3.h)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v2.h = vasl(v2.h,v5.h)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v7.h = vadd(v2.h,v4.h)
; CHECK-NEXT:     v6 = vand(v2,v6)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v2.uh = vlsr(v2.uh,r4)
; CHECK-NEXT:     q0 = vcmp.eq(v6.h,v1.h)
; CHECK-NEXT:     q1 = vcmp.gt(v2.uh,v7.uh)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v25.uh = vlsr(v7.uh,r4)
; CHECK-NEXT:     v26 = vmux(q0,v1,v3)
; CHECK-NEXT:     v3 = vmux(q1,v3,v1)
; CHECK-NEXT:     q1 = vcmp.gt(v1.h,v0.h)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v7.h = vadd(v25.h,v26.h)
; CHECK-NEXT:     v3.h = vadd(v3.h,v4.h)
; CHECK-NEXT:     q2 = vcmp.eq(v2.h,v25.h)
; CHECK-NEXT:     v30 = vmux(q1,v8,v1)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v27.uh = vlsr(v25.uh,r6)
; CHECK-NEXT:     v28.h = vsub(v3.h,v5.h)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v29.uh = vlsr(v7.uh,r6)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v2.h = vasl(v28.h,r2)
; CHECK-NEXT:     v3 = vmux(q2,v29,v27)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v3 = vor(v30,v3)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v31 = vor(v3,v2)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v0 = vmux(q3,v1,v31)
; CHECK-NEXT:     jumpr r31
; CHECK-NEXT:     vmem(r1+#0) = v0.new
; CHECK-NEXT:    }
  %v0 = load <64 x i8>, ptr %a0, align 128
  %v1 = sitofp <64 x i8> %v0 to <64 x half>
  store <64 x half> %v1, ptr %a1, align 128
  ret void
}


; s8 -> f32
; No widening
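; <128 x i8> converts element-wise to <128 x float>: one input register fans
; out to four result registers (vmem(r1+#0) through vmem(r1+#3)).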
define void @s8f32_0(ptr %a0, ptr %a1) #0 {
; CHECK-LABEL: s8f32_0:
; CHECK:         .cfi_startproc
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    {
; CHECK-NEXT:     r7 = #64
; CHECK-NEXT:     r0 = #1
; CHECK-NEXT:     v0 = vmem(r0+#0)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v2 = vsplat(r0)
; CHECK-NEXT:     r3:2 = combine(##255,#8)
; CHECK-NEXT:     v1 = valign(v0,v0,r7)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v3 = vsplat(r3)
; CHECK-NEXT:     r7 = #512
; CHECK-NEXT:     v9:8.h = vunpack(v0.b)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v4 = vsplat(r7)
; CHECK-NEXT:     r6 = ##-2147483648
; CHECK-NEXT:     r5 = #159
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     r4 = #23
; CHECK-NEXT:     v7:6.h = vunpack(v1.b)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v8 = vsplat(r6)
; CHECK-NEXT:     v1:0.w = vunpack(v8.h)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v7:6.w = vunpack(v6.h)
; CHECK-NEXT:     v5.w = vabs(v0.w)
; CHECK-NEXT:     v10.w = vabs(v1.w)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v26.w = vabs(v6.w)
; CHECK-NEXT:     v13.w = vabs(v7.w)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v9.uw = vcl0(v5.uw)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v12.uw = vcl0(v26.uw)
; CHECK-NEXT:     v9.w = vadd(v9.w,v2.w)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v14.uw = vcl0(v13.uw)
; CHECK-NEXT:     v15.w = vadd(v12.w,v2.w)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v11.uw = vcl0(v10.uw)
; CHECK-NEXT:     v12.w = vadd(v14.w,v2.w)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v27.w = vasl(v26.w,v15.w)
; CHECK-NEXT:     v11.w = vadd(v11.w,v2.w)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v13.w = vasl(v13.w,v12.w)
; CHECK-NEXT:     v20 = vand(v27,v4)
; CHECK-NEXT:     v19.w = vadd(v27.w,v3.w)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v16.w = vasl(v5.w,v9.w)
; CHECK-NEXT:     v5 = vxor(v5,v5)
; CHECK-NEXT:     v23.w = vadd(v13.w,v3.w)
; CHECK-NEXT:     v28 = vand(v13,v4)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v17.w = vasl(v10.w,v11.w)
; CHECK-NEXT:     q3 = vcmp.eq(v20.w,v5.w)
; CHECK-NEXT:     q2 = vcmp.gt(v27.uw,v19.uw)
; CHECK-NEXT:     q0 = vcmp.gt(v5.w,v6.w)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v21.uw = vlsr(v27.uw,r2)
; CHECK-NEXT:     v30 = vmux(q3,v5,v2)
; CHECK-NEXT:     q3 = vcmp.eq(v28.w,v5.w)
; CHECK-NEXT:     v22 = vand(v17,v4)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v14.uw = vlsr(v19.uw,r2)
; CHECK-NEXT:     v27 = vmux(q3,v5,v2)
; CHECK-NEXT:     q1 = vcmp.eq(v22.w,v5.w)
; CHECK-NEXT:     v24 = vmux(q2,v2,v5)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v31.uw = vlsr(v23.uw,r2)
; CHECK-NEXT:     v22.w = vadd(v14.w,v30.w)
; CHECK-NEXT:     v30.w = vadd(v17.w,v3.w)
; CHECK-NEXT:     q2 = vcmp.eq(v21.w,v14.w)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v29.uw = vlsr(v13.uw,r2)
; CHECK-NEXT:     v28.w = vadd(v31.w,v27.w)
; CHECK-NEXT:     v3.w = vadd(v16.w,v3.w)
; CHECK-NEXT:     v4 = vand(v16,v4)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v14.uw = vlsr(v14.uw,r0)
; CHECK-NEXT:     q3 = vcmp.eq(v29.w,v31.w)
; CHECK-NEXT:     v18 = vmux(q0,v8,v5)
; CHECK-NEXT:     q0 = vcmp.gt(v5.w,v7.w)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v19.uw = vlsr(v31.uw,r0)
; CHECK-NEXT:     v26 = vmux(q1,v5,v2)
; CHECK-NEXT:     v31 = vmux(q0,v8,v5)
; CHECK-NEXT:     q0 = vcmp.gt(v16.uw,v3.uw)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v10 = vsplat(r5)
; CHECK-NEXT:     v29.uw = vlsr(v22.uw,r0)
; CHECK-NEXT:     v15.w = vsub(v24.w,v15.w)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v20.uw = vlsr(v28.uw,r0)
; CHECK-NEXT:     v14 = vmux(q2,v29,v14)
; CHECK-NEXT:     q2 = vcmp.gt(v13.uw,v23.uw)
; CHECK-NEXT:     v15.w = vadd(v15.w,v10.w)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v25.uw = vlsr(v30.uw,r2)
; CHECK-NEXT:     v19 = vmux(q3,v20,v19)
; CHECK-NEXT:     q3 = vcmp.eq(v4.w,v5.w)
; CHECK-NEXT:     v27 = vmux(q2,v2,v5)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v3.uw = vlsr(v3.uw,r2)
; CHECK-NEXT:     q2 = vcmp.gt(v17.uw,v30.uw)
; CHECK-NEXT:     v28.w = vadd(v25.w,v26.w)
; CHECK-NEXT:     v29 = vmux(q3,v5,v2)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v17.uw = vlsr(v17.uw,r2)
; CHECK-NEXT:     v19 = vor(v31,v19)
; CHECK-NEXT:     v31 = vmux(q2,v2,v5)
; CHECK-NEXT:     v2 = vmux(q0,v2,v5)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v24.uw = vlsr(v16.uw,r2)
; CHECK-NEXT:     v30.w = vadd(v3.w,v29.w)
; CHECK-NEXT:     v2.w = vsub(v2.w,v9.w)
; CHECK-NEXT:     v11.w = vsub(v31.w,v11.w)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v16.uw = vlsr(v28.uw,r0)
; CHECK-NEXT:     q3 = vcmp.eq(v17.w,v25.w)
; CHECK-NEXT:     v4.w = vsub(v27.w,v12.w)
; CHECK-NEXT:     v2.w = vadd(v2.w,v10.w)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v13.uw = vlsr(v25.uw,r0)
; CHECK-NEXT:     q0 = vcmp.eq(v24.w,v3.w)
; CHECK-NEXT:     v21.w = vadd(v11.w,v10.w)
; CHECK-NEXT:     q2 = vcmp.gt(v5.w,v1.w)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v22.uw = vlsr(v30.uw,r0)
; CHECK-NEXT:     v23 = vmux(q3,v16,v13)
; CHECK-NEXT:     q3 = vcmp.gt(v5.w,v0.w)
; CHECK-NEXT:     v24 = vmux(q2,v8,v5)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v3.uw = vlsr(v3.uw,r0)
; CHECK-NEXT:     v4.w = vadd(v4.w,v10.w)
; CHECK-NEXT:     v8 = vmux(q3,v8,v5)
; CHECK-NEXT:     v10 = vor(v24,v23)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v9.w = vasl(v21.w,r4)
; CHECK-NEXT:     v3 = vmux(q0,v22,v3)
; CHECK-NEXT:     v14 = vor(v18,v14)
; CHECK-NEXT:     q2 = vcmp.eq(v1.w,v5.w)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v2.w = vasl(v2.w,r4)
; CHECK-NEXT:     v3 = vor(v8,v3)
; CHECK-NEXT:     v25 = vor(v10,v9)
; CHECK-NEXT:     q3 = vcmp.eq(v0.w,v5.w)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v15.w = vasl(v15.w,r4)
; CHECK-NEXT:     v2 = vor(v3,v2)
; CHECK-NEXT:     v27 = vmux(q2,v5,v25)
; CHECK-NEXT:     vmem(r1+#1) = v27.new
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v26.w = vasl(v4.w,r4)
; CHECK-NEXT:     v29 = vmux(q3,v5,v2)
; CHECK-NEXT:     q2 = vcmp.eq(v7.w,v5.w)
; CHECK-NEXT:     vmem(r1+#0) = v29.new
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v28 = vor(v19,v26)
; CHECK-NEXT:     v30 = vor(v14,v15)
; CHECK-NEXT:     q3 = vcmp.eq(v6.w,v5.w)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v0 = vmux(q2,v5,v28)
; CHECK-NEXT:     v31 = vmux(q3,v5,v30)
; CHECK-NEXT:     vmem(r1+#3) = v0.new
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     jumpr r31
; CHECK-NEXT:     vmem(r1+#2) = v31
; CHECK-NEXT:    }
  %v0 = load <128 x i8>, ptr %a0, align 128
  %v1 = sitofp <128 x i8> %v0 to <128 x float>
  store <128 x float> %v1, ptr %a1, align 128
  ret void
}

; Widen input #1
define void @s8f32_1(ptr %a0, ptr %a1) #0 {
; CHECK-LABEL: s8f32_1:
; CHECK:         .cfi_startproc
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    {
; CHECK-NEXT:     r0 = #1
; CHECK-NEXT:     v3:2.h = vunpack(v0.b)
; CHECK-NEXT:     v0.cur = vmem(r0+#0)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v1 = vsplat(r0)
; CHECK-NEXT:     r3:2 = combine(##255,#8)
; CHECK-NEXT:     r6 = #512
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v7 = vsplat(r3)
; CHECK-NEXT:     v3:2.w = vunpack(v2.h)
; CHECK-NEXT:     v22 = vxor(v22,v22)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v10 = vsplat(r6)
; CHECK-NEXT:     r7 = ##-2147483648
; CHECK-NEXT:     r5 = #159
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v9 = vsplat(r7)
; CHECK-NEXT:     v4.w = vabs(v2.w)
; CHECK-NEXT:     v5.w = vabs(v3.w)
; CHECK-NEXT:     q0 = vcmp.gt(v22.w,v2.w)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v12 = vsplat(r5)
; CHECK-NEXT:     r4 = #23
; CHECK-NEXT:     v11 = vmux(q0,v9,v22)
; CHECK-NEXT:     q0 = vcmp.gt(v22.w,v3.w)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v6.uw = vcl0(v4.uw)
; CHECK-NEXT:     v30 = vmux(q0,v9,v22)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v8.uw = vcl0(v5.uw)
; CHECK-NEXT:     v6.w = vadd(v6.w,v1.w)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v8.w = vadd(v8.w,v1.w)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v4.w = vasl(v4.w,v6.w)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v5.w = vasl(v5.w,v8.w)
; CHECK-NEXT:     v13 = vand(v4,v10)
; CHECK-NEXT:     v14.w = vadd(v4.w,v7.w)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v10 = vand(v5,v10)
; CHECK-NEXT:     v7.w = vadd(v5.w,v7.w)
; CHECK-NEXT:     q2 = vcmp.gt(v4.uw,v14.uw)
; CHECK-NEXT:     q1 = vcmp.eq(v13.w,v22.w)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v14.uw = vlsr(v14.uw,r2)
; CHECK-NEXT:     q3 = vcmp.eq(v10.w,v22.w)
; CHECK-NEXT:     v25 = vmux(q2,v1,v22)
; CHECK-NEXT:     q2 = vcmp.gt(v5.uw,v7.uw)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v7.uw = vlsr(v7.uw,r2)
; CHECK-NEXT:     v26 = vmux(q1,v22,v1)
; CHECK-NEXT:     v27 = vmux(q3,v22,v1)
; CHECK-NEXT:     v1 = vmux(q2,v1,v22)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v24.uw = vlsr(v5.uw,r2)
; CHECK-NEXT:     v5.w = vadd(v14.w,v26.w)
; CHECK-NEXT:     v29.w = vadd(v7.w,v27.w)
; CHECK-NEXT:     v6.w = vsub(v25.w,v6.w)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v23.uw = vlsr(v4.uw,r2)
; CHECK-NEXT:     v1.w = vsub(v1.w,v8.w)
; CHECK-NEXT:     v6.w = vadd(v6.w,v12.w)
; CHECK-NEXT:     q3 = vcmp.eq(v24.w,v7.w)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v28.uw = vlsr(v14.uw,r0)
; CHECK-NEXT:     v1.w = vadd(v1.w,v12.w)
; CHECK-NEXT:     q1 = vcmp.eq(v23.w,v14.w)
; CHECK-NEXT:     q2 = vcmp.eq(v3.w,v22.w)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v5.uw = vlsr(v5.uw,r0)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v7.uw = vlsr(v7.uw,r0)
; CHECK-NEXT:     v5 = vmux(q1,v5,v28)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v4.uw = vlsr(v29.uw,r0)
; CHECK-NEXT:     v5 = vor(v11,v5)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v6.w = vasl(v6.w,r4)
; CHECK-NEXT:     v4 = vmux(q3,v4,v7)
; CHECK-NEXT:     q3 = vcmp.eq(v2.w,v22.w)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v1.w = vasl(v1.w,r4)
; CHECK-NEXT:     v4 = vor(v30,v4)
; CHECK-NEXT:     v31 = vor(v5,v6)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v1 = vor(v4,v1)
; CHECK-NEXT:     v0 = vmux(q3,v22,v31)
; CHECK-NEXT:     vmem(r1+#0) = v0.new
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v1 = vmux(q2,v22,v1)
; CHECK-NEXT:     jumpr r31
; CHECK-NEXT:     vmem(r1+#1) = v1.new
; CHECK-NEXT:    }
  %v0 = load <64 x i8>, ptr %a0, align 128
  %v1 = sitofp <64 x i8> %v0 to <64 x float>
  store <64 x float> %v1, ptr %a1, align 128
  ret void
}

; Widen input #2
define void @s8f32_2(ptr %a0, ptr %a1) #0 {
; CHECK-LABEL: s8f32_2:
; CHECK:         .cfi_startproc
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    {
; CHECK-NEXT:     v0 = vmem(r0+#0)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     r0 = #1
; CHECK-NEXT:     r3 = #512
; CHECK-NEXT:     v1:0.h = vunpack(v0.b)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v2 = vsplat(r0)
; CHECK-NEXT:     v4 = vsplat(r3)
; CHECK-NEXT:     r2 = #255
; CHECK-NEXT:     v3 = vxor(v3,v3)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     r7:6 = combine(##-2147483648,#8)
; CHECK-NEXT:     r4 = #159
; CHECK-NEXT:     v1:0.w = vunpack(v0.h)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v1 = vsplat(r2)
; CHECK-NEXT:     v8 = vsplat(r4)
; CHECK-NEXT:     v5.w = vabs(v0.w)
; CHECK-NEXT:     q2 = vcmp.gt(v3.w,v0.w)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v7 = vsplat(r7)
; CHECK-NEXT:     r2 = #23
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v6.uw = vcl0(v5.uw)
; CHECK-NEXT:     v30 = vmux(q2,v7,v3)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v6.w = vadd(v6.w,v2.w)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v5.w = vasl(v5.w,v6.w)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v1.w = vadd(v5.w,v1.w)
; CHECK-NEXT:     v4 = vand(v5,v4)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v5.uw = vlsr(v5.uw,r6)
; CHECK-NEXT:     q0 = vcmp.eq(v4.w,v3.w)
; CHECK-NEXT:     q1 = vcmp.gt(v5.uw,v1.uw)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v1.uw = vlsr(v1.uw,r6)
; CHECK-NEXT:     v4 = vmux(q0,v3,v2)
; CHECK-NEXT:     v2 = vmux(q1,v2,v3)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v4.w = vadd(v1.w,v4.w)
; CHECK-NEXT:     v2.w = vsub(v2.w,v6.w)
; CHECK-NEXT:     q3 = vcmp.eq(v5.w,v1.w)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v28.uw = vlsr(v1.uw,r0)
; CHECK-NEXT:     v2.w = vadd(v2.w,v8.w)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v29.uw = vlsr(v4.uw,r0)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v2.w = vasl(v2.w,r2)
; CHECK-NEXT:     v1 = vmux(q3,v29,v28)
; CHECK-NEXT:     q3 = vcmp.eq(v0.w,v3.w)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v1 = vor(v30,v1)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v31 = vor(v1,v2)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v0 = vmux(q3,v3,v31)
; CHECK-NEXT:     jumpr r31
; CHECK-NEXT:     vmem(r1+#0) = v0.new
; CHECK-NEXT:    }
  %v0 = load <32 x i8>, ptr %a0, align 128
  %v1 = sitofp <32 x i8> %v0 to <32 x float>
  store <32 x float> %v1, ptr %a1, align 128
  ret void
}


; s16 -> f16
; No widening
define void @s16f16_0(ptr %a0, ptr %a1) #0 {
; CHECK-LABEL: s16f16_0:
; CHECK:         .cfi_startproc
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    {
; CHECK-NEXT:     r6 = #1
; CHECK-NEXT:     r3:2 = combine(#64,#31)
; CHECK-NEXT:     v1.h = vabs(v0.h)
; CHECK-NEXT:     v0.cur = vmem(r0+#0)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v3.h = vsplat(r6)
; CHECK-NEXT:     v5.h = vsplat(r2)
; CHECK-NEXT:     v2 = vxor(v2,v2)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v6.h = vsplat(r3)
; CHECK-NEXT:     r5:4 = combine(##32768,#5)
; CHECK-NEXT:     v4.uh = vcl0(v1.uh)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v8.h = vsplat(r5)
; CHECK-NEXT:     r2 = #10
; CHECK-NEXT:     v4.h = vadd(v4.h,v3.h)
; CHECK-NEXT:     q3 = vcmp.eq(v0.h,v2.h)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v1.h = vasl(v1.h,v4.h)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v7.h = vadd(v1.h,v5.h)
; CHECK-NEXT:     v6 = vand(v1,v6)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v1.uh = vlsr(v1.uh,r4)
; CHECK-NEXT:     q0 = vcmp.eq(v6.h,v2.h)
; CHECK-NEXT:     q1 = vcmp.gt(v1.uh,v7.uh)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v25.uh = vlsr(v7.uh,r4)
; CHECK-NEXT:     v26 = vmux(q0,v2,v3)
; CHECK-NEXT:     v3 = vmux(q1,v3,v2)
; CHECK-NEXT:     q1 = vcmp.gt(v2.h,v0.h)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v7.h = vadd(v25.h,v26.h)
; CHECK-NEXT:     v3.h = vadd(v3.h,v5.h)
; CHECK-NEXT:     q2 = vcmp.eq(v1.h,v25.h)
; CHECK-NEXT:     v30 = vmux(q1,v8,v2)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v27.uh = vlsr(v25.uh,r6)
; CHECK-NEXT:     v28.h = vsub(v3.h,v4.h)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v29.uh = vlsr(v7.uh,r6)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v1.h = vasl(v28.h,r2)
; CHECK-NEXT:     v3 = vmux(q2,v29,v27)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v3 = vor(v30,v3)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v31 = vor(v3,v1)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v0 = vmux(q3,v2,v31)
; CHECK-NEXT:     jumpr r31
; CHECK-NEXT:     vmem(r1+#0) = v0.new
; CHECK-NEXT:    }
  %v0 = load <64 x i16>, ptr %a0, align 128
  %v1 = sitofp <64 x i16> %v0 to <64 x half>
  store <64 x half> %v1, ptr %a1, align 128
  ret void
}

; Widen input and result
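; <32 x i16> -> <32 x half> uses only half of a 128-byte register, so the
; result is written with a predicated store: q3 = vsetq(#64) limits the final
; vmem to the low 64 bytes.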
define void @s16f16_1(ptr %a0, ptr %a1) #0 {
; CHECK-LABEL: s16f16_1:
; CHECK:         .cfi_startproc
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    {
; CHECK-NEXT:     r3:2 = combine(#31,#1)
; CHECK-NEXT:     r7 = #64
; CHECK-NEXT:     v1.h = vabs(v0.h)
; CHECK-NEXT:     v0.cur = vmem(r0+#0)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v2.h = vsplat(r2)
; CHECK-NEXT:     v5.h = vsplat(r3)
; CHECK-NEXT:     r6 = #5
; CHECK-NEXT:     v3 = vxor(v3,v3)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v6.h = vsplat(r7)
; CHECK-NEXT:     r4 = ##32768
; CHECK-NEXT:     v4.uh = vcl0(v1.uh)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v8.h = vsplat(r4)
; CHECK-NEXT:     r4 = #10
; CHECK-NEXT:     q2 = vcmp.gt(v3.h,v0.h)
; CHECK-NEXT:     v4.h = vadd(v4.h,v2.h)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v30 = vmux(q2,v8,v3)
; CHECK-NEXT:     q2 = vcmp.eq(v0.h,v3.h)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v1.h = vasl(v1.h,v4.h)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v7.h = vadd(v1.h,v5.h)
; CHECK-NEXT:     v6 = vand(v1,v6)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v1.uh = vlsr(v1.uh,r6)
; CHECK-NEXT:     q1 = vcmp.eq(v6.h,v3.h)
; CHECK-NEXT:     q0 = vcmp.gt(v1.uh,v7.uh)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v25.uh = vlsr(v7.uh,r6)
; CHECK-NEXT:     v26 = vmux(q1,v3,v2)
; CHECK-NEXT:     v2 = vmux(q0,v2,v3)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v7.h = vadd(v25.h,v26.h)
; CHECK-NEXT:     v2.h = vadd(v2.h,v5.h)
; CHECK-NEXT:     q3 = vcmp.eq(v1.h,v25.h)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v27.uh = vlsr(v25.uh,r2)
; CHECK-NEXT:     v28.h = vsub(v2.h,v4.h)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v29.uh = vlsr(v7.uh,r2)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v1.h = vasl(v28.h,r4)
; CHECK-NEXT:     q3 = vsetq(r7)
; CHECK-NEXT:     v2 = vmux(q3,v29,v27)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v2 = vor(v30,v2)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v31 = vor(v2,v1)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v0 = vmux(q2,v3,v31)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     jumpr r31
; CHECK-NEXT:     if (q3) vmem(r1+#0) = v0
; CHECK-NEXT:    }
  %v0 = load <32 x i16>, ptr %a0, align 128
  %v1 = sitofp <32 x i16> %v0 to <32 x half>
  store <32 x half> %v1, ptr %a1, align 128
  ret void
}


; s16 -> f32
; No widening
define void @s16f32_0(ptr %a0, ptr %a1) #0 {
; CHECK-LABEL: s16f32_0:
; CHECK:         .cfi_startproc
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    {
; CHECK-NEXT:     v0 = vmem(r0+#0)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     r0 = #1
; CHECK-NEXT:     r3:2 = combine(##255,#8)
; CHECK-NEXT:     v1:0.w = vunpack(v0.h)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v3 = vsplat(r0)
; CHECK-NEXT:     r7 = #512
; CHECK-NEXT:     v4.w = vabs(v0.w)
; CHECK-NEXT:     v6.w = vabs(v1.w)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v5 = vsplat(r3)
; CHECK-NEXT:     v9 = vsplat(r7)
; CHECK-NEXT:     r5 = #159
; CHECK-NEXT:     v2 = vxor(v2,v2)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v13 = vsplat(r5)
; CHECK-NEXT:     r6 = ##-2147483648
; CHECK-NEXT:     v7.uw = vcl0(v4.uw)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v10 = vsplat(r6)
; CHECK-NEXT:     v8.uw = vcl0(v6.uw)
; CHECK-NEXT:     q0 = vcmp.gt(v2.w,v0.w)
; CHECK-NEXT:     v7.w = vadd(v7.w,v3.w)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     r4 = #23
; CHECK-NEXT:     v8.w = vadd(v8.w,v3.w)
; CHECK-NEXT:     v27 = vmux(q0,v10,v2)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v4.w = vasl(v4.w,v7.w)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v6.w = vasl(v6.w,v8.w)
; CHECK-NEXT:     v11.w = vadd(v4.w,v5.w)
; CHECK-NEXT:     v12 = vand(v4,v9)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v5.w = vadd(v6.w,v5.w)
; CHECK-NEXT:     v9 = vand(v6,v9)
; CHECK-NEXT:     q1 = vcmp.eq(v12.w,v2.w)
; CHECK-NEXT:     q2 = vcmp.gt(v4.uw,v11.uw)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v22.uw = vlsr(v11.uw,r2)
; CHECK-NEXT:     q3 = vcmp.eq(v9.w,v2.w)
; CHECK-NEXT:     v23 = vmux(q1,v2,v3)
; CHECK-NEXT:     v14 = vmux(q2,v3,v2)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v24.uw = vlsr(v5.uw,r2)
; CHECK-NEXT:     v11.w = vadd(v22.w,v23.w)
; CHECK-NEXT:     q2 = vcmp.gt(v6.uw,v5.uw)
; CHECK-NEXT:     v25 = vmux(q3,v2,v3)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v21.uw = vlsr(v4.uw,r2)
; CHECK-NEXT:     v5.w = vadd(v24.w,v25.w)
; CHECK-NEXT:     v3 = vmux(q2,v3,v2)
; CHECK-NEXT:     v7.w = vsub(v14.w,v7.w)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v6.uw = vlsr(v6.uw,r2)
; CHECK-NEXT:     v3.w = vsub(v3.w,v8.w)
; CHECK-NEXT:     q3 = vcmp.eq(v21.w,v22.w)
; CHECK-NEXT:     v7.w = vadd(v7.w,v13.w)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v4.uw = vlsr(v22.uw,r0)
; CHECK-NEXT:     v3.w = vadd(v3.w,v13.w)
; CHECK-NEXT:     q2 = vcmp.eq(v6.w,v24.w)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v11.uw = vlsr(v11.uw,r0)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v5.uw = vlsr(v5.uw,r0)
; CHECK-NEXT:     v4 = vmux(q3,v11,v4)
; CHECK-NEXT:     q3 = vcmp.gt(v2.w,v1.w)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v26.uw = vlsr(v24.uw,r0)
; CHECK-NEXT:     v28 = vmux(q3,v10,v2)
; CHECK-NEXT:     v4 = vor(v27,v4)
; CHECK-NEXT:     q3 = vcmp.eq(v0.w,v2.w)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v7.w = vasl(v7.w,r4)
; CHECK-NEXT:     v5 = vmux(q2,v5,v26)
; CHECK-NEXT:     q2 = vcmp.eq(v1.w,v2.w)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v3.w = vasl(v3.w,r4)
; CHECK-NEXT:     v5 = vor(v28,v5)
; CHECK-NEXT:     v29 = vor(v4,v7)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v3 = vor(v5,v3)
; CHECK-NEXT:     v31 = vmux(q3,v2,v29)
; CHECK-NEXT:     vmem(r1+#0) = v31.new
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v30 = vmux(q2,v2,v3)
; CHECK-NEXT:     jumpr r31
; CHECK-NEXT:     vmem(r1+#1) = v30.new
; CHECK-NEXT:    }
  %v0 = load <64 x i16>, ptr %a0, align 128
  %v1 = sitofp <64 x i16> %v0 to <64 x float>
  store <64 x float> %v1, ptr %a1, align 128
  ret void
}

; Widen input
define void @s16f32_1(ptr %a0, ptr %a1) #0 {
; CHECK-LABEL: s16f32_1:
; CHECK:         .cfi_startproc
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    {
; CHECK-NEXT:     v0 = vmem(r0+#0)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     r0 = #1
; CHECK-NEXT:     r2 = #255
; CHECK-NEXT:     v1:0.w = vunpack(v0.h)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v3 = vsplat(r0)
; CHECK-NEXT:     v4 = vsplat(r2)
; CHECK-NEXT:     r3 = #512
; CHECK-NEXT:     v2.w = vabs(v0.w)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v6 = vsplat(r3)
; CHECK-NEXT:     r7:6 = combine(##-2147483648,#8)
; CHECK-NEXT:     v1 = vxor(v1,v1)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     r4 = #159
; CHECK-NEXT:     v5.uw = vcl0(v2.uw)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v7 = vsplat(r4)
; CHECK-NEXT:     v29 = vsplat(r7)
; CHECK-NEXT:     q2 = vcmp.gt(v1.w,v0.w)
; CHECK-NEXT:     v5.w = vadd(v5.w,v3.w)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     r2 = #23
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v2.w = vasl(v2.w,v5.w)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v4.w = vadd(v2.w,v4.w)
; CHECK-NEXT:     v6 = vand(v2,v6)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v2.uw = vlsr(v2.uw,r6)
; CHECK-NEXT:     q0 = vcmp.eq(v6.w,v1.w)
; CHECK-NEXT:     q1 = vcmp.gt(v2.uw,v4.uw)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v4.uw = vlsr(v4.uw,r6)
; CHECK-NEXT:     v6 = vmux(q0,v1,v3)
; CHECK-NEXT:     v3 = vmux(q1,v3,v1)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v6.w = vadd(v4.w,v6.w)
; CHECK-NEXT:     v27.w = vsub(v3.w,v5.w)
; CHECK-NEXT:     q3 = vcmp.eq(v2.w,v4.w)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v28.uw = vlsr(v4.uw,r0)
; CHECK-NEXT:     v2.w = vadd(v27.w,v7.w)
; CHECK-NEXT:     v4 = vmux(q2,v29,v1)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v30.uw = vlsr(v6.uw,r0)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v2.w = vasl(v2.w,r2)
; CHECK-NEXT:     v3 = vmux(q3,v30,v28)
; CHECK-NEXT:     q3 = vcmp.eq(v0.w,v1.w)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v3 = vor(v4,v3)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v31 = vor(v3,v2)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v0 = vmux(q3,v1,v31)
; CHECK-NEXT:     jumpr r31
; CHECK-NEXT:     vmem(r1+#0) = v0.new
; CHECK-NEXT:    }
  %v0 = load <32 x i16>, ptr %a0, align 128
  %v1 = sitofp <32 x i16> %v0 to <32 x float>
  store <32 x float> %v1, ptr %a1, align 128
  ret void
}


; s32 -> f16
; No widening
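; The i32 elements are first converted to f32 bit patterns as above; the
; trailing f32 -> f16 step then goes through the qf32 intermediate format
; (vadd(...sf,...sf) into a .qf32 pair, narrowed by v0.hf = v3:2.qf32) and
; vdeal restores the element order before the store.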
define void @s32f16_0(ptr %a0, ptr %a1) #0 {
; CHECK-LABEL: s32f16_0:
; CHECK:         .cfi_startproc
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    {
; CHECK-NEXT:     r3:2 = combine(#8,#1)
; CHECK-NEXT:     r6 = #255
; CHECK-NEXT:     v6.w = vabs(v1.w)
; CHECK-NEXT:     v1.cur = vmem(r0+#0)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v2 = vsplat(r2)
; CHECK-NEXT:     r4 = #512
; CHECK-NEXT:     v5.w = vabs(v0.w)
; CHECK-NEXT:     v0.cur = vmem(r0+#1)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v9 = vsplat(r4)
; CHECK-NEXT:     v8 = vsplat(r6)
; CHECK-NEXT:     v3.uw = vcl0(v6.uw)
; CHECK-NEXT:     v20 = vxor(v20,v20)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     r4 = #159
; CHECK-NEXT:     v4.uw = vcl0(v5.uw)
; CHECK-NEXT:     v3.w = vadd(v3.w,v2.w)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v27 = vsplat(r4)
; CHECK-NEXT:     r5 = ##-2147483648
; CHECK-NEXT:     v7.w = vadd(v4.w,v2.w)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v13 = vsplat(r5)
; CHECK-NEXT:     v6.w = vasl(v6.w,v3.w)
; CHECK-NEXT:     q0 = vcmp.gt(v20.w,v1.w)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v5.w = vasl(v5.w,v7.w)
; CHECK-NEXT:     v26 = vmux(q0,v13,v20)
; CHECK-NEXT:     v10.w = vadd(v6.w,v8.w)
; CHECK-NEXT:     v11 = vand(v6,v9)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v9 = vand(v5,v9)
; CHECK-NEXT:     q3 = vcmp.eq(v11.w,v20.w)
; CHECK-NEXT:     v8.w = vadd(v5.w,v8.w)
; CHECK-NEXT:     q1 = vcmp.gt(v6.uw,v10.uw)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v21.uw = vlsr(v10.uw,r3)
; CHECK-NEXT:     q2 = vcmp.eq(v9.w,v20.w)
; CHECK-NEXT:     v22 = vmux(q3,v20,v2)
; CHECK-NEXT:     q3 = vcmp.gt(v5.uw,v8.uw)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v8.uw = vlsr(v8.uw,r3)
; CHECK-NEXT:     v9.w = vadd(v21.w,v22.w)
; CHECK-NEXT:     v24 = vmux(q2,v20,v2)
; CHECK-NEXT:     v23 = vmux(q1,v2,v20)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v12.uw = vlsr(v6.uw,r3)
; CHECK-NEXT:     v2 = vmux(q3,v2,v20)
; CHECK-NEXT:     v25.w = vadd(v8.w,v24.w)
; CHECK-NEXT:     v3.w = vsub(v23.w,v3.w)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v5.uw = vlsr(v5.uw,r3)
; CHECK-NEXT:     v2.w = vsub(v2.w,v7.w)
; CHECK-NEXT:     q3 = vcmp.eq(v12.w,v21.w)
; CHECK-NEXT:     v3.w = vadd(v3.w,v27.w)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     r3 = #23
; CHECK-NEXT:     v6.uw = vlsr(v21.uw,r2)
; CHECK-NEXT:     q2 = vcmp.eq(v5.w,v8.w)
; CHECK-NEXT:     v2.w = vadd(v2.w,v27.w)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v9.uw = vlsr(v9.uw,r2)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v28.uw = vlsr(v25.uw,r2)
; CHECK-NEXT:     v6 = vmux(q3,v9,v6)
; CHECK-NEXT:     q3 = vcmp.gt(v20.w,v0.w)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v29.uw = vlsr(v8.uw,r2)
; CHECK-NEXT:     v30 = vmux(q3,v13,v20)
; CHECK-NEXT:     v6 = vor(v26,v6)
; CHECK-NEXT:     q3 = vcmp.eq(v0.w,v20.w)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v3.w = vasl(v3.w,r3)
; CHECK-NEXT:     v5 = vmux(q2,v28,v29)
; CHECK-NEXT:     q2 = vcmp.eq(v1.w,v20.w)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v2.w = vasl(v2.w,r3)
; CHECK-NEXT:     v31 = vor(v30,v5)
; CHECK-NEXT:     v3 = vor(v6,v3)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v1 = vor(v31,v2)
; CHECK-NEXT:     v3 = vmux(q2,v20,v3)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v0 = vmux(q3,v20,v1)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v2.qf32 = vadd(v3.sf,v20.sf)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v3.qf32 = vadd(v0.sf,v20.sf)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v0.hf = v3:2.qf32
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v0.h = vdeal(v0.h)
; CHECK-NEXT:     jumpr r31
; CHECK-NEXT:     vmem(r1+#0) = v0.new
; CHECK-NEXT:    }
  %v0 = load <64 x i32>, ptr %a0, align 128
  %v1 = sitofp <64 x i32> %v0 to <64 x half>
  store <64 x half> %v1, ptr %a1, align 128
  ret void
}

; Widen result
define void @s32f16_1(ptr %a0, ptr %a1) #0 {
; CHECK-LABEL: s32f16_1:
; CHECK:         .cfi_startproc
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    {
; CHECK-NEXT:     r6 = #1
; CHECK-NEXT:     v1.w = vabs(v0.w)
; CHECK-NEXT:     v0.cur = vmem(r0+#0)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v2 = vsplat(r6)
; CHECK-NEXT:     r3:2 = combine(##255,#8)
; CHECK-NEXT:     r4 = #512
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v5 = vsplat(r3)
; CHECK-NEXT:     v6 = vsplat(r4)
; CHECK-NEXT:     v4.uw = vcl0(v1.uw)
; CHECK-NEXT:     v3 = vxor(v3,v3)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     r5 = #159
; CHECK-NEXT:     r4 = ##-2147483648
; CHECK-NEXT:     v4.w = vadd(v4.w,v2.w)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v28 = vsplat(r5)
; CHECK-NEXT:     v29 = vsplat(r4)
; CHECK-NEXT:     q3 = vcmp.gt(v3.w,v0.w)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     r3 = #23
; CHECK-NEXT:     v1.w = vasl(v1.w,v4.w)
; CHECK-NEXT:     v31 = vmux(q3,v29,v3)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v5.w = vadd(v1.w,v5.w)
; CHECK-NEXT:     v6 = vand(v1,v6)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v7.uw = vlsr(v1.uw,r2)
; CHECK-NEXT:     q0 = vcmp.eq(v6.w,v3.w)
; CHECK-NEXT:     q1 = vcmp.gt(v1.uw,v5.uw)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     r2 = #64
; CHECK-NEXT:     v1.uw = vlsr(v5.uw,r2)
; CHECK-NEXT:     v27 = vmux(q0,v3,v2)
; CHECK-NEXT:     v2 = vmux(q1,v2,v3)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     q3 = vsetq(r2)
; CHECK-NEXT:     v5.w = vadd(v1.w,v27.w)
; CHECK-NEXT:     v2.w = vsub(v2.w,v4.w)
; CHECK-NEXT:     q2 = vcmp.eq(v7.w,v1.w)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v1.uw = vlsr(v1.uw,r6)
; CHECK-NEXT:     v2.w = vadd(v2.w,v28.w)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v30.uw = vlsr(v5.uw,r6)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v2.w = vasl(v2.w,r3)
; CHECK-NEXT:     v1 = vmux(q2,v30,v1)
; CHECK-NEXT:     q2 = vcmp.eq(v0.w,v3.w)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v1 = vor(v31,v1)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v1.qf32 = vadd(v3.sf,v3.sf)
; CHECK-NEXT:     v0 = vor(v1,v2)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v0 = vmux(q2,v3,v0)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v0.qf32 = vadd(v0.sf,v3.sf)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v0.hf = v1:0.qf32
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v0.h = vdeal(v0.h)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     jumpr r31
; CHECK-NEXT:     if (q3) vmem(r1+#0) = v0
; CHECK-NEXT:    }
  %v0 = load <32 x i32>, ptr %a0, align 128
  %v1 = sitofp <32 x i32> %v0 to <32 x half>
  store <32 x half> %v1, ptr %a1, align 128
  ret void
}

; s32 -> f32
; No widening
define void @s32f32_0(ptr %a0, ptr %a1) #0 {
; CHECK-LABEL: s32f32_0:
; CHECK:         .cfi_startproc
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    {
; CHECK-NEXT:     r0 = #1
; CHECK-NEXT:     r2 = #255
; CHECK-NEXT:     v1.w = vabs(v0.w)
; CHECK-NEXT:     v0.cur = vmem(r0+#0)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v3 = vsplat(r0)
; CHECK-NEXT:     v5 = vsplat(r2)
; CHECK-NEXT:     r3 = #512
; CHECK-NEXT:     v2 = vxor(v2,v2)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v6 = vsplat(r3)
; CHECK-NEXT:     r7:6 = combine(##-2147483648,#8)
; CHECK-NEXT:     v4.uw = vcl0(v1.uw)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     r4 = #159
; CHECK-NEXT:     v4.w = vadd(v4.w,v3.w)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v7 = vsplat(r4)
; CHECK-NEXT:     v29 = vsplat(r7)
; CHECK-NEXT:     r2 = #23
; CHECK-NEXT:     q2 = vcmp.gt(v2.w,v0.w)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v1.w = vasl(v1.w,v4.w)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v5.w = vadd(v1.w,v5.w)
; CHECK-NEXT:     v6 = vand(v1,v6)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v1.uw = vlsr(v1.uw,r6)
; CHECK-NEXT:     q0 = vcmp.eq(v6.w,v2.w)
; CHECK-NEXT:     q1 = vcmp.gt(v1.uw,v5.uw)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v5.uw = vlsr(v5.uw,r6)
; CHECK-NEXT:     v6 = vmux(q0,v2,v3)
; CHECK-NEXT:     v3 = vmux(q1,v3,v2)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v6.w = vadd(v5.w,v6.w)
; CHECK-NEXT:     v27.w = vsub(v3.w,v4.w)
; CHECK-NEXT:     q3 = vcmp.eq(v1.w,v5.w)
; CHECK-NEXT:     v4 = vmux(q2,v29,v2)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v28.uw = vlsr(v5.uw,r0)
; CHECK-NEXT:     v1.w = vadd(v27.w,v7.w)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v30.uw = vlsr(v6.uw,r0)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v1.w = vasl(v1.w,r2)
; CHECK-NEXT:     v3 = vmux(q3,v30,v28)
; CHECK-NEXT:     q3 = vcmp.eq(v0.w,v2.w)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v3 = vor(v4,v3)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v31 = vor(v3,v1)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v0 = vmux(q3,v2,v31)
; CHECK-NEXT:     jumpr r31
; CHECK-NEXT:     vmem(r1+#0) = v0.new
; CHECK-NEXT:    }
  %v0 = load <32 x i32>, ptr %a0, align 128
  %v1 = sitofp <32 x i32> %v0 to <32 x float>
  store <32 x float> %v1, ptr %a1, align 128
  ret void
}

; Widen input and result
define void @s32f32_1(ptr %a0, ptr %a1) #0 {
; CHECK-LABEL: s32f32_1:
; CHECK:         .cfi_startproc
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    {
; CHECK-NEXT:     r0 = #1
; CHECK-NEXT:     r2 = #255
; CHECK-NEXT:     v1.w = vabs(v0.w)
; CHECK-NEXT:     v0.cur = vmem(r0+#0)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v2 = vsplat(r0)
; CHECK-NEXT:     v5 = vsplat(r2)
; CHECK-NEXT:     r3 = #512
; CHECK-NEXT:     v3 = vxor(v3,v3)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v6 = vsplat(r3)
; CHECK-NEXT:     r7:6 = combine(##-2147483648,#8)
; CHECK-NEXT:     v4.uw = vcl0(v1.uw)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     r4 = #159
; CHECK-NEXT:     v4.w = vadd(v4.w,v2.w)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v7 = vsplat(r4)
; CHECK-NEXT:     v29 = vsplat(r7)
; CHECK-NEXT:     r3 = #23
; CHECK-NEXT:     q3 = vcmp.gt(v3.w,v0.w)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     r2 = #64
; CHECK-NEXT:     v1.w = vasl(v1.w,v4.w)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v5.w = vadd(v1.w,v5.w)
; CHECK-NEXT:     v6 = vand(v1,v6)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v1.uw = vlsr(v1.uw,r6)
; CHECK-NEXT:     q0 = vcmp.eq(v6.w,v3.w)
; CHECK-NEXT:     q1 = vcmp.gt(v1.uw,v5.uw)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v5.uw = vlsr(v5.uw,r6)
; CHECK-NEXT:     v6 = vmux(q0,v3,v2)
; CHECK-NEXT:     v2 = vmux(q1,v2,v3)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v6.w = vadd(v5.w,v6.w)
; CHECK-NEXT:     v27.w = vsub(v2.w,v4.w)
; CHECK-NEXT:     q2 = vcmp.eq(v1.w,v5.w)
; CHECK-NEXT:     v4 = vmux(q3,v29,v3)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v28.uw = vlsr(v5.uw,r0)
; CHECK-NEXT:     q3 = vsetq(r2)
; CHECK-NEXT:     v1.w = vadd(v27.w,v7.w)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v30.uw = vlsr(v6.uw,r0)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v1.w = vasl(v1.w,r3)
; CHECK-NEXT:     v2 = vmux(q2,v30,v28)
; CHECK-NEXT:     q2 = vcmp.eq(v0.w,v3.w)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v2 = vor(v4,v2)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v31 = vor(v2,v1)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v0 = vmux(q2,v3,v31)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     jumpr r31
; CHECK-NEXT:     if (q3) vmem(r1+#0) = v0
; CHECK-NEXT:    }
  %v0 = load <16 x i32>, ptr %a0, align 128
  %v1 = sitofp <16 x i32> %v0 to <16 x float>
  store <16 x float> %v1, ptr %a1, align 128
  ret void
}


; u8 -> f16
; No widening
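; The unsigned variants follow the same normalize/round/pack sequence as the
; signed ones but skip sign handling: no 0x8000/0x80000000 sign mask and no
; negative-input mux appear below.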
define void @u8f16_0(ptr %a0, ptr %a1) #0 {
; CHECK-LABEL: u8f16_0:
; CHECK:         .cfi_startproc
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    {
; CHECK-NEXT:     v0 = vmem(r0+#0)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     r6 = #1
; CHECK-NEXT:     r3:2 = combine(#31,#5)
; CHECK-NEXT:     v1:0.uh = vunpack(v0.ub)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v3.h = vsplat(r6)
; CHECK-NEXT:     v4.h = vsplat(r3)
; CHECK-NEXT:     r5 = #64
; CHECK-NEXT:     v2 = vxor(v2,v2)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v6.h = vsplat(r5)
; CHECK-NEXT:     r4 = #10
; CHECK-NEXT:     v5.uh = vcl0(v0.uh)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v7.uh = vcl0(v1.uh)
; CHECK-NEXT:     v5.h = vadd(v5.h,v3.h)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v7.h = vadd(v7.h,v3.h)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v8.h = vasl(v0.h,v5.h)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v11.h = vasl(v1.h,v7.h)
; CHECK-NEXT:     v10 = vand(v8,v6)
; CHECK-NEXT:     v9.h = vadd(v8.h,v4.h)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v22.h = vadd(v11.h,v4.h)
; CHECK-NEXT:     v6 = vand(v11,v6)
; CHECK-NEXT:     q0 = vcmp.gt(v8.uh,v9.uh)
; CHECK-NEXT:     q1 = vcmp.eq(v10.h,v2.h)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v21.uh = vlsr(v8.uh,r2)
; CHECK-NEXT:     q2 = vcmp.eq(v6.h,v2.h)
; CHECK-NEXT:     q3 = vcmp.gt(v11.uh,v22.uh)
; CHECK-NEXT:     v12 = vmux(q1,v2,v3)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v9.uh = vlsr(v9.uh,r2)
; CHECK-NEXT:     v13 = vmux(q2,v2,v3)
; CHECK-NEXT:     v25 = vmux(q0,v3,v2)
; CHECK-NEXT:     v3 = vmux(q3,v3,v2)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v8.uh = vlsr(v22.uh,r2)
; CHECK-NEXT:     v24.h = vadd(v9.h,v12.h)
; CHECK-NEXT:     v3.h = vadd(v3.h,v4.h)
; CHECK-NEXT:     v12.h = vadd(v25.h,v4.h)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v23.uh = vlsr(v11.uh,r2)
; CHECK-NEXT:     v13.h = vadd(v8.h,v13.h)
; CHECK-NEXT:     v5.h = vsub(v12.h,v5.h)
; CHECK-NEXT:     v3.h = vsub(v3.h,v7.h)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v14.uh = vlsr(v9.uh,r6)
; CHECK-NEXT:     q2 = vcmp.eq(v21.h,v9.h)
; CHECK-NEXT:     q3 = vcmp.eq(v23.h,v8.h)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v26.uh = vlsr(v24.uh,r6)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v27.uh = vlsr(v13.uh,r6)
; CHECK-NEXT:     v4 = vmux(q2,v26,v14)
; CHECK-NEXT:     q2 = vcmp.eq(v1.h,v2.h)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v28.uh = vlsr(v8.uh,r6)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v5.h = vasl(v5.h,r4)
; CHECK-NEXT:     v6 = vmux(q3,v27,v28)
; CHECK-NEXT:     q3 = vcmp.eq(v0.h,v2.h)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v3.h = vasl(v3.h,r4)
; CHECK-NEXT:     v29 = vor(v4,v5)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v3 = vor(v6,v3)
; CHECK-NEXT:     v31 = vmux(q3,v2,v29)
; CHECK-NEXT:     vmem(r1+#0) = v31.new
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v30 = vmux(q2,v2,v3)
; CHECK-NEXT:     jumpr r31
; CHECK-NEXT:     vmem(r1+#1) = v30.new
; CHECK-NEXT:    }
  %v0 = load <128 x i8>, ptr %a0, align 128
  %v1 = uitofp <128 x i8> %v0 to <128 x half>
  store <128 x half> %v1, ptr %a1, align 128
  ret void
}

; Widen input
define void @u8f16_1(ptr %a0, ptr %a1) #0 {
; CHECK-LABEL: u8f16_1:
; CHECK:         .cfi_startproc
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    {
; CHECK-NEXT:     v0 = vmem(r0+#0)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     r6 = #1
; CHECK-NEXT:     r3:2 = combine(#64,#31)
; CHECK-NEXT:     v1:0.uh = vunpack(v0.ub)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v1.h = vsplat(r6)
; CHECK-NEXT:     v4.h = vsplat(r2)
; CHECK-NEXT:     r5 = #5
; CHECK-NEXT:     v2 = vxor(v2,v2)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v5.h = vsplat(r3)
; CHECK-NEXT:     r4 = #10
; CHECK-NEXT:     v3.uh = vcl0(v0.uh)
; CHECK-NEXT:     q3 = vcmp.eq(v0.h,v2.h)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v3.h = vadd(v3.h,v1.h)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v6.h = vasl(v0.h,v3.h)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v7.h = vadd(v6.h,v4.h)
; CHECK-NEXT:     v5 = vand(v6,v5)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v6.uh = vlsr(v6.uh,r5)
; CHECK-NEXT:     q0 = vcmp.gt(v6.uh,v7.uh)
; CHECK-NEXT:     q1 = vcmp.eq(v5.h,v2.h)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v26.uh = vlsr(v7.uh,r5)
; CHECK-NEXT:     v27 = vmux(q1,v2,v1)
; CHECK-NEXT:     v1 = vmux(q0,v1,v2)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v1.h = vadd(v1.h,v4.h)
; CHECK-NEXT:     v28.h = vadd(v26.h,v27.h)
; CHECK-NEXT:     q2 = vcmp.eq(v6.h,v26.h)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v29.uh = vlsr(v26.uh,r6)
; CHECK-NEXT:     v1.h = vsub(v1.h,v3.h)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v30.uh = vlsr(v28.uh,r6)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v1.h = vasl(v1.h,r4)
; CHECK-NEXT:     v3 = vmux(q2,v30,v29)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v31 = vor(v3,v1)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v0 = vmux(q3,v2,v31)
; CHECK-NEXT:     jumpr r31
; CHECK-NEXT:     vmem(r1+#0) = v0.new
; CHECK-NEXT:    }
  %v0 = load <64 x i8>, ptr %a0, align 128
  %v1 = uitofp <64 x i8> %v0 to <64 x half>
  store <64 x half> %v1, ptr %a1, align 128
  ret void
}


; u8 -> f32
; No widening
define void @u8f32_0(ptr %a0, ptr %a1) #0 {
; CHECK-LABEL: u8f32_0:
; CHECK:         .cfi_startproc
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    {
; CHECK-NEXT:     r7 = #64
; CHECK-NEXT:     r0 = #1
; CHECK-NEXT:     r6 = #512
; CHECK-NEXT:     v0 = vmem(r0+#0)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v4 = vsplat(r0)
; CHECK-NEXT:     r3:2 = combine(##255,#8)
; CHECK-NEXT:     v1 = valign(v0,v0,r7)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v15 = vsplat(r6)
; CHECK-NEXT:     v6 = vsplat(r3)
; CHECK-NEXT:     r5 = #159
; CHECK-NEXT:     v3:2.uh = vunpack(v0.ub)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     r4 = #23
; CHECK-NEXT:     v31:30.uh = vunpack(v1.ub)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v3:2.uw = vunpack(v2.uh)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v1:0.uw = vunpack(v30.uh)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v5.uw = vcl0(v2.uw)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v7.uw = vcl0(v0.uw)
; CHECK-NEXT:     v5.w = vadd(v5.w,v4.w)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v8.uw = vcl0(v3.uw)
; CHECK-NEXT:     v11.w = vadd(v7.w,v4.w)
; CHECK-NEXT:     v7 = vxor(v7,v7)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v9.uw = vcl0(v1.uw)
; CHECK-NEXT:     v10.w = vadd(v8.w,v4.w)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v9 = vsplat(r5)
; CHECK-NEXT:     v14.w = vasl(v0.w,v11.w)
; CHECK-NEXT:     v8.w = vadd(v9.w,v4.w)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v12.w = vasl(v2.w,v5.w)
; CHECK-NEXT:     v24 = vand(v14,v15)
; CHECK-NEXT:     v20.w = vadd(v14.w,v6.w)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v13.w = vasl(v3.w,v10.w)
; CHECK-NEXT:     v19 = vand(v12,v15)
; CHECK-NEXT:     q3 = vcmp.eq(v24.w,v7.w)
; CHECK-NEXT:     v18.w = vadd(v12.w,v6.w)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v16.w = vasl(v1.w,v8.w)
; CHECK-NEXT:     v23 = vand(v13,v15)
; CHECK-NEXT:     v22.w = vadd(v13.w,v6.w)
; CHECK-NEXT:     q0 = vcmp.gt(v14.uw,v20.uw)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v6.w = vadd(v16.w,v6.w)
; CHECK-NEXT:     v15 = vand(v16,v15)
; CHECK-NEXT:     v30 = vmux(q3,v7,v4)
; CHECK-NEXT:     q2 = vcmp.eq(v19.w,v7.w)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v21.uw = vlsr(v14.uw,r2)
; CHECK-NEXT:     q3 = vcmp.eq(v15.w,v7.w)
; CHECK-NEXT:     v28 = vmux(q0,v4,v7)
; CHECK-NEXT:     q1 = vcmp.eq(v23.w,v7.w)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v14.uw = vlsr(v20.uw,r2)
; CHECK-NEXT:     v26 = vmux(q3,v7,v4)
; CHECK-NEXT:     v11.w = vsub(v28.w,v11.w)
; CHECK-NEXT:     q3 = vcmp.gt(v13.uw,v22.uw)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v15.uw = vlsr(v6.uw,r2)
; CHECK-NEXT:     v20.w = vadd(v14.w,v30.w)
; CHECK-NEXT:     v30 = vmux(q1,v7,v4)
; CHECK-NEXT:     v31 = vmux(q2,v7,v4)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v19.uw = vlsr(v18.uw,r2)
; CHECK-NEXT:     v29.w = vadd(v15.w,v26.w)
; CHECK-NEXT:     q1 = vcmp.gt(v12.uw,v18.uw)
; CHECK-NEXT:     v11.w = vadd(v11.w,v9.w)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v28.uw = vlsr(v22.uw,r2)
; CHECK-NEXT:     v23.w = vadd(v19.w,v31.w)
; CHECK-NEXT:     v22 = vmux(q3,v4,v7)
; CHECK-NEXT:     q3 = vcmp.gt(v16.uw,v6.uw)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v24.uw = vlsr(v29.uw,r0)
; CHECK-NEXT:     v31.w = vadd(v28.w,v30.w)
; CHECK-NEXT:     v30 = vmux(q1,v4,v7)
; CHECK-NEXT:     v4 = vmux(q3,v4,v7)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v17.uw = vlsr(v12.uw,r2)
; CHECK-NEXT:     v5.w = vsub(v30.w,v5.w)
; CHECK-NEXT:     v29.w = vsub(v22.w,v10.w)
; CHECK-NEXT:     v4.w = vsub(v4.w,v8.w)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v13.uw = vlsr(v13.uw,r2)
; CHECK-NEXT:     v6.w = vadd(v29.w,v9.w)
; CHECK-NEXT:     v5.w = vadd(v5.w,v9.w)
; CHECK-NEXT:     q0 = vcmp.eq(v21.w,v14.w)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v25.uw = vlsr(v16.uw,r2)
; CHECK-NEXT:     q2 = vcmp.eq(v17.w,v19.w)
; CHECK-NEXT:     q3 = vcmp.eq(v13.w,v28.w)
; CHECK-NEXT:     v4.w = vadd(v4.w,v9.w)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v21.uw = vlsr(v23.uw,r0)
; CHECK-NEXT:     q1 = vcmp.eq(v25.w,v15.w)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v23.uw = vlsr(v19.uw,r0)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v31.uw = vlsr(v31.uw,r0)
; CHECK-NEXT:     v23 = vmux(q2,v21,v23)
; CHECK-NEXT:     q2 = vcmp.eq(v3.w,v7.w)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v16.uw = vlsr(v28.uw,r0)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v26.uw = vlsr(v15.uw,r0)
; CHECK-NEXT:     v8 = vmux(q3,v31,v16)
; CHECK-NEXT:     q3 = vcmp.eq(v2.w,v7.w)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v6.w = vasl(v6.w,r4)
; CHECK-NEXT:     v22 = vmux(q1,v24,v26)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v5.w = vasl(v5.w,r4)
; CHECK-NEXT:     v6 = vor(v8,v6)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v27.uw = vlsr(v14.uw,r0)
; CHECK-NEXT:     v25 = vor(v23,v5)
; CHECK-NEXT:     v26 = vmux(q2,v7,v6)
; CHECK-NEXT:     vmem(r1+#1) = v26.new
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v20.uw = vlsr(v20.uw,r0)
; CHECK-NEXT:     v28 = vmux(q3,v7,v25)
; CHECK-NEXT:     q2 = vcmp.eq(v1.w,v7.w)
; CHECK-NEXT:     vmem(r1+#0) = v28.new
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v11.w = vasl(v11.w,r4)
; CHECK-NEXT:     v20 = vmux(q0,v20,v27)
; CHECK-NEXT:     q3 = vcmp.eq(v0.w,v7.w)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v24.w = vasl(v4.w,r4)
; CHECK-NEXT:     v29 = vor(v20,v11)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v27 = vor(v22,v24)
; CHECK-NEXT:     v31 = vmux(q3,v7,v29)
; CHECK-NEXT:     vmem(r1+#2) = v31.new
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v30 = vmux(q2,v7,v27)
; CHECK-NEXT:     jumpr r31
; CHECK-NEXT:     vmem(r1+#3) = v30.new
; CHECK-NEXT:    }
  %v0 = load <128 x i8>, ptr %a0, align 128
  %v1 = uitofp <128 x i8> %v0 to <128 x float>
  store <128 x float> %v1, ptr %a1, align 128
  ret void
}

; Widen input #1
define void @u8f32_1(ptr %a0, ptr %a1) #0 {
; CHECK-LABEL: u8f32_1:
; CHECK:         .cfi_startproc
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    {
; CHECK-NEXT:     r7 = #1
; CHECK-NEXT:     r6 = #512
; CHECK-NEXT:     v3:2.uh = vunpack(v0.ub)
; CHECK-NEXT:     v0.cur = vmem(r0+#0)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v1 = vsplat(r7)
; CHECK-NEXT:     v8 = vsplat(r6)
; CHECK-NEXT:     r3:2 = combine(##255,#8)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v6 = vsplat(r3)
; CHECK-NEXT:     r5 = #159
; CHECK-NEXT:     v3:2.uw = vunpack(v2.uh)
; CHECK-NEXT:     v21 = vxor(v21,v21)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v13 = vsplat(r5)
; CHECK-NEXT:     r4 = #23
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v4.uw = vcl0(v2.uw)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v5.uw = vcl0(v3.uw)
; CHECK-NEXT:     v4.w = vadd(v4.w,v1.w)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v5.w = vadd(v5.w,v1.w)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v7.w = vasl(v2.w,v4.w)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v9.w = vasl(v3.w,v5.w)
; CHECK-NEXT:     v11 = vand(v7,v8)
; CHECK-NEXT:     v10.w = vadd(v7.w,v6.w)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v6.w = vadd(v9.w,v6.w)
; CHECK-NEXT:     q1 = vcmp.eq(v11.w,v21.w)
; CHECK-NEXT:     v8 = vand(v9,v8)
; CHECK-NEXT:     q0 = vcmp.gt(v7.uw,v10.uw)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v22.uw = vlsr(v10.uw,r2)
; CHECK-NEXT:     v24 = vmux(q1,v21,v1)
; CHECK-NEXT:     q3 = vcmp.eq(v8.w,v21.w)
; CHECK-NEXT:     q1 = vcmp.gt(v9.uw,v6.uw)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v23.uw = vlsr(v6.uw,r2)
; CHECK-NEXT:     v25 = vmux(q0,v1,v21)
; CHECK-NEXT:     v27 = vmux(q3,v21,v1)
; CHECK-NEXT:     v1 = vmux(q1,v1,v21)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v4.w = vsub(v25.w,v4.w)
; CHECK-NEXT:     v1.w = vsub(v1.w,v5.w)
; CHECK-NEXT:     v10.w = vadd(v22.w,v24.w)
; CHECK-NEXT:     v28.w = vadd(v23.w,v27.w)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v12.uw = vlsr(v7.uw,r2)
; CHECK-NEXT:     v4.w = vadd(v4.w,v13.w)
; CHECK-NEXT:     v1.w = vadd(v1.w,v13.w)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v26.uw = vlsr(v9.uw,r2)
; CHECK-NEXT:     q2 = vcmp.eq(v12.w,v22.w)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v11.uw = vlsr(v22.uw,r7)
; CHECK-NEXT:     q3 = vcmp.eq(v26.w,v23.w)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v30.uw = vlsr(v10.uw,r7)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v29.uw = vlsr(v23.uw,r7)
; CHECK-NEXT:     v5 = vmux(q2,v30,v11)
; CHECK-NEXT:     q2 = vcmp.eq(v3.w,v21.w)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v6.uw = vlsr(v28.uw,r7)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v4.w = vasl(v4.w,r4)
; CHECK-NEXT:     v6 = vmux(q3,v6,v29)
; CHECK-NEXT:     q3 = vcmp.eq(v2.w,v21.w)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v1.w = vasl(v1.w,r4)
; CHECK-NEXT:     v31 = vor(v5,v4)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v1 = vor(v6,v1)
; CHECK-NEXT:     v0 = vmux(q3,v21,v31)
; CHECK-NEXT:     vmem(r1+#0) = v0.new
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v1 = vmux(q2,v21,v1)
; CHECK-NEXT:     jumpr r31
; CHECK-NEXT:     vmem(r1+#1) = v1.new
; CHECK-NEXT:    }
  %v0 = load <64 x i8>, ptr %a0, align 128
  %v1 = uitofp <64 x i8> %v0 to <64 x float>
  store <64 x float> %v1, ptr %a1, align 128
  ret void
}

; Widen input #2
define void @u8f32_2(ptr %a0, ptr %a1) #0 {
; CHECK-LABEL: u8f32_2:
; CHECK:         .cfi_startproc
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    {
; CHECK-NEXT:     v0 = vmem(r0+#0)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     r6 = #1
; CHECK-NEXT:     r3 = #512
; CHECK-NEXT:     v1:0.uh = vunpack(v0.ub)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v2 = vsplat(r6)
; CHECK-NEXT:     v4 = vsplat(r3)
; CHECK-NEXT:     r2 = #255
; CHECK-NEXT:     v3 = vxor(v3,v3)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     r5:4 = combine(##159,#8)
; CHECK-NEXT:     v1:0.uw = vunpack(v0.uh)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v1 = vsplat(r2)
; CHECK-NEXT:     v7 = vsplat(r5)
; CHECK-NEXT:     q3 = vcmp.eq(v0.w,v3.w)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v5.uw = vcl0(v0.uw)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v5.w = vadd(v5.w,v2.w)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v6.w = vasl(v0.w,v5.w)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v1.w = vadd(v6.w,v1.w)
; CHECK-NEXT:     v4 = vand(v6,v4)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v6.uw = vlsr(v6.uw,r4)
; CHECK-NEXT:     q0 = vcmp.gt(v6.uw,v1.uw)
; CHECK-NEXT:     q1 = vcmp.eq(v4.w,v3.w)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     r4 = #23
; CHECK-NEXT:     v1.uw = vlsr(v1.uw,r4)
; CHECK-NEXT:     v4 = vmux(q1,v3,v2)
; CHECK-NEXT:     v2 = vmux(q0,v2,v3)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v2.w = vsub(v2.w,v5.w)
; CHECK-NEXT:     v4.w = vadd(v1.w,v4.w)
; CHECK-NEXT:     q2 = vcmp.eq(v6.w,v1.w)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v29.uw = vlsr(v1.uw,r6)
; CHECK-NEXT:     v2.w = vadd(v2.w,v7.w)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v30.uw = vlsr(v4.uw,r6)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v2.w = vasl(v2.w,r4)
; CHECK-NEXT:     v1 = vmux(q2,v30,v29)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v31 = vor(v1,v2)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v0 = vmux(q3,v3,v31)
; CHECK-NEXT:     jumpr r31
; CHECK-NEXT:     vmem(r1+#0) = v0.new
; CHECK-NEXT:    }
  %v0 = load <32 x i8>, ptr %a0, align 128
  %v1 = uitofp <32 x i8> %v0 to <32 x float>
  store <32 x float> %v1, ptr %a1, align 128
  ret void
}


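; The f16 expansions below appear to use the same scheme scaled to a
; 10-bit mantissa: six bits are dropped, so the rounding bias is #31 with
; #64 as the mantissa-LSB mask, the biased exponent base is #31 (= 15 + 16),
; and the exponent is positioned with vasl by #10.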
; u16 -> f16
; No widening
define void @u16f16_0(ptr %a0, ptr %a1) #0 {
; CHECK-LABEL: u16f16_0:
; CHECK:         .cfi_startproc
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    {
; CHECK-NEXT:     r3:2 = combine(#64,#1)
; CHECK-NEXT:     r5 = #31
; CHECK-NEXT:     v1.uh = vcl0(v0.uh)
; CHECK-NEXT:     v0.cur = vmem(r0+#0)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v2.h = vsplat(r2)
; CHECK-NEXT:     v5.h = vsplat(r3)
; CHECK-NEXT:     r4 = #5
; CHECK-NEXT:     v3 = vxor(v3,v3)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v4.h = vsplat(r5)
; CHECK-NEXT:     r3 = #10
; CHECK-NEXT:     v1.h = vadd(v1.h,v2.h)
; CHECK-NEXT:     q3 = vcmp.eq(v0.h,v3.h)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v6.h = vasl(v0.h,v1.h)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v7.h = vadd(v6.h,v4.h)
; CHECK-NEXT:     v5 = vand(v6,v5)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v6.uh = vlsr(v6.uh,r4)
; CHECK-NEXT:     q0 = vcmp.eq(v5.h,v3.h)
; CHECK-NEXT:     q1 = vcmp.gt(v6.uh,v7.uh)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v26.uh = vlsr(v7.uh,r4)
; CHECK-NEXT:     v27 = vmux(q0,v3,v2)
; CHECK-NEXT:     v2 = vmux(q1,v2,v3)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v2.h = vadd(v2.h,v4.h)
; CHECK-NEXT:     v28.h = vadd(v26.h,v27.h)
; CHECK-NEXT:     q2 = vcmp.eq(v6.h,v26.h)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v29.uh = vlsr(v26.uh,r2)
; CHECK-NEXT:     v1.h = vsub(v2.h,v1.h)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v30.uh = vlsr(v28.uh,r2)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v1.h = vasl(v1.h,r3)
; CHECK-NEXT:     v2 = vmux(q2,v30,v29)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v31 = vor(v2,v1)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v0 = vmux(q3,v3,v31)
; CHECK-NEXT:     jumpr r31
; CHECK-NEXT:     vmem(r1+#0) = v0.new
; CHECK-NEXT:    }
  %v0 = load <64 x i16>, ptr %a0, align 128
  %v1 = uitofp <64 x i16> %v0 to <64 x half>
  store <64 x half> %v1, ptr %a1, align 128
  ret void
}

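; When the result is narrower than a full vector, only the valid part is
; stored: q3 = vsetq(#64) masks the store to the low 64 bytes, as in the
; conditional vmem below.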
; Widen input and result
define void @u16f16_1(ptr %a0, ptr %a1) #0 {
; CHECK-LABEL: u16f16_1:
; CHECK:         .cfi_startproc
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    {
; CHECK-NEXT:     r3:2 = combine(#31,#1)
; CHECK-NEXT:     r6 = #64
; CHECK-NEXT:     v1.uh = vcl0(v0.uh)
; CHECK-NEXT:     v0.cur = vmem(r0+#0)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v2.h = vsplat(r2)
; CHECK-NEXT:     v4.h = vsplat(r3)
; CHECK-NEXT:     r5 = #5
; CHECK-NEXT:     v3 = vxor(v3,v3)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v5.h = vsplat(r6)
; CHECK-NEXT:     r4 = #10
; CHECK-NEXT:     v1.h = vadd(v1.h,v2.h)
; CHECK-NEXT:     q2 = vcmp.eq(v0.h,v3.h)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     q3 = vsetq(r6)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v6.h = vasl(v0.h,v1.h)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v7.h = vadd(v6.h,v4.h)
; CHECK-NEXT:     v5 = vand(v6,v5)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v6.uh = vlsr(v6.uh,r5)
; CHECK-NEXT:     q1 = vcmp.eq(v5.h,v3.h)
; CHECK-NEXT:     q0 = vcmp.gt(v6.uh,v7.uh)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v7.uh = vlsr(v7.uh,r5)
; CHECK-NEXT:     v5 = vmux(q1,v3,v2)
; CHECK-NEXT:     v2 = vmux(q0,v2,v3)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v2.h = vadd(v2.h,v4.h)
; CHECK-NEXT:     v28.h = vadd(v7.h,v5.h)
; CHECK-NEXT:     q1 = vcmp.eq(v6.h,v7.h)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v29.uh = vlsr(v7.uh,r2)
; CHECK-NEXT:     v1.h = vsub(v2.h,v1.h)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v30.uh = vlsr(v28.uh,r2)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v1.h = vasl(v1.h,r4)
; CHECK-NEXT:     v2 = vmux(q1,v30,v29)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v31 = vor(v2,v1)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v0 = vmux(q2,v3,v31)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     jumpr r31
; CHECK-NEXT:     if (q3) vmem(r1+#0) = v0
; CHECK-NEXT:    }
  %v0 = load <32 x i16>, ptr %a0, align 128
  %v1 = uitofp <32 x i16> %v0 to <32 x half>
  store <32 x half> %v1, ptr %a1, align 128
  ret void
}


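; u16 -> f32 unpacks the elements to 32-bit lanes with vunpack and then
; applies the same f32 expansion as above.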
; u16 -> f32
; No widening
define void @u16f32_0(ptr %a0, ptr %a1) #0 {
; CHECK-LABEL: u16f32_0:
; CHECK:         .cfi_startproc
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    {
; CHECK-NEXT:     v0 = vmem(r0+#0)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     r7 = #1
; CHECK-NEXT:     r3:2 = combine(##255,#8)
; CHECK-NEXT:     v1:0.uw = vunpack(v0.uh)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v3 = vsplat(r7)
; CHECK-NEXT:     v6 = vsplat(r3)
; CHECK-NEXT:     r6 = #512
; CHECK-NEXT:     v2 = vxor(v2,v2)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v8 = vsplat(r6)
; CHECK-NEXT:     r5 = #159
; CHECK-NEXT:     r4 = #23
; CHECK-NEXT:     v4.uw = vcl0(v0.uw)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v14 = vsplat(r5)
; CHECK-NEXT:     v5.uw = vcl0(v1.uw)
; CHECK-NEXT:     v4.w = vadd(v4.w,v3.w)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v5.w = vadd(v5.w,v3.w)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v7.w = vasl(v0.w,v4.w)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v9.w = vasl(v1.w,v5.w)
; CHECK-NEXT:     v10.w = vadd(v7.w,v6.w)
; CHECK-NEXT:     v11 = vand(v7,v8)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v6.w = vadd(v9.w,v6.w)
; CHECK-NEXT:     v8 = vand(v9,v8)
; CHECK-NEXT:     q1 = vcmp.eq(v11.w,v2.w)
; CHECK-NEXT:     q0 = vcmp.gt(v7.uw,v10.uw)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v19.uw = vlsr(v10.uw,r2)
; CHECK-NEXT:     q2 = vcmp.eq(v8.w,v2.w)
; CHECK-NEXT:     q3 = vcmp.gt(v9.uw,v6.uw)
; CHECK-NEXT:     v20 = vmux(q1,v2,v3)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v21.uw = vlsr(v6.uw,r2)
; CHECK-NEXT:     v22 = vmux(q2,v2,v3)
; CHECK-NEXT:     v25 = vmux(q0,v3,v2)
; CHECK-NEXT:     v3 = vmux(q3,v3,v2)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v4.w = vsub(v25.w,v4.w)
; CHECK-NEXT:     v3.w = vsub(v3.w,v5.w)
; CHECK-NEXT:     v23.w = vadd(v19.w,v20.w)
; CHECK-NEXT:     v10.w = vadd(v21.w,v22.w)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v12.uw = vlsr(v7.uw,r2)
; CHECK-NEXT:     v4.w = vadd(v4.w,v14.w)
; CHECK-NEXT:     v3.w = vadd(v3.w,v14.w)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v24.uw = vlsr(v9.uw,r2)
; CHECK-NEXT:     q2 = vcmp.eq(v12.w,v19.w)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v13.uw = vlsr(v19.uw,r7)
; CHECK-NEXT:     q3 = vcmp.eq(v24.w,v21.w)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v26.uw = vlsr(v23.uw,r7)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v27.uw = vlsr(v10.uw,r7)
; CHECK-NEXT:     v5 = vmux(q2,v26,v13)
; CHECK-NEXT:     q2 = vcmp.eq(v1.w,v2.w)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v28.uw = vlsr(v21.uw,r7)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v4.w = vasl(v4.w,r4)
; CHECK-NEXT:     v6 = vmux(q3,v27,v28)
; CHECK-NEXT:     q3 = vcmp.eq(v0.w,v2.w)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v3.w = vasl(v3.w,r4)
; CHECK-NEXT:     v29 = vor(v5,v4)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v3 = vor(v6,v3)
; CHECK-NEXT:     v31 = vmux(q3,v2,v29)
; CHECK-NEXT:     vmem(r1+#0) = v31.new
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v30 = vmux(q2,v2,v3)
; CHECK-NEXT:     jumpr r31
; CHECK-NEXT:     vmem(r1+#1) = v30.new
; CHECK-NEXT:    }
  %v0 = load <64 x i16>, ptr %a0, align 128
  %v1 = uitofp <64 x i16> %v0 to <64 x float>
  store <64 x float> %v1, ptr %a1, align 128
  ret void
}

; Widen input
define void @u16f32_1(ptr %a0, ptr %a1) #0 {
; CHECK-LABEL: u16f32_1:
; CHECK:         .cfi_startproc
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    {
; CHECK-NEXT:     v0 = vmem(r0+#0)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     r6 = #1
; CHECK-NEXT:     r2 = #255
; CHECK-NEXT:     v1:0.uw = vunpack(v0.uh)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v1 = vsplat(r6)
; CHECK-NEXT:     v4 = vsplat(r2)
; CHECK-NEXT:     r3 = #512
; CHECK-NEXT:     v2 = vxor(v2,v2)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v5 = vsplat(r3)
; CHECK-NEXT:     r5:4 = combine(##159,#8)
; CHECK-NEXT:     v3.uw = vcl0(v0.uw)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v7 = vsplat(r5)
; CHECK-NEXT:     q3 = vcmp.eq(v0.w,v2.w)
; CHECK-NEXT:     v3.w = vadd(v3.w,v1.w)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v6.w = vasl(v0.w,v3.w)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v4.w = vadd(v6.w,v4.w)
; CHECK-NEXT:     v5 = vand(v6,v5)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v6.uw = vlsr(v6.uw,r4)
; CHECK-NEXT:     q0 = vcmp.gt(v6.uw,v4.uw)
; CHECK-NEXT:     q1 = vcmp.eq(v5.w,v2.w)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     r4 = #23
; CHECK-NEXT:     v4.uw = vlsr(v4.uw,r4)
; CHECK-NEXT:     v5 = vmux(q1,v2,v1)
; CHECK-NEXT:     v1 = vmux(q0,v1,v2)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v1.w = vsub(v1.w,v3.w)
; CHECK-NEXT:     v29.w = vadd(v4.w,v5.w)
; CHECK-NEXT:     q2 = vcmp.eq(v6.w,v4.w)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v30.uw = vlsr(v4.uw,r6)
; CHECK-NEXT:     v1.w = vadd(v1.w,v7.w)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v3.uw = vlsr(v29.uw,r6)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v1.w = vasl(v1.w,r4)
; CHECK-NEXT:     v3 = vmux(q2,v3,v30)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v31 = vor(v3,v1)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v0 = vmux(q3,v2,v31)
; CHECK-NEXT:     jumpr r31
; CHECK-NEXT:     vmem(r1+#0) = v0.new
; CHECK-NEXT:    }
  %v0 = load <32 x i16>, ptr %a0, align 128
  %v1 = uitofp <32 x i16> %v0 to <32 x float>
  store <32 x float> %v1, ptr %a1, align 128
  ret void
}


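; u32 -> f16 first builds the results in f32 format and then converts them
; to half precision: adding zero in qf32 (the vadd(...sf,...sf) with a zero
; operand) moves each vector to qf32, the pair is narrowed with
; v.hf = v:v.qf32, and the trailing vdeal restores linear element order
; (the narrowing conversion leaves the halves deal-ordered).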
; u32 -> f16
; No widening
define void @u32f16_0(ptr %a0, ptr %a1) #0 {
; CHECK-LABEL: u32f16_0:
; CHECK:         .cfi_startproc
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    {
; CHECK-NEXT:     r3:2 = combine(#8,#1)
; CHECK-NEXT:     r6 = #255
; CHECK-NEXT:     v3.uw = vcl0(v0.uw)
; CHECK-NEXT:     v0.cur = vmem(r0+#1)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v2 = vsplat(r2)
; CHECK-NEXT:     r4 = #512
; CHECK-NEXT:     v4.uw = vcl0(v1.uw)
; CHECK-NEXT:     v1.cur = vmem(r0+#0)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v7 = vsplat(r4)
; CHECK-NEXT:     v6 = vsplat(r6)
; CHECK-NEXT:     v4.w = vadd(v4.w,v2.w)
; CHECK-NEXT:     v3.w = vadd(v3.w,v2.w)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     r4 = #159
; CHECK-NEXT:     v9 = vxor(v9,v9)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v10 = vsplat(r4)
; CHECK-NEXT:     v5.w = vasl(v1.w,v4.w)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v8.w = vasl(v0.w,v3.w)
; CHECK-NEXT:     v11.w = vadd(v5.w,v6.w)
; CHECK-NEXT:     v13 = vand(v5,v7)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v6.w = vadd(v8.w,v6.w)
; CHECK-NEXT:     v7 = vand(v8,v7)
; CHECK-NEXT:     q1 = vcmp.gt(v5.uw,v11.uw)
; CHECK-NEXT:     q2 = vcmp.eq(v13.w,v9.w)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v27.uw = vlsr(v11.uw,r3)
; CHECK-NEXT:     q3 = vcmp.gt(v8.uw,v6.uw)
; CHECK-NEXT:     q0 = vcmp.eq(v7.w,v9.w)
; CHECK-NEXT:     v28 = vmux(q2,v9,v2)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v6.uw = vlsr(v6.uw,r3)
; CHECK-NEXT:     v29 = vmux(q1,v2,v9)
; CHECK-NEXT:     v30 = vmux(q3,v2,v9)
; CHECK-NEXT:     v2 = vmux(q0,v9,v2)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v4.w = vsub(v29.w,v4.w)
; CHECK-NEXT:     v7.w = vadd(v27.w,v28.w)
; CHECK-NEXT:     v3.w = vsub(v30.w,v3.w)
; CHECK-NEXT:     v2.w = vadd(v6.w,v2.w)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v12.uw = vlsr(v5.uw,r3)
; CHECK-NEXT:     v4.w = vadd(v4.w,v10.w)
; CHECK-NEXT:     v3.w = vadd(v3.w,v10.w)
; CHECK-NEXT:     q2 = vcmp.eq(v1.w,v9.w)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     r3 = #23
; CHECK-NEXT:     v14.uw = vlsr(v8.uw,r3)
; CHECK-NEXT:     q3 = vcmp.eq(v12.w,v27.w)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v5.uw = vlsr(v27.uw,r2)
; CHECK-NEXT:     q1 = vcmp.eq(v14.w,v6.w)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v7.uw = vlsr(v7.uw,r2)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v2.uw = vlsr(v2.uw,r2)
; CHECK-NEXT:     v5 = vmux(q3,v7,v5)
; CHECK-NEXT:     q3 = vcmp.eq(v0.w,v9.w)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v6.uw = vlsr(v6.uw,r2)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v4.w = vasl(v4.w,r3)
; CHECK-NEXT:     v31 = vmux(q1,v2,v6)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v2.w = vasl(v3.w,r3)
; CHECK-NEXT:     v4 = vor(v5,v4)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v1 = vor(v31,v2)
; CHECK-NEXT:     v3 = vmux(q2,v9,v4)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v0 = vmux(q3,v9,v1)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v2.qf32 = vadd(v3.sf,v9.sf)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v3.qf32 = vadd(v0.sf,v9.sf)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v0.hf = v3:2.qf32
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v0.h = vdeal(v0.h)
; CHECK-NEXT:     jumpr r31
; CHECK-NEXT:     vmem(r1+#0) = v0.new
; CHECK-NEXT:    }
  %v0 = load <64 x i32>, ptr %a0, align 128
  %v1 = uitofp <64 x i32> %v0 to <64 x half>
  store <64 x half> %v1, ptr %a1, align 128
  ret void
}

; Widen result
define void @u32f16_1(ptr %a0, ptr %a1) #0 {
; CHECK-LABEL: u32f16_1:
; CHECK:         .cfi_startproc
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    {
; CHECK-NEXT:     r3:2 = combine(##512,#1)
; CHECK-NEXT:     v1.uw = vcl0(v0.uw)
; CHECK-NEXT:     v0.cur = vmem(r0+#0)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v3 = vsplat(r2)
; CHECK-NEXT:     v5 = vsplat(r3)
; CHECK-NEXT:     r6 = #255
; CHECK-NEXT:     v2 = vxor(v2,v2)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v4 = vsplat(r6)
; CHECK-NEXT:     r5 = #8
; CHECK-NEXT:     r4 = #159
; CHECK-NEXT:     v1.w = vadd(v1.w,v3.w)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v7 = vsplat(r4)
; CHECK-NEXT:     r3 = #23
; CHECK-NEXT:     q2 = vcmp.eq(v0.w,v2.w)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v6.w = vasl(v0.w,v1.w)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v4.w = vadd(v6.w,v4.w)
; CHECK-NEXT:     v5 = vand(v6,v5)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v6.uw = vlsr(v6.uw,r5)
; CHECK-NEXT:     q0 = vcmp.eq(v5.w,v2.w)
; CHECK-NEXT:     q1 = vcmp.gt(v6.uw,v4.uw)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v4.uw = vlsr(v4.uw,r5)
; CHECK-NEXT:     v5 = vmux(q0,v2,v3)
; CHECK-NEXT:     v3 = vmux(q1,v3,v2)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v1.w = vsub(v3.w,v1.w)
; CHECK-NEXT:     v30.w = vadd(v4.w,v5.w)
; CHECK-NEXT:     q1 = vcmp.eq(v6.w,v4.w)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v31.uw = vlsr(v4.uw,r2)
; CHECK-NEXT:     v1.w = vadd(v1.w,v7.w)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     r2 = #64
; CHECK-NEXT:     v3.uw = vlsr(v30.uw,r2)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v1.w = vasl(v1.w,r3)
; CHECK-NEXT:     q3 = vsetq(r2)
; CHECK-NEXT:     v3 = vmux(q1,v3,v31)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v1.qf32 = vadd(v2.sf,v2.sf)
; CHECK-NEXT:     v0 = vor(v3,v1)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v0 = vmux(q2,v2,v0)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v0.qf32 = vadd(v0.sf,v2.sf)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v0.hf = v1:0.qf32
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v0.h = vdeal(v0.h)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     jumpr r31
; CHECK-NEXT:     if (q3) vmem(r1+#0) = v0
; CHECK-NEXT:    }
  %v0 = load <32 x i32>, ptr %a0, align 128
  %v1 = uitofp <32 x i32> %v0 to <32 x half>
  store <32 x half> %v1, ptr %a1, align 128
  ret void
}


; u32 -> f32
; No widening
define void @u32f32_0(ptr %a0, ptr %a1) #0 {
; CHECK-LABEL: u32f32_0:
; CHECK:         .cfi_startproc
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    {
; CHECK-NEXT:     r3:2 = combine(##512,#1)
; CHECK-NEXT:     v1.uw = vcl0(v0.uw)
; CHECK-NEXT:     v0.cur = vmem(r0+#0)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v2 = vsplat(r2)
; CHECK-NEXT:     v5 = vsplat(r3)
; CHECK-NEXT:     r6 = #255
; CHECK-NEXT:     v3 = vxor(v3,v3)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v4 = vsplat(r6)
; CHECK-NEXT:     r5 = #8
; CHECK-NEXT:     r4 = #159
; CHECK-NEXT:     v1.w = vadd(v1.w,v2.w)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v7 = vsplat(r4)
; CHECK-NEXT:     r3 = #23
; CHECK-NEXT:     q3 = vcmp.eq(v0.w,v3.w)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v6.w = vasl(v0.w,v1.w)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v4.w = vadd(v6.w,v4.w)
; CHECK-NEXT:     v5 = vand(v6,v5)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v6.uw = vlsr(v6.uw,r5)
; CHECK-NEXT:     q0 = vcmp.eq(v5.w,v3.w)
; CHECK-NEXT:     q1 = vcmp.gt(v6.uw,v4.uw)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v4.uw = vlsr(v4.uw,r5)
; CHECK-NEXT:     v5 = vmux(q0,v3,v2)
; CHECK-NEXT:     v2 = vmux(q1,v2,v3)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v1.w = vsub(v2.w,v1.w)
; CHECK-NEXT:     v29.w = vadd(v4.w,v5.w)
; CHECK-NEXT:     q2 = vcmp.eq(v6.w,v4.w)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v30.uw = vlsr(v4.uw,r2)
; CHECK-NEXT:     v1.w = vadd(v1.w,v7.w)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v2.uw = vlsr(v29.uw,r2)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v1.w = vasl(v1.w,r3)
; CHECK-NEXT:     v2 = vmux(q2,v2,v30)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v31 = vor(v2,v1)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v0 = vmux(q3,v3,v31)
; CHECK-NEXT:     jumpr r31
; CHECK-NEXT:     vmem(r1+#0) = v0.new
; CHECK-NEXT:    }
  %v0 = load <32 x i32>, ptr %a0, align 128
  %v1 = uitofp <32 x i32> %v0 to <32 x float>
  store <32 x float> %v1, ptr %a1, align 128
  ret void
}

; Widen input and result
define void @u32f32_1(ptr %a0, ptr %a1) #0 {
; CHECK-LABEL: u32f32_1:
; CHECK:         .cfi_startproc
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    {
; CHECK-NEXT:     r3:2 = combine(##512,#1)
; CHECK-NEXT:     v1.uw = vcl0(v0.uw)
; CHECK-NEXT:     v0.cur = vmem(r0+#0)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v2 = vsplat(r2)
; CHECK-NEXT:     v5 = vsplat(r3)
; CHECK-NEXT:     r6 = #255
; CHECK-NEXT:     v3 = vxor(v3,v3)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v4 = vsplat(r6)
; CHECK-NEXT:     r5 = #8
; CHECK-NEXT:     r4 = #159
; CHECK-NEXT:     v1.w = vadd(v1.w,v2.w)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v7 = vsplat(r4)
; CHECK-NEXT:     r3 = #23
; CHECK-NEXT:     q2 = vcmp.eq(v0.w,v3.w)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v6.w = vasl(v0.w,v1.w)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v4.w = vadd(v6.w,v4.w)
; CHECK-NEXT:     v5 = vand(v6,v5)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v6.uw = vlsr(v6.uw,r5)
; CHECK-NEXT:     q0 = vcmp.eq(v5.w,v3.w)
; CHECK-NEXT:     q1 = vcmp.gt(v6.uw,v4.uw)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v4.uw = vlsr(v4.uw,r5)
; CHECK-NEXT:     v5 = vmux(q0,v3,v2)
; CHECK-NEXT:     v2 = vmux(q1,v2,v3)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v1.w = vsub(v2.w,v1.w)
; CHECK-NEXT:     v29.w = vadd(v4.w,v5.w)
; CHECK-NEXT:     q1 = vcmp.eq(v6.w,v4.w)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v30.uw = vlsr(v4.uw,r2)
; CHECK-NEXT:     v1.w = vadd(v1.w,v7.w)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     r2 = #64
; CHECK-NEXT:     v2.uw = vlsr(v29.uw,r2)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v1.w = vasl(v1.w,r3)
; CHECK-NEXT:     q3 = vsetq(r2)
; CHECK-NEXT:     v2 = vmux(q1,v2,v30)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v31 = vor(v2,v1)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     v0 = vmux(q2,v3,v31)
; CHECK-NEXT:    }
; CHECK-NEXT:    {
; CHECK-NEXT:     jumpr r31
; CHECK-NEXT:     if (q3) vmem(r1+#0) = v0
; CHECK-NEXT:    }
  %v0 = load <16 x i32>, ptr %a0, align 128
  %v1 = uitofp <16 x i32> %v0 to <16 x float>
  store <16 x float> %v1, ptr %a1, align 128
  ret void
}


attributes #0 = { "target-features"="+v68,+hvxv68,+hvx-length128b,+hvx-qfloat" }