; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=hexagon -hexagon-hvx-widen=32 < %s -verify-machineinstrs | FileCheck %s
target datalayout = "e-m:e-p:32:32:32-a:0-n16:32-i64:64:64-i32:32:32-i16:16:16-i1:8:8-f32:32:32-f64:64:64-v32:32:32-v64:64:64-v512:512:512-v1024:1024:1024-v2048:2048:2048"
target triple = "hexagon"
; s8 -> f16
; No widening: sitofp <128 x i8> -> <128 x half>.
; Loads the source vector from %a0, converts every signed 8-bit lane to half
; precision and stores the result to %a1.
; In the expected assembly the bytes are first unpacked into a halfword
; register pair (vunpack), both halves are converted, and the result is written
; with two vector stores: vmem(r1+#0) and vmem(r1+#1).
; NOTE(review): "no widening" presumably means both types already fill whole
; HVX vectors; the HVX mode comes from attribute #0, which is outside this
; view - confirm.
; The assertions are autogenerated (see the note on the first line of the
; file); regenerate them with the script rather than editing them by hand.
define void @s8f16_0(ptr %a0, ptr %a1) #0 {
; CHECK-LABEL: s8f16_0:
; CHECK: .cfi_startproc
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: {
; CHECK-NEXT: v0 = vmem(r0+#0)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: r7 = #1
; CHECK-NEXT: r6 = #64
; CHECK-NEXT: v1:0.h = vunpack(v0.b)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v2.h = vsplat(r7)
; CHECK-NEXT: r3:2 = combine(#31,#5)
; CHECK-NEXT: v3.h = vabs(v0.h)
; CHECK-NEXT: v4.h = vabs(v1.h)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v8.h = vsplat(r6)
; CHECK-NEXT: v7.h = vsplat(r3)
; CHECK-NEXT: v9 = vxor(v9,v9)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: r5 = ##32768
; CHECK-NEXT: v5.uh = vcl0(v3.uh)
; CHECK-NEXT: q0 = vcmp.gt(v9.h,v0.h)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v10.h = vsplat(r5)
; CHECK-NEXT: r4 = #10
; CHECK-NEXT: v6.uh = vcl0(v4.uh)
; CHECK-NEXT: v5.h = vadd(v5.h,v2.h)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v27 = vmux(q0,v10,v9)
; CHECK-NEXT: v6.h = vadd(v6.h,v2.h)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v3.h = vasl(v3.h,v5.h)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v4.h = vasl(v4.h,v6.h)
; CHECK-NEXT: v13 = vand(v3,v8)
; CHECK-NEXT: v11.h = vadd(v3.h,v7.h)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v14.h = vadd(v4.h,v7.h)
; CHECK-NEXT: q2 = vcmp.eq(v13.h,v9.h)
; CHECK-NEXT: v8 = vand(v4,v8)
; CHECK-NEXT: q1 = vcmp.gt(v3.uh,v11.uh)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v11.uh = vlsr(v11.uh,r2)
; CHECK-NEXT: v13 = vmux(q2,v9,v2)
; CHECK-NEXT: q2 = vcmp.eq(v8.h,v9.h)
; CHECK-NEXT: q3 = vcmp.gt(v4.uh,v14.uh)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v20.uh = vlsr(v14.uh,r2)
; CHECK-NEXT: v22 = vmux(q2,v9,v2)
; CHECK-NEXT: v21 = vmux(q1,v2,v9)
; CHECK-NEXT: v2 = vmux(q3,v2,v9)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v19.uh = vlsr(v4.uh,r2)
; CHECK-NEXT: v13.h = vadd(v11.h,v13.h)
; CHECK-NEXT: v24.h = vadd(v20.h,v22.h)
; CHECK-NEXT: v2.h = vadd(v2.h,v7.h)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v12.uh = vlsr(v3.uh,r2)
; CHECK-NEXT: v23.h = vadd(v21.h,v7.h)
; CHECK-NEXT: v2.h = vsub(v2.h,v6.h)
; CHECK-NEXT: q3 = vcmp.gt(v9.h,v1.h)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v11.uh = vlsr(v11.uh,r7)
; CHECK-NEXT: v3.h = vsub(v23.h,v5.h)
; CHECK-NEXT: q2 = vcmp.eq(v12.h,v11.h)
; CHECK-NEXT: q1 = vcmp.eq(v19.h,v20.h)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v25.uh = vlsr(v13.uh,r7)
; CHECK-NEXT: v28 = vmux(q3,v10,v9)
; CHECK-NEXT: q3 = vcmp.eq(v0.h,v9.h)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v26.uh = vlsr(v24.uh,r7)
; CHECK-NEXT: v5 = vmux(q2,v25,v11)
; CHECK-NEXT: q2 = vcmp.eq(v1.h,v9.h)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v4.uh = vlsr(v20.uh,r7)
; CHECK-NEXT: v5 = vor(v27,v5)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v3.h = vasl(v3.h,r4)
; CHECK-NEXT: v4 = vmux(q1,v26,v4)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v2.h = vasl(v2.h,r4)
; CHECK-NEXT: v4 = vor(v28,v4)
; CHECK-NEXT: v29 = vor(v5,v3)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v2 = vor(v4,v2)
; CHECK-NEXT: v31 = vmux(q3,v9,v29)
; CHECK-NEXT: vmem(r1+#0) = v31.new
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v30 = vmux(q2,v9,v2)
; CHECK-NEXT: jumpr r31
; CHECK-NEXT: vmem(r1+#1) = v30.new
; CHECK-NEXT: }
; IR under test: aligned vector load, signed int -> FP conversion, aligned store.
%v0 = load <128 x i8>, ptr %a0, align 128
%v1 = sitofp <128 x i8> %v0 to <128 x half>
store <128 x half> %v1, ptr %a1, align 128
ret void
}
; Widen input: sitofp <64 x i8> -> <64 x half>.
; Loads 64 signed bytes from %a0, converts each lane to half precision and
; stores the result to %a1.
; The expected assembly still issues a full vector load (v0 = vmem(r0+#0)),
; unpacks the bytes to halfwords, converts only v0 of the unpacked pair (v1 is
; immediately reused as the zero vector) and finishes with a single
; unpredicated vector store.
; The assertions are autogenerated (see the note on the first line of the
; file); regenerate them with the script rather than editing them by hand.
define void @s8f16_1(ptr %a0, ptr %a1) #0 {
; CHECK-LABEL: s8f16_1:
; CHECK: .cfi_startproc
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: {
; CHECK-NEXT: v0 = vmem(r0+#0)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: r6 = #1
; CHECK-NEXT: r3:2 = combine(#64,#31)
; CHECK-NEXT: v1:0.h = vunpack(v0.b)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v3.h = vsplat(r6)
; CHECK-NEXT: v4.h = vsplat(r2)
; CHECK-NEXT: v2.h = vabs(v0.h)
; CHECK-NEXT: v1 = vxor(v1,v1)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v6.h = vsplat(r3)
; CHECK-NEXT: r5:4 = combine(##32768,#5)
; CHECK-NEXT: r2 = #10
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v8.h = vsplat(r5)
; CHECK-NEXT: v5.uh = vcl0(v2.uh)
; CHECK-NEXT: q3 = vcmp.eq(v0.h,v1.h)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v5.h = vadd(v5.h,v3.h)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v2.h = vasl(v2.h,v5.h)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v7.h = vadd(v2.h,v4.h)
; CHECK-NEXT: v6 = vand(v2,v6)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v2.uh = vlsr(v2.uh,r4)
; CHECK-NEXT: q0 = vcmp.eq(v6.h,v1.h)
; CHECK-NEXT: q1 = vcmp.gt(v2.uh,v7.uh)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v25.uh = vlsr(v7.uh,r4)
; CHECK-NEXT: v26 = vmux(q0,v1,v3)
; CHECK-NEXT: v3 = vmux(q1,v3,v1)
; CHECK-NEXT: q1 = vcmp.gt(v1.h,v0.h)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v7.h = vadd(v25.h,v26.h)
; CHECK-NEXT: v3.h = vadd(v3.h,v4.h)
; CHECK-NEXT: q2 = vcmp.eq(v2.h,v25.h)
; CHECK-NEXT: v30 = vmux(q1,v8,v1)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v27.uh = vlsr(v25.uh,r6)
; CHECK-NEXT: v28.h = vsub(v3.h,v5.h)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v29.uh = vlsr(v7.uh,r6)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v2.h = vasl(v28.h,r2)
; CHECK-NEXT: v3 = vmux(q2,v29,v27)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v3 = vor(v30,v3)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v31 = vor(v3,v2)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v0 = vmux(q3,v1,v31)
; CHECK-NEXT: jumpr r31
; CHECK-NEXT: vmem(r1+#0) = v0.new
; CHECK-NEXT: }
; IR under test: aligned vector load, signed int -> FP conversion, aligned store.
%v0 = load <64 x i8>, ptr %a0, align 128
%v1 = sitofp <64 x i8> %v0 to <64 x half>
store <64 x half> %v1, ptr %a1, align 128
ret void
}
; s8 -> f32
; No widening: sitofp <128 x i8> -> <128 x float>.
; Loads 128 signed bytes from %a0, converts each lane to single precision and
; stores the result to %a1.
; In the expected assembly the loaded vector is rotated by 64 bytes (valign)
; so that both halves can be unpacked byte -> halfword -> word, yielding four
; word vectors. Each is converted separately and the result is written with
; four vector stores, vmem(r1+#0) through vmem(r1+#3).
; The assertions are autogenerated (see the note on the first line of the
; file); regenerate them with the script rather than editing them by hand.
define void @s8f32_0(ptr %a0, ptr %a1) #0 {
; CHECK-LABEL: s8f32_0:
; CHECK: .cfi_startproc
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: {
; CHECK-NEXT: r7 = #64
; CHECK-NEXT: r0 = #1
; CHECK-NEXT: v0 = vmem(r0+#0)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v2 = vsplat(r0)
; CHECK-NEXT: r3:2 = combine(##255,#8)
; CHECK-NEXT: v1 = valign(v0,v0,r7)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v3 = vsplat(r3)
; CHECK-NEXT: r7 = #512
; CHECK-NEXT: v9:8.h = vunpack(v0.b)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v4 = vsplat(r7)
; CHECK-NEXT: r6 = ##-2147483648
; CHECK-NEXT: r5 = #159
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: r4 = #23
; CHECK-NEXT: v7:6.h = vunpack(v1.b)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v8 = vsplat(r6)
; CHECK-NEXT: v1:0.w = vunpack(v8.h)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v7:6.w = vunpack(v6.h)
; CHECK-NEXT: v5.w = vabs(v0.w)
; CHECK-NEXT: v10.w = vabs(v1.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v26.w = vabs(v6.w)
; CHECK-NEXT: v13.w = vabs(v7.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v9.uw = vcl0(v5.uw)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v12.uw = vcl0(v26.uw)
; CHECK-NEXT: v9.w = vadd(v9.w,v2.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v14.uw = vcl0(v13.uw)
; CHECK-NEXT: v15.w = vadd(v12.w,v2.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v11.uw = vcl0(v10.uw)
; CHECK-NEXT: v12.w = vadd(v14.w,v2.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v27.w = vasl(v26.w,v15.w)
; CHECK-NEXT: v11.w = vadd(v11.w,v2.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v13.w = vasl(v13.w,v12.w)
; CHECK-NEXT: v20 = vand(v27,v4)
; CHECK-NEXT: v19.w = vadd(v27.w,v3.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v16.w = vasl(v5.w,v9.w)
; CHECK-NEXT: v5 = vxor(v5,v5)
; CHECK-NEXT: v23.w = vadd(v13.w,v3.w)
; CHECK-NEXT: v28 = vand(v13,v4)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v17.w = vasl(v10.w,v11.w)
; CHECK-NEXT: q3 = vcmp.eq(v20.w,v5.w)
; CHECK-NEXT: q2 = vcmp.gt(v27.uw,v19.uw)
; CHECK-NEXT: q0 = vcmp.gt(v5.w,v6.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v21.uw = vlsr(v27.uw,r2)
; CHECK-NEXT: v30 = vmux(q3,v5,v2)
; CHECK-NEXT: q3 = vcmp.eq(v28.w,v5.w)
; CHECK-NEXT: v22 = vand(v17,v4)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v14.uw = vlsr(v19.uw,r2)
; CHECK-NEXT: v27 = vmux(q3,v5,v2)
; CHECK-NEXT: q1 = vcmp.eq(v22.w,v5.w)
; CHECK-NEXT: v24 = vmux(q2,v2,v5)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v31.uw = vlsr(v23.uw,r2)
; CHECK-NEXT: v22.w = vadd(v14.w,v30.w)
; CHECK-NEXT: v30.w = vadd(v17.w,v3.w)
; CHECK-NEXT: q2 = vcmp.eq(v21.w,v14.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v29.uw = vlsr(v13.uw,r2)
; CHECK-NEXT: v28.w = vadd(v31.w,v27.w)
; CHECK-NEXT: v3.w = vadd(v16.w,v3.w)
; CHECK-NEXT: v4 = vand(v16,v4)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v14.uw = vlsr(v14.uw,r0)
; CHECK-NEXT: q3 = vcmp.eq(v29.w,v31.w)
; CHECK-NEXT: v18 = vmux(q0,v8,v5)
; CHECK-NEXT: q0 = vcmp.gt(v5.w,v7.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v19.uw = vlsr(v31.uw,r0)
; CHECK-NEXT: v26 = vmux(q1,v5,v2)
; CHECK-NEXT: v31 = vmux(q0,v8,v5)
; CHECK-NEXT: q0 = vcmp.gt(v16.uw,v3.uw)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v10 = vsplat(r5)
; CHECK-NEXT: v29.uw = vlsr(v22.uw,r0)
; CHECK-NEXT: v15.w = vsub(v24.w,v15.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v20.uw = vlsr(v28.uw,r0)
; CHECK-NEXT: v14 = vmux(q2,v29,v14)
; CHECK-NEXT: q2 = vcmp.gt(v13.uw,v23.uw)
; CHECK-NEXT: v15.w = vadd(v15.w,v10.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v25.uw = vlsr(v30.uw,r2)
; CHECK-NEXT: v19 = vmux(q3,v20,v19)
; CHECK-NEXT: q3 = vcmp.eq(v4.w,v5.w)
; CHECK-NEXT: v27 = vmux(q2,v2,v5)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v3.uw = vlsr(v3.uw,r2)
; CHECK-NEXT: q2 = vcmp.gt(v17.uw,v30.uw)
; CHECK-NEXT: v28.w = vadd(v25.w,v26.w)
; CHECK-NEXT: v29 = vmux(q3,v5,v2)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v17.uw = vlsr(v17.uw,r2)
; CHECK-NEXT: v19 = vor(v31,v19)
; CHECK-NEXT: v31 = vmux(q2,v2,v5)
; CHECK-NEXT: v2 = vmux(q0,v2,v5)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v24.uw = vlsr(v16.uw,r2)
; CHECK-NEXT: v30.w = vadd(v3.w,v29.w)
; CHECK-NEXT: v2.w = vsub(v2.w,v9.w)
; CHECK-NEXT: v11.w = vsub(v31.w,v11.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v16.uw = vlsr(v28.uw,r0)
; CHECK-NEXT: q3 = vcmp.eq(v17.w,v25.w)
; CHECK-NEXT: v4.w = vsub(v27.w,v12.w)
; CHECK-NEXT: v2.w = vadd(v2.w,v10.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v13.uw = vlsr(v25.uw,r0)
; CHECK-NEXT: q0 = vcmp.eq(v24.w,v3.w)
; CHECK-NEXT: v21.w = vadd(v11.w,v10.w)
; CHECK-NEXT: q2 = vcmp.gt(v5.w,v1.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v22.uw = vlsr(v30.uw,r0)
; CHECK-NEXT: v23 = vmux(q3,v16,v13)
; CHECK-NEXT: q3 = vcmp.gt(v5.w,v0.w)
; CHECK-NEXT: v24 = vmux(q2,v8,v5)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v3.uw = vlsr(v3.uw,r0)
; CHECK-NEXT: v4.w = vadd(v4.w,v10.w)
; CHECK-NEXT: v8 = vmux(q3,v8,v5)
; CHECK-NEXT: v10 = vor(v24,v23)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v9.w = vasl(v21.w,r4)
; CHECK-NEXT: v3 = vmux(q0,v22,v3)
; CHECK-NEXT: v14 = vor(v18,v14)
; CHECK-NEXT: q2 = vcmp.eq(v1.w,v5.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v2.w = vasl(v2.w,r4)
; CHECK-NEXT: v3 = vor(v8,v3)
; CHECK-NEXT: v25 = vor(v10,v9)
; CHECK-NEXT: q3 = vcmp.eq(v0.w,v5.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v15.w = vasl(v15.w,r4)
; CHECK-NEXT: v2 = vor(v3,v2)
; CHECK-NEXT: v27 = vmux(q2,v5,v25)
; CHECK-NEXT: vmem(r1+#1) = v27.new
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v26.w = vasl(v4.w,r4)
; CHECK-NEXT: v29 = vmux(q3,v5,v2)
; CHECK-NEXT: q2 = vcmp.eq(v7.w,v5.w)
; CHECK-NEXT: vmem(r1+#0) = v29.new
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v28 = vor(v19,v26)
; CHECK-NEXT: v30 = vor(v14,v15)
; CHECK-NEXT: q3 = vcmp.eq(v6.w,v5.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v0 = vmux(q2,v5,v28)
; CHECK-NEXT: v31 = vmux(q3,v5,v30)
; CHECK-NEXT: vmem(r1+#3) = v0.new
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: jumpr r31
; CHECK-NEXT: vmem(r1+#2) = v31
; CHECK-NEXT: }
; IR under test: aligned vector load, signed int -> FP conversion, aligned store.
%v0 = load <128 x i8>, ptr %a0, align 128
%v1 = sitofp <128 x i8> %v0 to <128 x float>
store <128 x float> %v1, ptr %a1, align 128
ret void
}
; Widen input #1: sitofp <64 x i8> -> <64 x float>.
; Loads 64 signed bytes from %a0, converts each lane to single precision and
; stores the result to %a1.
; In the expected assembly a full vector is loaded, unpacked byte -> halfword
; -> word into the v3:2 pair, both word vectors are converted, and the result
; is written with two vector stores: vmem(r1+#0) and vmem(r1+#1).
; The assertions are autogenerated (see the note on the first line of the
; file); regenerate them with the script rather than editing them by hand.
define void @s8f32_1(ptr %a0, ptr %a1) #0 {
; CHECK-LABEL: s8f32_1:
; CHECK: .cfi_startproc
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: {
; CHECK-NEXT: r0 = #1
; CHECK-NEXT: v3:2.h = vunpack(v0.b)
; CHECK-NEXT: v0.cur = vmem(r0+#0)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v1 = vsplat(r0)
; CHECK-NEXT: r3:2 = combine(##255,#8)
; CHECK-NEXT: r6 = #512
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v7 = vsplat(r3)
; CHECK-NEXT: v3:2.w = vunpack(v2.h)
; CHECK-NEXT: v22 = vxor(v22,v22)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v10 = vsplat(r6)
; CHECK-NEXT: r7 = ##-2147483648
; CHECK-NEXT: r5 = #159
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v9 = vsplat(r7)
; CHECK-NEXT: v4.w = vabs(v2.w)
; CHECK-NEXT: v5.w = vabs(v3.w)
; CHECK-NEXT: q0 = vcmp.gt(v22.w,v2.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v12 = vsplat(r5)
; CHECK-NEXT: r4 = #23
; CHECK-NEXT: v11 = vmux(q0,v9,v22)
; CHECK-NEXT: q0 = vcmp.gt(v22.w,v3.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v6.uw = vcl0(v4.uw)
; CHECK-NEXT: v30 = vmux(q0,v9,v22)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v8.uw = vcl0(v5.uw)
; CHECK-NEXT: v6.w = vadd(v6.w,v1.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v8.w = vadd(v8.w,v1.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v4.w = vasl(v4.w,v6.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v5.w = vasl(v5.w,v8.w)
; CHECK-NEXT: v13 = vand(v4,v10)
; CHECK-NEXT: v14.w = vadd(v4.w,v7.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v10 = vand(v5,v10)
; CHECK-NEXT: v7.w = vadd(v5.w,v7.w)
; CHECK-NEXT: q2 = vcmp.gt(v4.uw,v14.uw)
; CHECK-NEXT: q1 = vcmp.eq(v13.w,v22.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v14.uw = vlsr(v14.uw,r2)
; CHECK-NEXT: q3 = vcmp.eq(v10.w,v22.w)
; CHECK-NEXT: v25 = vmux(q2,v1,v22)
; CHECK-NEXT: q2 = vcmp.gt(v5.uw,v7.uw)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v7.uw = vlsr(v7.uw,r2)
; CHECK-NEXT: v26 = vmux(q1,v22,v1)
; CHECK-NEXT: v27 = vmux(q3,v22,v1)
; CHECK-NEXT: v1 = vmux(q2,v1,v22)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v24.uw = vlsr(v5.uw,r2)
; CHECK-NEXT: v5.w = vadd(v14.w,v26.w)
; CHECK-NEXT: v29.w = vadd(v7.w,v27.w)
; CHECK-NEXT: v6.w = vsub(v25.w,v6.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v23.uw = vlsr(v4.uw,r2)
; CHECK-NEXT: v1.w = vsub(v1.w,v8.w)
; CHECK-NEXT: v6.w = vadd(v6.w,v12.w)
; CHECK-NEXT: q3 = vcmp.eq(v24.w,v7.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v28.uw = vlsr(v14.uw,r0)
; CHECK-NEXT: v1.w = vadd(v1.w,v12.w)
; CHECK-NEXT: q1 = vcmp.eq(v23.w,v14.w)
; CHECK-NEXT: q2 = vcmp.eq(v3.w,v22.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v5.uw = vlsr(v5.uw,r0)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v7.uw = vlsr(v7.uw,r0)
; CHECK-NEXT: v5 = vmux(q1,v5,v28)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v4.uw = vlsr(v29.uw,r0)
; CHECK-NEXT: v5 = vor(v11,v5)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v6.w = vasl(v6.w,r4)
; CHECK-NEXT: v4 = vmux(q3,v4,v7)
; CHECK-NEXT: q3 = vcmp.eq(v2.w,v22.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v1.w = vasl(v1.w,r4)
; CHECK-NEXT: v4 = vor(v30,v4)
; CHECK-NEXT: v31 = vor(v5,v6)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v1 = vor(v4,v1)
; CHECK-NEXT: v0 = vmux(q3,v22,v31)
; CHECK-NEXT: vmem(r1+#0) = v0.new
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v1 = vmux(q2,v22,v1)
; CHECK-NEXT: jumpr r31
; CHECK-NEXT: vmem(r1+#1) = v1.new
; CHECK-NEXT: }
; IR under test: aligned vector load, signed int -> FP conversion, aligned store.
%v0 = load <64 x i8>, ptr %a0, align 128
%v1 = sitofp <64 x i8> %v0 to <64 x float>
store <64 x float> %v1, ptr %a1, align 128
ret void
}
; Widen input #2: sitofp <32 x i8> -> <32 x float>.
; Loads 32 signed bytes from %a0, converts each lane to single precision and
; stores the result to %a1.
; In the expected assembly a full vector is loaded and unpacked twice
; (byte -> halfword -> word). Only v0 of each unpacked pair is consumed (v1 is
; overwritten by a splat), and the single converted word vector is written
; with one unpredicated vector store.
; The assertions are autogenerated (see the note on the first line of the
; file); regenerate them with the script rather than editing them by hand.
define void @s8f32_2(ptr %a0, ptr %a1) #0 {
; CHECK-LABEL: s8f32_2:
; CHECK: .cfi_startproc
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: {
; CHECK-NEXT: v0 = vmem(r0+#0)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: r0 = #1
; CHECK-NEXT: r3 = #512
; CHECK-NEXT: v1:0.h = vunpack(v0.b)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v2 = vsplat(r0)
; CHECK-NEXT: v4 = vsplat(r3)
; CHECK-NEXT: r2 = #255
; CHECK-NEXT: v3 = vxor(v3,v3)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: r7:6 = combine(##-2147483648,#8)
; CHECK-NEXT: r4 = #159
; CHECK-NEXT: v1:0.w = vunpack(v0.h)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v1 = vsplat(r2)
; CHECK-NEXT: v8 = vsplat(r4)
; CHECK-NEXT: v5.w = vabs(v0.w)
; CHECK-NEXT: q2 = vcmp.gt(v3.w,v0.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v7 = vsplat(r7)
; CHECK-NEXT: r2 = #23
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v6.uw = vcl0(v5.uw)
; CHECK-NEXT: v30 = vmux(q2,v7,v3)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v6.w = vadd(v6.w,v2.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v5.w = vasl(v5.w,v6.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v1.w = vadd(v5.w,v1.w)
; CHECK-NEXT: v4 = vand(v5,v4)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v5.uw = vlsr(v5.uw,r6)
; CHECK-NEXT: q0 = vcmp.eq(v4.w,v3.w)
; CHECK-NEXT: q1 = vcmp.gt(v5.uw,v1.uw)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v1.uw = vlsr(v1.uw,r6)
; CHECK-NEXT: v4 = vmux(q0,v3,v2)
; CHECK-NEXT: v2 = vmux(q1,v2,v3)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v4.w = vadd(v1.w,v4.w)
; CHECK-NEXT: v2.w = vsub(v2.w,v6.w)
; CHECK-NEXT: q3 = vcmp.eq(v5.w,v1.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v28.uw = vlsr(v1.uw,r0)
; CHECK-NEXT: v2.w = vadd(v2.w,v8.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v29.uw = vlsr(v4.uw,r0)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v2.w = vasl(v2.w,r2)
; CHECK-NEXT: v1 = vmux(q3,v29,v28)
; CHECK-NEXT: q3 = vcmp.eq(v0.w,v3.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v1 = vor(v30,v1)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v31 = vor(v1,v2)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v0 = vmux(q3,v3,v31)
; CHECK-NEXT: jumpr r31
; CHECK-NEXT: vmem(r1+#0) = v0.new
; CHECK-NEXT: }
; IR under test: aligned vector load, signed int -> FP conversion, aligned store.
%v0 = load <32 x i8>, ptr %a0, align 128
%v1 = sitofp <32 x i8> %v0 to <32 x float>
store <32 x float> %v1, ptr %a1, align 128
ret void
}
; s16 -> f16
; No widening: sitofp <64 x i16> -> <64 x half>.
; Loads 64 signed halfwords from %a0, converts each lane to half precision and
; stores the result to %a1.
; Source and result lanes have the same width, so the expected assembly needs
; no unpack step: one vector load, the halfword conversion sequence, and one
; unpredicated vector store.
; The assertions are autogenerated (see the note on the first line of the
; file); regenerate them with the script rather than editing them by hand.
define void @s16f16_0(ptr %a0, ptr %a1) #0 {
; CHECK-LABEL: s16f16_0:
; CHECK: .cfi_startproc
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: {
; CHECK-NEXT: r6 = #1
; CHECK-NEXT: r3:2 = combine(#64,#31)
; CHECK-NEXT: v1.h = vabs(v0.h)
; CHECK-NEXT: v0.cur = vmem(r0+#0)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v3.h = vsplat(r6)
; CHECK-NEXT: v5.h = vsplat(r2)
; CHECK-NEXT: v2 = vxor(v2,v2)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v6.h = vsplat(r3)
; CHECK-NEXT: r5:4 = combine(##32768,#5)
; CHECK-NEXT: v4.uh = vcl0(v1.uh)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v8.h = vsplat(r5)
; CHECK-NEXT: r2 = #10
; CHECK-NEXT: v4.h = vadd(v4.h,v3.h)
; CHECK-NEXT: q3 = vcmp.eq(v0.h,v2.h)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v1.h = vasl(v1.h,v4.h)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v7.h = vadd(v1.h,v5.h)
; CHECK-NEXT: v6 = vand(v1,v6)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v1.uh = vlsr(v1.uh,r4)
; CHECK-NEXT: q0 = vcmp.eq(v6.h,v2.h)
; CHECK-NEXT: q1 = vcmp.gt(v1.uh,v7.uh)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v25.uh = vlsr(v7.uh,r4)
; CHECK-NEXT: v26 = vmux(q0,v2,v3)
; CHECK-NEXT: v3 = vmux(q1,v3,v2)
; CHECK-NEXT: q1 = vcmp.gt(v2.h,v0.h)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v7.h = vadd(v25.h,v26.h)
; CHECK-NEXT: v3.h = vadd(v3.h,v5.h)
; CHECK-NEXT: q2 = vcmp.eq(v1.h,v25.h)
; CHECK-NEXT: v30 = vmux(q1,v8,v2)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v27.uh = vlsr(v25.uh,r6)
; CHECK-NEXT: v28.h = vsub(v3.h,v4.h)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v29.uh = vlsr(v7.uh,r6)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v1.h = vasl(v28.h,r2)
; CHECK-NEXT: v3 = vmux(q2,v29,v27)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v3 = vor(v30,v3)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v31 = vor(v3,v1)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v0 = vmux(q3,v2,v31)
; CHECK-NEXT: jumpr r31
; CHECK-NEXT: vmem(r1+#0) = v0.new
; CHECK-NEXT: }
; IR under test: aligned vector load, signed int -> FP conversion, aligned store.
%v0 = load <64 x i16>, ptr %a0, align 128
%v1 = sitofp <64 x i16> %v0 to <64 x half>
store <64 x half> %v1, ptr %a1, align 128
ret void
}
; Widen input and result: sitofp <32 x i16> -> <32 x half>.
; Loads 32 signed halfwords from %a0, converts each lane to half precision and
; stores the result to %a1.
; The expected assembly runs the conversion on a full vector register and then
; writes the result with a predicated store: q3 is built by vsetq from r7
; (which holds 64) and gates "if (q3) vmem(r1+#0) = v0". This matches the
; 64-byte size of a <32 x half> result.
; The assertions are autogenerated (see the note on the first line of the
; file); regenerate them with the script rather than editing them by hand.
define void @s16f16_1(ptr %a0, ptr %a1) #0 {
; CHECK-LABEL: s16f16_1:
; CHECK: .cfi_startproc
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: {
; CHECK-NEXT: r3:2 = combine(#31,#1)
; CHECK-NEXT: r7 = #64
; CHECK-NEXT: v1.h = vabs(v0.h)
; CHECK-NEXT: v0.cur = vmem(r0+#0)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v2.h = vsplat(r2)
; CHECK-NEXT: v5.h = vsplat(r3)
; CHECK-NEXT: r6 = #5
; CHECK-NEXT: v3 = vxor(v3,v3)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v6.h = vsplat(r7)
; CHECK-NEXT: r4 = ##32768
; CHECK-NEXT: v4.uh = vcl0(v1.uh)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v8.h = vsplat(r4)
; CHECK-NEXT: r4 = #10
; CHECK-NEXT: q2 = vcmp.gt(v3.h,v0.h)
; CHECK-NEXT: v4.h = vadd(v4.h,v2.h)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v30 = vmux(q2,v8,v3)
; CHECK-NEXT: q2 = vcmp.eq(v0.h,v3.h)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v1.h = vasl(v1.h,v4.h)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v7.h = vadd(v1.h,v5.h)
; CHECK-NEXT: v6 = vand(v1,v6)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v1.uh = vlsr(v1.uh,r6)
; CHECK-NEXT: q1 = vcmp.eq(v6.h,v3.h)
; CHECK-NEXT: q0 = vcmp.gt(v1.uh,v7.uh)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v25.uh = vlsr(v7.uh,r6)
; CHECK-NEXT: v26 = vmux(q1,v3,v2)
; CHECK-NEXT: v2 = vmux(q0,v2,v3)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v7.h = vadd(v25.h,v26.h)
; CHECK-NEXT: v2.h = vadd(v2.h,v5.h)
; CHECK-NEXT: q3 = vcmp.eq(v1.h,v25.h)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v27.uh = vlsr(v25.uh,r2)
; CHECK-NEXT: v28.h = vsub(v2.h,v4.h)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v29.uh = vlsr(v7.uh,r2)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v1.h = vasl(v28.h,r4)
; CHECK-NEXT: q3 = vsetq(r7)
; CHECK-NEXT: v2 = vmux(q3,v29,v27)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v2 = vor(v30,v2)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v31 = vor(v2,v1)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v0 = vmux(q2,v3,v31)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: jumpr r31
; CHECK-NEXT: if (q3) vmem(r1+#0) = v0
; CHECK-NEXT: }
; IR under test: aligned vector load, signed int -> FP conversion, aligned store.
%v0 = load <32 x i16>, ptr %a0, align 128
%v1 = sitofp <32 x i16> %v0 to <32 x half>
store <32 x half> %v1, ptr %a1, align 128
ret void
}
; s16 -> f32
; No widening: sitofp <64 x i16> -> <64 x float>.
; Loads 64 signed halfwords from %a0, converts each lane to single precision
; and stores the result to %a1.
; In the expected assembly the halfwords are unpacked to words in the v1:0
; pair, both word vectors are converted, and the result is written with two
; vector stores: vmem(r1+#0) and vmem(r1+#1).
; The assertions are autogenerated (see the note on the first line of the
; file); regenerate them with the script rather than editing them by hand.
define void @s16f32_0(ptr %a0, ptr %a1) #0 {
; CHECK-LABEL: s16f32_0:
; CHECK: .cfi_startproc
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: {
; CHECK-NEXT: v0 = vmem(r0+#0)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: r0 = #1
; CHECK-NEXT: r3:2 = combine(##255,#8)
; CHECK-NEXT: v1:0.w = vunpack(v0.h)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v3 = vsplat(r0)
; CHECK-NEXT: r7 = #512
; CHECK-NEXT: v4.w = vabs(v0.w)
; CHECK-NEXT: v6.w = vabs(v1.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v5 = vsplat(r3)
; CHECK-NEXT: v9 = vsplat(r7)
; CHECK-NEXT: r5 = #159
; CHECK-NEXT: v2 = vxor(v2,v2)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v13 = vsplat(r5)
; CHECK-NEXT: r6 = ##-2147483648
; CHECK-NEXT: v7.uw = vcl0(v4.uw)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v10 = vsplat(r6)
; CHECK-NEXT: v8.uw = vcl0(v6.uw)
; CHECK-NEXT: q0 = vcmp.gt(v2.w,v0.w)
; CHECK-NEXT: v7.w = vadd(v7.w,v3.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: r4 = #23
; CHECK-NEXT: v8.w = vadd(v8.w,v3.w)
; CHECK-NEXT: v27 = vmux(q0,v10,v2)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v4.w = vasl(v4.w,v7.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v6.w = vasl(v6.w,v8.w)
; CHECK-NEXT: v11.w = vadd(v4.w,v5.w)
; CHECK-NEXT: v12 = vand(v4,v9)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v5.w = vadd(v6.w,v5.w)
; CHECK-NEXT: v9 = vand(v6,v9)
; CHECK-NEXT: q1 = vcmp.eq(v12.w,v2.w)
; CHECK-NEXT: q2 = vcmp.gt(v4.uw,v11.uw)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v22.uw = vlsr(v11.uw,r2)
; CHECK-NEXT: q3 = vcmp.eq(v9.w,v2.w)
; CHECK-NEXT: v23 = vmux(q1,v2,v3)
; CHECK-NEXT: v14 = vmux(q2,v3,v2)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v24.uw = vlsr(v5.uw,r2)
; CHECK-NEXT: v11.w = vadd(v22.w,v23.w)
; CHECK-NEXT: q2 = vcmp.gt(v6.uw,v5.uw)
; CHECK-NEXT: v25 = vmux(q3,v2,v3)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v21.uw = vlsr(v4.uw,r2)
; CHECK-NEXT: v5.w = vadd(v24.w,v25.w)
; CHECK-NEXT: v3 = vmux(q2,v3,v2)
; CHECK-NEXT: v7.w = vsub(v14.w,v7.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v6.uw = vlsr(v6.uw,r2)
; CHECK-NEXT: v3.w = vsub(v3.w,v8.w)
; CHECK-NEXT: q3 = vcmp.eq(v21.w,v22.w)
; CHECK-NEXT: v7.w = vadd(v7.w,v13.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v4.uw = vlsr(v22.uw,r0)
; CHECK-NEXT: v3.w = vadd(v3.w,v13.w)
; CHECK-NEXT: q2 = vcmp.eq(v6.w,v24.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v11.uw = vlsr(v11.uw,r0)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v5.uw = vlsr(v5.uw,r0)
; CHECK-NEXT: v4 = vmux(q3,v11,v4)
; CHECK-NEXT: q3 = vcmp.gt(v2.w,v1.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v26.uw = vlsr(v24.uw,r0)
; CHECK-NEXT: v28 = vmux(q3,v10,v2)
; CHECK-NEXT: v4 = vor(v27,v4)
; CHECK-NEXT: q3 = vcmp.eq(v0.w,v2.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v7.w = vasl(v7.w,r4)
; CHECK-NEXT: v5 = vmux(q2,v5,v26)
; CHECK-NEXT: q2 = vcmp.eq(v1.w,v2.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v3.w = vasl(v3.w,r4)
; CHECK-NEXT: v5 = vor(v28,v5)
; CHECK-NEXT: v29 = vor(v4,v7)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v3 = vor(v5,v3)
; CHECK-NEXT: v31 = vmux(q3,v2,v29)
; CHECK-NEXT: vmem(r1+#0) = v31.new
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v30 = vmux(q2,v2,v3)
; CHECK-NEXT: jumpr r31
; CHECK-NEXT: vmem(r1+#1) = v30.new
; CHECK-NEXT: }
; IR under test: aligned vector load, signed int -> FP conversion, aligned store.
%v0 = load <64 x i16>, ptr %a0, align 128
%v1 = sitofp <64 x i16> %v0 to <64 x float>
store <64 x float> %v1, ptr %a1, align 128
ret void
}
; Widen input: sitofp <32 x i16> -> <32 x float>.
; Loads 32 signed halfwords from %a0, converts each lane to single precision
; and stores the result to %a1.
; In the expected assembly a full vector is loaded and unpacked halfword ->
; word. Only v0 of the unpacked pair is converted (v1 is reused as the zero
; vector), and the result is written with one unpredicated vector store.
; The assertions are autogenerated (see the note on the first line of the
; file); regenerate them with the script rather than editing them by hand.
define void @s16f32_1(ptr %a0, ptr %a1) #0 {
; CHECK-LABEL: s16f32_1:
; CHECK: .cfi_startproc
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: {
; CHECK-NEXT: v0 = vmem(r0+#0)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: r0 = #1
; CHECK-NEXT: r2 = #255
; CHECK-NEXT: v1:0.w = vunpack(v0.h)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v3 = vsplat(r0)
; CHECK-NEXT: v4 = vsplat(r2)
; CHECK-NEXT: r3 = #512
; CHECK-NEXT: v2.w = vabs(v0.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v6 = vsplat(r3)
; CHECK-NEXT: r7:6 = combine(##-2147483648,#8)
; CHECK-NEXT: v1 = vxor(v1,v1)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: r4 = #159
; CHECK-NEXT: v5.uw = vcl0(v2.uw)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v7 = vsplat(r4)
; CHECK-NEXT: v29 = vsplat(r7)
; CHECK-NEXT: q2 = vcmp.gt(v1.w,v0.w)
; CHECK-NEXT: v5.w = vadd(v5.w,v3.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: r2 = #23
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v2.w = vasl(v2.w,v5.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v4.w = vadd(v2.w,v4.w)
; CHECK-NEXT: v6 = vand(v2,v6)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v2.uw = vlsr(v2.uw,r6)
; CHECK-NEXT: q0 = vcmp.eq(v6.w,v1.w)
; CHECK-NEXT: q1 = vcmp.gt(v2.uw,v4.uw)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v4.uw = vlsr(v4.uw,r6)
; CHECK-NEXT: v6 = vmux(q0,v1,v3)
; CHECK-NEXT: v3 = vmux(q1,v3,v1)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v6.w = vadd(v4.w,v6.w)
; CHECK-NEXT: v27.w = vsub(v3.w,v5.w)
; CHECK-NEXT: q3 = vcmp.eq(v2.w,v4.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v28.uw = vlsr(v4.uw,r0)
; CHECK-NEXT: v2.w = vadd(v27.w,v7.w)
; CHECK-NEXT: v4 = vmux(q2,v29,v1)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v30.uw = vlsr(v6.uw,r0)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v2.w = vasl(v2.w,r2)
; CHECK-NEXT: v3 = vmux(q3,v30,v28)
; CHECK-NEXT: q3 = vcmp.eq(v0.w,v1.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v3 = vor(v4,v3)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v31 = vor(v3,v2)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v0 = vmux(q3,v1,v31)
; CHECK-NEXT: jumpr r31
; CHECK-NEXT: vmem(r1+#0) = v0.new
; CHECK-NEXT: }
; IR under test: aligned vector load, signed int -> FP conversion, aligned store.
%v0 = load <32 x i16>, ptr %a0, align 128
%v1 = sitofp <32 x i16> %v0 to <32 x float>
store <32 x float> %v1, ptr %a1, align 128
ret void
}
; s32 -> f16
; No widening: sitofp <64 x i32> -> <64 x half>.
; Loads 64 signed words from %a0, converts each lane to half precision and
; stores the result to %a1.
; In the expected assembly the source arrives as two vector loads
; (vmem(r0+#0) and vmem(r0+#1)) and each is converted at word width. The two
; results are then narrowed: each is added to the zero vector to form a qf32
; value, the v3:2 pair is converted with "v0.hf = v3:2.qf32", and the
; halfwords are reordered with vdeal before a single vector store.
; The assertions are autogenerated (see the note on the first line of the
; file); regenerate them with the script rather than editing them by hand.
define void @s32f16_0(ptr %a0, ptr %a1) #0 {
; CHECK-LABEL: s32f16_0:
; CHECK: .cfi_startproc
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: {
; CHECK-NEXT: r3:2 = combine(#8,#1)
; CHECK-NEXT: r6 = #255
; CHECK-NEXT: v6.w = vabs(v1.w)
; CHECK-NEXT: v1.cur = vmem(r0+#0)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v2 = vsplat(r2)
; CHECK-NEXT: r4 = #512
; CHECK-NEXT: v5.w = vabs(v0.w)
; CHECK-NEXT: v0.cur = vmem(r0+#1)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v9 = vsplat(r4)
; CHECK-NEXT: v8 = vsplat(r6)
; CHECK-NEXT: v3.uw = vcl0(v6.uw)
; CHECK-NEXT: v20 = vxor(v20,v20)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: r4 = #159
; CHECK-NEXT: v4.uw = vcl0(v5.uw)
; CHECK-NEXT: v3.w = vadd(v3.w,v2.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v27 = vsplat(r4)
; CHECK-NEXT: r5 = ##-2147483648
; CHECK-NEXT: v7.w = vadd(v4.w,v2.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v13 = vsplat(r5)
; CHECK-NEXT: v6.w = vasl(v6.w,v3.w)
; CHECK-NEXT: q0 = vcmp.gt(v20.w,v1.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v5.w = vasl(v5.w,v7.w)
; CHECK-NEXT: v26 = vmux(q0,v13,v20)
; CHECK-NEXT: v10.w = vadd(v6.w,v8.w)
; CHECK-NEXT: v11 = vand(v6,v9)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v9 = vand(v5,v9)
; CHECK-NEXT: q3 = vcmp.eq(v11.w,v20.w)
; CHECK-NEXT: v8.w = vadd(v5.w,v8.w)
; CHECK-NEXT: q1 = vcmp.gt(v6.uw,v10.uw)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v21.uw = vlsr(v10.uw,r3)
; CHECK-NEXT: q2 = vcmp.eq(v9.w,v20.w)
; CHECK-NEXT: v22 = vmux(q3,v20,v2)
; CHECK-NEXT: q3 = vcmp.gt(v5.uw,v8.uw)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v8.uw = vlsr(v8.uw,r3)
; CHECK-NEXT: v9.w = vadd(v21.w,v22.w)
; CHECK-NEXT: v24 = vmux(q2,v20,v2)
; CHECK-NEXT: v23 = vmux(q1,v2,v20)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v12.uw = vlsr(v6.uw,r3)
; CHECK-NEXT: v2 = vmux(q3,v2,v20)
; CHECK-NEXT: v25.w = vadd(v8.w,v24.w)
; CHECK-NEXT: v3.w = vsub(v23.w,v3.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v5.uw = vlsr(v5.uw,r3)
; CHECK-NEXT: v2.w = vsub(v2.w,v7.w)
; CHECK-NEXT: q3 = vcmp.eq(v12.w,v21.w)
; CHECK-NEXT: v3.w = vadd(v3.w,v27.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: r3 = #23
; CHECK-NEXT: v6.uw = vlsr(v21.uw,r2)
; CHECK-NEXT: q2 = vcmp.eq(v5.w,v8.w)
; CHECK-NEXT: v2.w = vadd(v2.w,v27.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v9.uw = vlsr(v9.uw,r2)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v28.uw = vlsr(v25.uw,r2)
; CHECK-NEXT: v6 = vmux(q3,v9,v6)
; CHECK-NEXT: q3 = vcmp.gt(v20.w,v0.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v29.uw = vlsr(v8.uw,r2)
; CHECK-NEXT: v30 = vmux(q3,v13,v20)
; CHECK-NEXT: v6 = vor(v26,v6)
; CHECK-NEXT: q3 = vcmp.eq(v0.w,v20.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v3.w = vasl(v3.w,r3)
; CHECK-NEXT: v5 = vmux(q2,v28,v29)
; CHECK-NEXT: q2 = vcmp.eq(v1.w,v20.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v2.w = vasl(v2.w,r3)
; CHECK-NEXT: v31 = vor(v30,v5)
; CHECK-NEXT: v3 = vor(v6,v3)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v1 = vor(v31,v2)
; CHECK-NEXT: v3 = vmux(q2,v20,v3)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v0 = vmux(q3,v20,v1)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v2.qf32 = vadd(v3.sf,v20.sf)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v3.qf32 = vadd(v0.sf,v20.sf)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v0.hf = v3:2.qf32
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v0.h = vdeal(v0.h)
; CHECK-NEXT: jumpr r31
; CHECK-NEXT: vmem(r1+#0) = v0.new
; CHECK-NEXT: }
; IR under test: aligned vector load, signed int -> FP conversion, aligned store.
%v0 = load <64 x i32>, ptr %a0, align 128
%v1 = sitofp <64 x i32> %v0 to <64 x half>
store <64 x half> %v1, ptr %a1, align 128
ret void
}
; Widen result: sitofp <32 x i32> -> <32 x half>.
; Loads 32 signed words from %a0, converts each lane to half precision and
; stores the result to %a1.
; In the expected assembly the single source vector is converted at word
; width and then narrowed through qf32: the other half of the v1:0 pair is
; filled from the zero vector (v1.qf32 = vadd(v3.sf,v3.sf)) before
; "v0.hf = v1:0.qf32" and vdeal. The result is written with a predicated
; store, where q3 is built by vsetq from r2 (which holds 64). This matches the
; 64-byte size of a <32 x half> result.
; The assertions are autogenerated (see the note on the first line of the
; file); regenerate them with the script rather than editing them by hand.
define void @s32f16_1(ptr %a0, ptr %a1) #0 {
; CHECK-LABEL: s32f16_1:
; CHECK: .cfi_startproc
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: {
; CHECK-NEXT: r6 = #1
; CHECK-NEXT: v1.w = vabs(v0.w)
; CHECK-NEXT: v0.cur = vmem(r0+#0)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v2 = vsplat(r6)
; CHECK-NEXT: r3:2 = combine(##255,#8)
; CHECK-NEXT: r4 = #512
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v5 = vsplat(r3)
; CHECK-NEXT: v6 = vsplat(r4)
; CHECK-NEXT: v4.uw = vcl0(v1.uw)
; CHECK-NEXT: v3 = vxor(v3,v3)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: r5 = #159
; CHECK-NEXT: r4 = ##-2147483648
; CHECK-NEXT: v4.w = vadd(v4.w,v2.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v28 = vsplat(r5)
; CHECK-NEXT: v29 = vsplat(r4)
; CHECK-NEXT: q3 = vcmp.gt(v3.w,v0.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: r3 = #23
; CHECK-NEXT: v1.w = vasl(v1.w,v4.w)
; CHECK-NEXT: v31 = vmux(q3,v29,v3)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v5.w = vadd(v1.w,v5.w)
; CHECK-NEXT: v6 = vand(v1,v6)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v7.uw = vlsr(v1.uw,r2)
; CHECK-NEXT: q0 = vcmp.eq(v6.w,v3.w)
; CHECK-NEXT: q1 = vcmp.gt(v1.uw,v5.uw)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: r2 = #64
; CHECK-NEXT: v1.uw = vlsr(v5.uw,r2)
; CHECK-NEXT: v27 = vmux(q0,v3,v2)
; CHECK-NEXT: v2 = vmux(q1,v2,v3)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: q3 = vsetq(r2)
; CHECK-NEXT: v5.w = vadd(v1.w,v27.w)
; CHECK-NEXT: v2.w = vsub(v2.w,v4.w)
; CHECK-NEXT: q2 = vcmp.eq(v7.w,v1.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v1.uw = vlsr(v1.uw,r6)
; CHECK-NEXT: v2.w = vadd(v2.w,v28.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v30.uw = vlsr(v5.uw,r6)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v2.w = vasl(v2.w,r3)
; CHECK-NEXT: v1 = vmux(q2,v30,v1)
; CHECK-NEXT: q2 = vcmp.eq(v0.w,v3.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v1 = vor(v31,v1)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v1.qf32 = vadd(v3.sf,v3.sf)
; CHECK-NEXT: v0 = vor(v1,v2)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v0 = vmux(q2,v3,v0)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v0.qf32 = vadd(v0.sf,v3.sf)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v0.hf = v1:0.qf32
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v0.h = vdeal(v0.h)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: jumpr r31
; CHECK-NEXT: if (q3) vmem(r1+#0) = v0
; CHECK-NEXT: }
; IR under test: aligned vector load, signed int -> FP conversion, aligned store.
%v0 = load <32 x i32>, ptr %a0, align 128
%v1 = sitofp <32 x i32> %v0 to <32 x half>
store <32 x half> %v1, ptr %a1, align 128
ret void
}
; s32 -> f32
; No widening
; Signed <32 x i32> -> <32 x float>: 128 bytes in and 128 bytes out, so the
; expected code uses one vmem load and one unpredicated vmem store.
define void @s32f32_0(ptr %a0, ptr %a1) #0 {
; CHECK-LABEL: s32f32_0:
; CHECK: .cfi_startproc
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: {
; CHECK-NEXT: r0 = #1
; CHECK-NEXT: r2 = #255
; CHECK-NEXT: v1.w = vabs(v0.w)
; CHECK-NEXT: v0.cur = vmem(r0+#0)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v3 = vsplat(r0)
; CHECK-NEXT: v5 = vsplat(r2)
; CHECK-NEXT: r3 = #512
; CHECK-NEXT: v2 = vxor(v2,v2)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v6 = vsplat(r3)
; CHECK-NEXT: r7:6 = combine(##-2147483648,#8)
; CHECK-NEXT: v4.uw = vcl0(v1.uw)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: r4 = #159
; CHECK-NEXT: v4.w = vadd(v4.w,v3.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v7 = vsplat(r4)
; CHECK-NEXT: v29 = vsplat(r7)
; CHECK-NEXT: r2 = #23
; CHECK-NEXT: q2 = vcmp.gt(v2.w,v0.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v1.w = vasl(v1.w,v4.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v5.w = vadd(v1.w,v5.w)
; CHECK-NEXT: v6 = vand(v1,v6)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v1.uw = vlsr(v1.uw,r6)
; CHECK-NEXT: q0 = vcmp.eq(v6.w,v2.w)
; CHECK-NEXT: q1 = vcmp.gt(v1.uw,v5.uw)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v5.uw = vlsr(v5.uw,r6)
; CHECK-NEXT: v6 = vmux(q0,v2,v3)
; CHECK-NEXT: v3 = vmux(q1,v3,v2)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v6.w = vadd(v5.w,v6.w)
; CHECK-NEXT: v27.w = vsub(v3.w,v4.w)
; CHECK-NEXT: q3 = vcmp.eq(v1.w,v5.w)
; CHECK-NEXT: v4 = vmux(q2,v29,v2)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v28.uw = vlsr(v5.uw,r0)
; CHECK-NEXT: v1.w = vadd(v27.w,v7.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v30.uw = vlsr(v6.uw,r0)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v1.w = vasl(v1.w,r2)
; CHECK-NEXT: v3 = vmux(q3,v30,v28)
; CHECK-NEXT: q3 = vcmp.eq(v0.w,v2.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v3 = vor(v4,v3)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v31 = vor(v3,v1)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v0 = vmux(q3,v2,v31)
; CHECK-NEXT: jumpr r31
; CHECK-NEXT: vmem(r1+#0) = v0.new
; CHECK-NEXT: }
%v0 = load <32 x i32>, ptr %a0, align 128
%v1 = sitofp <32 x i32> %v0 to <32 x float>
store <32 x float> %v1, ptr %a1, align 128
ret void
}
; Widen input and result
; Signed <16 x i32> -> <16 x float>: input and result are 64 bytes each. The
; expected code loads with a plain vmem and stores under q3 = vsetq(r2) with
; r2 = #64.
define void @s32f32_1(ptr %a0, ptr %a1) #0 {
; CHECK-LABEL: s32f32_1:
; CHECK: .cfi_startproc
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: {
; CHECK-NEXT: r0 = #1
; CHECK-NEXT: r2 = #255
; CHECK-NEXT: v1.w = vabs(v0.w)
; CHECK-NEXT: v0.cur = vmem(r0+#0)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v2 = vsplat(r0)
; CHECK-NEXT: v5 = vsplat(r2)
; CHECK-NEXT: r3 = #512
; CHECK-NEXT: v3 = vxor(v3,v3)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v6 = vsplat(r3)
; CHECK-NEXT: r7:6 = combine(##-2147483648,#8)
; CHECK-NEXT: v4.uw = vcl0(v1.uw)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: r4 = #159
; CHECK-NEXT: v4.w = vadd(v4.w,v2.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v7 = vsplat(r4)
; CHECK-NEXT: v29 = vsplat(r7)
; CHECK-NEXT: r3 = #23
; CHECK-NEXT: q3 = vcmp.gt(v3.w,v0.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: r2 = #64
; CHECK-NEXT: v1.w = vasl(v1.w,v4.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v5.w = vadd(v1.w,v5.w)
; CHECK-NEXT: v6 = vand(v1,v6)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v1.uw = vlsr(v1.uw,r6)
; CHECK-NEXT: q0 = vcmp.eq(v6.w,v3.w)
; CHECK-NEXT: q1 = vcmp.gt(v1.uw,v5.uw)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v5.uw = vlsr(v5.uw,r6)
; CHECK-NEXT: v6 = vmux(q0,v3,v2)
; CHECK-NEXT: v2 = vmux(q1,v2,v3)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v6.w = vadd(v5.w,v6.w)
; CHECK-NEXT: v27.w = vsub(v2.w,v4.w)
; CHECK-NEXT: q2 = vcmp.eq(v1.w,v5.w)
; CHECK-NEXT: v4 = vmux(q3,v29,v3)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v28.uw = vlsr(v5.uw,r0)
; CHECK-NEXT: q3 = vsetq(r2)
; CHECK-NEXT: v1.w = vadd(v27.w,v7.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v30.uw = vlsr(v6.uw,r0)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v1.w = vasl(v1.w,r3)
; CHECK-NEXT: v2 = vmux(q2,v30,v28)
; CHECK-NEXT: q2 = vcmp.eq(v0.w,v3.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v2 = vor(v4,v2)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v31 = vor(v2,v1)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v0 = vmux(q2,v3,v31)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: jumpr r31
; CHECK-NEXT: if (q3) vmem(r1+#0) = v0
; CHECK-NEXT: }
%v0 = load <16 x i32>, ptr %a0, align 128
%v1 = sitofp <16 x i32> %v0 to <16 x float>
store <16 x float> %v1, ptr %a1, align 128
ret void
}
; u8 -> f16
; No widening
; Unsigned <128 x i8> -> <128 x half>: 128 bytes in, 256 bytes out. The
; expected code unpacks the bytes into the v1:0 halfword pair and stores the
; two results at r1+#0 and r1+#1.
define void @u8f16_0(ptr %a0, ptr %a1) #0 {
; CHECK-LABEL: u8f16_0:
; CHECK: .cfi_startproc
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: {
; CHECK-NEXT: v0 = vmem(r0+#0)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: r6 = #1
; CHECK-NEXT: r3:2 = combine(#31,#5)
; CHECK-NEXT: v1:0.uh = vunpack(v0.ub)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v3.h = vsplat(r6)
; CHECK-NEXT: v4.h = vsplat(r3)
; CHECK-NEXT: r5 = #64
; CHECK-NEXT: v2 = vxor(v2,v2)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v6.h = vsplat(r5)
; CHECK-NEXT: r4 = #10
; CHECK-NEXT: v5.uh = vcl0(v0.uh)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v7.uh = vcl0(v1.uh)
; CHECK-NEXT: v5.h = vadd(v5.h,v3.h)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v7.h = vadd(v7.h,v3.h)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v8.h = vasl(v0.h,v5.h)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v11.h = vasl(v1.h,v7.h)
; CHECK-NEXT: v10 = vand(v8,v6)
; CHECK-NEXT: v9.h = vadd(v8.h,v4.h)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v22.h = vadd(v11.h,v4.h)
; CHECK-NEXT: v6 = vand(v11,v6)
; CHECK-NEXT: q0 = vcmp.gt(v8.uh,v9.uh)
; CHECK-NEXT: q1 = vcmp.eq(v10.h,v2.h)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v21.uh = vlsr(v8.uh,r2)
; CHECK-NEXT: q2 = vcmp.eq(v6.h,v2.h)
; CHECK-NEXT: q3 = vcmp.gt(v11.uh,v22.uh)
; CHECK-NEXT: v12 = vmux(q1,v2,v3)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v9.uh = vlsr(v9.uh,r2)
; CHECK-NEXT: v13 = vmux(q2,v2,v3)
; CHECK-NEXT: v25 = vmux(q0,v3,v2)
; CHECK-NEXT: v3 = vmux(q3,v3,v2)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v8.uh = vlsr(v22.uh,r2)
; CHECK-NEXT: v24.h = vadd(v9.h,v12.h)
; CHECK-NEXT: v3.h = vadd(v3.h,v4.h)
; CHECK-NEXT: v12.h = vadd(v25.h,v4.h)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v23.uh = vlsr(v11.uh,r2)
; CHECK-NEXT: v13.h = vadd(v8.h,v13.h)
; CHECK-NEXT: v5.h = vsub(v12.h,v5.h)
; CHECK-NEXT: v3.h = vsub(v3.h,v7.h)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v14.uh = vlsr(v9.uh,r6)
; CHECK-NEXT: q2 = vcmp.eq(v21.h,v9.h)
; CHECK-NEXT: q3 = vcmp.eq(v23.h,v8.h)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v26.uh = vlsr(v24.uh,r6)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v27.uh = vlsr(v13.uh,r6)
; CHECK-NEXT: v4 = vmux(q2,v26,v14)
; CHECK-NEXT: q2 = vcmp.eq(v1.h,v2.h)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v28.uh = vlsr(v8.uh,r6)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v5.h = vasl(v5.h,r4)
; CHECK-NEXT: v6 = vmux(q3,v27,v28)
; CHECK-NEXT: q3 = vcmp.eq(v0.h,v2.h)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v3.h = vasl(v3.h,r4)
; CHECK-NEXT: v29 = vor(v4,v5)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v3 = vor(v6,v3)
; CHECK-NEXT: v31 = vmux(q3,v2,v29)
; CHECK-NEXT: vmem(r1+#0) = v31.new
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v30 = vmux(q2,v2,v3)
; CHECK-NEXT: jumpr r31
; CHECK-NEXT: vmem(r1+#1) = v30.new
; CHECK-NEXT: }
%v0 = load <128 x i8>, ptr %a0, align 128
%v1 = uitofp <128 x i8> %v0 to <128 x half>
store <128 x half> %v1, ptr %a1, align 128
ret void
}
; Widen input
; Unsigned <64 x i8> -> <64 x half>: 64 bytes in, 128 bytes out. In the
; expected code only v0 of the unpacked v1:0 pair is converted (v1 is reused
; for a splat right after the unpack) and a single unpredicated store follows.
define void @u8f16_1(ptr %a0, ptr %a1) #0 {
; CHECK-LABEL: u8f16_1:
; CHECK: .cfi_startproc
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: {
; CHECK-NEXT: v0 = vmem(r0+#0)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: r6 = #1
; CHECK-NEXT: r3:2 = combine(#64,#31)
; CHECK-NEXT: v1:0.uh = vunpack(v0.ub)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v1.h = vsplat(r6)
; CHECK-NEXT: v4.h = vsplat(r2)
; CHECK-NEXT: r5 = #5
; CHECK-NEXT: v2 = vxor(v2,v2)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v5.h = vsplat(r3)
; CHECK-NEXT: r4 = #10
; CHECK-NEXT: v3.uh = vcl0(v0.uh)
; CHECK-NEXT: q3 = vcmp.eq(v0.h,v2.h)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v3.h = vadd(v3.h,v1.h)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v6.h = vasl(v0.h,v3.h)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v7.h = vadd(v6.h,v4.h)
; CHECK-NEXT: v5 = vand(v6,v5)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v6.uh = vlsr(v6.uh,r5)
; CHECK-NEXT: q0 = vcmp.gt(v6.uh,v7.uh)
; CHECK-NEXT: q1 = vcmp.eq(v5.h,v2.h)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v26.uh = vlsr(v7.uh,r5)
; CHECK-NEXT: v27 = vmux(q1,v2,v1)
; CHECK-NEXT: v1 = vmux(q0,v1,v2)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v1.h = vadd(v1.h,v4.h)
; CHECK-NEXT: v28.h = vadd(v26.h,v27.h)
; CHECK-NEXT: q2 = vcmp.eq(v6.h,v26.h)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v29.uh = vlsr(v26.uh,r6)
; CHECK-NEXT: v1.h = vsub(v1.h,v3.h)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v30.uh = vlsr(v28.uh,r6)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v1.h = vasl(v1.h,r4)
; CHECK-NEXT: v3 = vmux(q2,v30,v29)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v31 = vor(v3,v1)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v0 = vmux(q3,v2,v31)
; CHECK-NEXT: jumpr r31
; CHECK-NEXT: vmem(r1+#0) = v0.new
; CHECK-NEXT: }
%v0 = load <64 x i8>, ptr %a0, align 128
%v1 = uitofp <64 x i8> %v0 to <64 x half>
store <64 x half> %v1, ptr %a1, align 128
ret void
}
; u8 -> f32
; No widening
; Unsigned <128 x i8> -> <128 x float>: 128 bytes in, 512 bytes out. The
; expected code unpacks bytes to words in two vunpack steps (a valign by
; r7 = #64 feeds the second unpack chain) and stores four vectors at
; r1+#0 through r1+#3.
define void @u8f32_0(ptr %a0, ptr %a1) #0 {
; CHECK-LABEL: u8f32_0:
; CHECK: .cfi_startproc
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: {
; CHECK-NEXT: r7 = #64
; CHECK-NEXT: r0 = #1
; CHECK-NEXT: r6 = #512
; CHECK-NEXT: v0 = vmem(r0+#0)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v4 = vsplat(r0)
; CHECK-NEXT: r3:2 = combine(##255,#8)
; CHECK-NEXT: v1 = valign(v0,v0,r7)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v15 = vsplat(r6)
; CHECK-NEXT: v6 = vsplat(r3)
; CHECK-NEXT: r5 = #159
; CHECK-NEXT: v3:2.uh = vunpack(v0.ub)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: r4 = #23
; CHECK-NEXT: v31:30.uh = vunpack(v1.ub)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v3:2.uw = vunpack(v2.uh)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v1:0.uw = vunpack(v30.uh)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v5.uw = vcl0(v2.uw)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v7.uw = vcl0(v0.uw)
; CHECK-NEXT: v5.w = vadd(v5.w,v4.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v8.uw = vcl0(v3.uw)
; CHECK-NEXT: v11.w = vadd(v7.w,v4.w)
; CHECK-NEXT: v7 = vxor(v7,v7)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v9.uw = vcl0(v1.uw)
; CHECK-NEXT: v10.w = vadd(v8.w,v4.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v9 = vsplat(r5)
; CHECK-NEXT: v14.w = vasl(v0.w,v11.w)
; CHECK-NEXT: v8.w = vadd(v9.w,v4.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v12.w = vasl(v2.w,v5.w)
; CHECK-NEXT: v24 = vand(v14,v15)
; CHECK-NEXT: v20.w = vadd(v14.w,v6.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v13.w = vasl(v3.w,v10.w)
; CHECK-NEXT: v19 = vand(v12,v15)
; CHECK-NEXT: q3 = vcmp.eq(v24.w,v7.w)
; CHECK-NEXT: v18.w = vadd(v12.w,v6.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v16.w = vasl(v1.w,v8.w)
; CHECK-NEXT: v23 = vand(v13,v15)
; CHECK-NEXT: v22.w = vadd(v13.w,v6.w)
; CHECK-NEXT: q0 = vcmp.gt(v14.uw,v20.uw)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v6.w = vadd(v16.w,v6.w)
; CHECK-NEXT: v15 = vand(v16,v15)
; CHECK-NEXT: v30 = vmux(q3,v7,v4)
; CHECK-NEXT: q2 = vcmp.eq(v19.w,v7.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v21.uw = vlsr(v14.uw,r2)
; CHECK-NEXT: q3 = vcmp.eq(v15.w,v7.w)
; CHECK-NEXT: v28 = vmux(q0,v4,v7)
; CHECK-NEXT: q1 = vcmp.eq(v23.w,v7.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v14.uw = vlsr(v20.uw,r2)
; CHECK-NEXT: v26 = vmux(q3,v7,v4)
; CHECK-NEXT: v11.w = vsub(v28.w,v11.w)
; CHECK-NEXT: q3 = vcmp.gt(v13.uw,v22.uw)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v15.uw = vlsr(v6.uw,r2)
; CHECK-NEXT: v20.w = vadd(v14.w,v30.w)
; CHECK-NEXT: v30 = vmux(q1,v7,v4)
; CHECK-NEXT: v31 = vmux(q2,v7,v4)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v19.uw = vlsr(v18.uw,r2)
; CHECK-NEXT: v29.w = vadd(v15.w,v26.w)
; CHECK-NEXT: q1 = vcmp.gt(v12.uw,v18.uw)
; CHECK-NEXT: v11.w = vadd(v11.w,v9.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v28.uw = vlsr(v22.uw,r2)
; CHECK-NEXT: v23.w = vadd(v19.w,v31.w)
; CHECK-NEXT: v22 = vmux(q3,v4,v7)
; CHECK-NEXT: q3 = vcmp.gt(v16.uw,v6.uw)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v24.uw = vlsr(v29.uw,r0)
; CHECK-NEXT: v31.w = vadd(v28.w,v30.w)
; CHECK-NEXT: v30 = vmux(q1,v4,v7)
; CHECK-NEXT: v4 = vmux(q3,v4,v7)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v17.uw = vlsr(v12.uw,r2)
; CHECK-NEXT: v5.w = vsub(v30.w,v5.w)
; CHECK-NEXT: v29.w = vsub(v22.w,v10.w)
; CHECK-NEXT: v4.w = vsub(v4.w,v8.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v13.uw = vlsr(v13.uw,r2)
; CHECK-NEXT: v6.w = vadd(v29.w,v9.w)
; CHECK-NEXT: v5.w = vadd(v5.w,v9.w)
; CHECK-NEXT: q0 = vcmp.eq(v21.w,v14.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v25.uw = vlsr(v16.uw,r2)
; CHECK-NEXT: q2 = vcmp.eq(v17.w,v19.w)
; CHECK-NEXT: q3 = vcmp.eq(v13.w,v28.w)
; CHECK-NEXT: v4.w = vadd(v4.w,v9.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v21.uw = vlsr(v23.uw,r0)
; CHECK-NEXT: q1 = vcmp.eq(v25.w,v15.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v23.uw = vlsr(v19.uw,r0)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v31.uw = vlsr(v31.uw,r0)
; CHECK-NEXT: v23 = vmux(q2,v21,v23)
; CHECK-NEXT: q2 = vcmp.eq(v3.w,v7.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v16.uw = vlsr(v28.uw,r0)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v26.uw = vlsr(v15.uw,r0)
; CHECK-NEXT: v8 = vmux(q3,v31,v16)
; CHECK-NEXT: q3 = vcmp.eq(v2.w,v7.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v6.w = vasl(v6.w,r4)
; CHECK-NEXT: v22 = vmux(q1,v24,v26)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v5.w = vasl(v5.w,r4)
; CHECK-NEXT: v6 = vor(v8,v6)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v27.uw = vlsr(v14.uw,r0)
; CHECK-NEXT: v25 = vor(v23,v5)
; CHECK-NEXT: v26 = vmux(q2,v7,v6)
; CHECK-NEXT: vmem(r1+#1) = v26.new
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v20.uw = vlsr(v20.uw,r0)
; CHECK-NEXT: v28 = vmux(q3,v7,v25)
; CHECK-NEXT: q2 = vcmp.eq(v1.w,v7.w)
; CHECK-NEXT: vmem(r1+#0) = v28.new
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v11.w = vasl(v11.w,r4)
; CHECK-NEXT: v20 = vmux(q0,v20,v27)
; CHECK-NEXT: q3 = vcmp.eq(v0.w,v7.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v24.w = vasl(v4.w,r4)
; CHECK-NEXT: v29 = vor(v20,v11)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v27 = vor(v22,v24)
; CHECK-NEXT: v31 = vmux(q3,v7,v29)
; CHECK-NEXT: vmem(r1+#2) = v31.new
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v30 = vmux(q2,v7,v27)
; CHECK-NEXT: jumpr r31
; CHECK-NEXT: vmem(r1+#3) = v30.new
; CHECK-NEXT: }
%v0 = load <128 x i8>, ptr %a0, align 128
%v1 = uitofp <128 x i8> %v0 to <128 x float>
store <128 x float> %v1, ptr %a1, align 128
ret void
}
; Widen input #1
; Unsigned <64 x i8> -> <64 x float>: 64 bytes in, 256 bytes out. The expected
; code unpacks bytes to words in two vunpack steps and stores two vectors at
; r1+#0 and r1+#1.
define void @u8f32_1(ptr %a0, ptr %a1) #0 {
; CHECK-LABEL: u8f32_1:
; CHECK: .cfi_startproc
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: {
; CHECK-NEXT: r7 = #1
; CHECK-NEXT: r6 = #512
; CHECK-NEXT: v3:2.uh = vunpack(v0.ub)
; CHECK-NEXT: v0.cur = vmem(r0+#0)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v1 = vsplat(r7)
; CHECK-NEXT: v8 = vsplat(r6)
; CHECK-NEXT: r3:2 = combine(##255,#8)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v6 = vsplat(r3)
; CHECK-NEXT: r5 = #159
; CHECK-NEXT: v3:2.uw = vunpack(v2.uh)
; CHECK-NEXT: v21 = vxor(v21,v21)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v13 = vsplat(r5)
; CHECK-NEXT: r4 = #23
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v4.uw = vcl0(v2.uw)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v5.uw = vcl0(v3.uw)
; CHECK-NEXT: v4.w = vadd(v4.w,v1.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v5.w = vadd(v5.w,v1.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v7.w = vasl(v2.w,v4.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v9.w = vasl(v3.w,v5.w)
; CHECK-NEXT: v11 = vand(v7,v8)
; CHECK-NEXT: v10.w = vadd(v7.w,v6.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v6.w = vadd(v9.w,v6.w)
; CHECK-NEXT: q1 = vcmp.eq(v11.w,v21.w)
; CHECK-NEXT: v8 = vand(v9,v8)
; CHECK-NEXT: q0 = vcmp.gt(v7.uw,v10.uw)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v22.uw = vlsr(v10.uw,r2)
; CHECK-NEXT: v24 = vmux(q1,v21,v1)
; CHECK-NEXT: q3 = vcmp.eq(v8.w,v21.w)
; CHECK-NEXT: q1 = vcmp.gt(v9.uw,v6.uw)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v23.uw = vlsr(v6.uw,r2)
; CHECK-NEXT: v25 = vmux(q0,v1,v21)
; CHECK-NEXT: v27 = vmux(q3,v21,v1)
; CHECK-NEXT: v1 = vmux(q1,v1,v21)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v4.w = vsub(v25.w,v4.w)
; CHECK-NEXT: v1.w = vsub(v1.w,v5.w)
; CHECK-NEXT: v10.w = vadd(v22.w,v24.w)
; CHECK-NEXT: v28.w = vadd(v23.w,v27.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v12.uw = vlsr(v7.uw,r2)
; CHECK-NEXT: v4.w = vadd(v4.w,v13.w)
; CHECK-NEXT: v1.w = vadd(v1.w,v13.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v26.uw = vlsr(v9.uw,r2)
; CHECK-NEXT: q2 = vcmp.eq(v12.w,v22.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v11.uw = vlsr(v22.uw,r7)
; CHECK-NEXT: q3 = vcmp.eq(v26.w,v23.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v30.uw = vlsr(v10.uw,r7)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v29.uw = vlsr(v23.uw,r7)
; CHECK-NEXT: v5 = vmux(q2,v30,v11)
; CHECK-NEXT: q2 = vcmp.eq(v3.w,v21.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v6.uw = vlsr(v28.uw,r7)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v4.w = vasl(v4.w,r4)
; CHECK-NEXT: v6 = vmux(q3,v6,v29)
; CHECK-NEXT: q3 = vcmp.eq(v2.w,v21.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v1.w = vasl(v1.w,r4)
; CHECK-NEXT: v31 = vor(v5,v4)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v1 = vor(v6,v1)
; CHECK-NEXT: v0 = vmux(q3,v21,v31)
; CHECK-NEXT: vmem(r1+#0) = v0.new
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v1 = vmux(q2,v21,v1)
; CHECK-NEXT: jumpr r31
; CHECK-NEXT: vmem(r1+#1) = v1.new
; CHECK-NEXT: }
%v0 = load <64 x i8>, ptr %a0, align 128
%v1 = uitofp <64 x i8> %v0 to <64 x float>
store <64 x float> %v1, ptr %a1, align 128
ret void
}
; Widen input #2
; Unsigned <32 x i8> -> <32 x float>: 32 bytes in, 128 bytes out. The expected
; code converts only v0 after the two vunpack steps (v1 is reused for a splat)
; and ends with a single unpredicated store.
define void @u8f32_2(ptr %a0, ptr %a1) #0 {
; CHECK-LABEL: u8f32_2:
; CHECK: .cfi_startproc
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: {
; CHECK-NEXT: v0 = vmem(r0+#0)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: r6 = #1
; CHECK-NEXT: r3 = #512
; CHECK-NEXT: v1:0.uh = vunpack(v0.ub)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v2 = vsplat(r6)
; CHECK-NEXT: v4 = vsplat(r3)
; CHECK-NEXT: r2 = #255
; CHECK-NEXT: v3 = vxor(v3,v3)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: r5:4 = combine(##159,#8)
; CHECK-NEXT: v1:0.uw = vunpack(v0.uh)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v1 = vsplat(r2)
; CHECK-NEXT: v7 = vsplat(r5)
; CHECK-NEXT: q3 = vcmp.eq(v0.w,v3.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v5.uw = vcl0(v0.uw)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v5.w = vadd(v5.w,v2.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v6.w = vasl(v0.w,v5.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v1.w = vadd(v6.w,v1.w)
; CHECK-NEXT: v4 = vand(v6,v4)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v6.uw = vlsr(v6.uw,r4)
; CHECK-NEXT: q0 = vcmp.gt(v6.uw,v1.uw)
; CHECK-NEXT: q1 = vcmp.eq(v4.w,v3.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: r4 = #23
; CHECK-NEXT: v1.uw = vlsr(v1.uw,r4)
; CHECK-NEXT: v4 = vmux(q1,v3,v2)
; CHECK-NEXT: v2 = vmux(q0,v2,v3)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v2.w = vsub(v2.w,v5.w)
; CHECK-NEXT: v4.w = vadd(v1.w,v4.w)
; CHECK-NEXT: q2 = vcmp.eq(v6.w,v1.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v29.uw = vlsr(v1.uw,r6)
; CHECK-NEXT: v2.w = vadd(v2.w,v7.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v30.uw = vlsr(v4.uw,r6)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v2.w = vasl(v2.w,r4)
; CHECK-NEXT: v1 = vmux(q2,v30,v29)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v31 = vor(v1,v2)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v0 = vmux(q3,v3,v31)
; CHECK-NEXT: jumpr r31
; CHECK-NEXT: vmem(r1+#0) = v0.new
; CHECK-NEXT: }
%v0 = load <32 x i8>, ptr %a0, align 128
%v1 = uitofp <32 x i8> %v0 to <32 x float>
store <32 x float> %v1, ptr %a1, align 128
ret void
}
; u16 -> f16
; No widening
; Unsigned <64 x i16> -> <64 x half>: 128 bytes in and 128 bytes out, so the
; expected code uses one vmem load and one unpredicated vmem store.
define void @u16f16_0(ptr %a0, ptr %a1) #0 {
; CHECK-LABEL: u16f16_0:
; CHECK: .cfi_startproc
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: {
; CHECK-NEXT: r3:2 = combine(#64,#1)
; CHECK-NEXT: r5 = #31
; CHECK-NEXT: v1.uh = vcl0(v0.uh)
; CHECK-NEXT: v0.cur = vmem(r0+#0)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v2.h = vsplat(r2)
; CHECK-NEXT: v5.h = vsplat(r3)
; CHECK-NEXT: r4 = #5
; CHECK-NEXT: v3 = vxor(v3,v3)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v4.h = vsplat(r5)
; CHECK-NEXT: r3 = #10
; CHECK-NEXT: v1.h = vadd(v1.h,v2.h)
; CHECK-NEXT: q3 = vcmp.eq(v0.h,v3.h)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v6.h = vasl(v0.h,v1.h)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v7.h = vadd(v6.h,v4.h)
; CHECK-NEXT: v5 = vand(v6,v5)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v6.uh = vlsr(v6.uh,r4)
; CHECK-NEXT: q0 = vcmp.eq(v5.h,v3.h)
; CHECK-NEXT: q1 = vcmp.gt(v6.uh,v7.uh)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v26.uh = vlsr(v7.uh,r4)
; CHECK-NEXT: v27 = vmux(q0,v3,v2)
; CHECK-NEXT: v2 = vmux(q1,v2,v3)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v2.h = vadd(v2.h,v4.h)
; CHECK-NEXT: v28.h = vadd(v26.h,v27.h)
; CHECK-NEXT: q2 = vcmp.eq(v6.h,v26.h)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v29.uh = vlsr(v26.uh,r2)
; CHECK-NEXT: v1.h = vsub(v2.h,v1.h)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v30.uh = vlsr(v28.uh,r2)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v1.h = vasl(v1.h,r3)
; CHECK-NEXT: v2 = vmux(q2,v30,v29)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v31 = vor(v2,v1)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v0 = vmux(q3,v3,v31)
; CHECK-NEXT: jumpr r31
; CHECK-NEXT: vmem(r1+#0) = v0.new
; CHECK-NEXT: }
%v0 = load <64 x i16>, ptr %a0, align 128
%v1 = uitofp <64 x i16> %v0 to <64 x half>
store <64 x half> %v1, ptr %a1, align 128
ret void
}
; Widen input and result
; Unsigned <32 x i16> -> <32 x half>: input and result are 64 bytes each. The
; expected code loads with a plain vmem and stores under q3 = vsetq(r6) with
; r6 = #64.
define void @u16f16_1(ptr %a0, ptr %a1) #0 {
; CHECK-LABEL: u16f16_1:
; CHECK: .cfi_startproc
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: {
; CHECK-NEXT: r3:2 = combine(#31,#1)
; CHECK-NEXT: r6 = #64
; CHECK-NEXT: v1.uh = vcl0(v0.uh)
; CHECK-NEXT: v0.cur = vmem(r0+#0)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v2.h = vsplat(r2)
; CHECK-NEXT: v4.h = vsplat(r3)
; CHECK-NEXT: r5 = #5
; CHECK-NEXT: v3 = vxor(v3,v3)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v5.h = vsplat(r6)
; CHECK-NEXT: r4 = #10
; CHECK-NEXT: v1.h = vadd(v1.h,v2.h)
; CHECK-NEXT: q2 = vcmp.eq(v0.h,v3.h)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: q3 = vsetq(r6)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v6.h = vasl(v0.h,v1.h)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v7.h = vadd(v6.h,v4.h)
; CHECK-NEXT: v5 = vand(v6,v5)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v6.uh = vlsr(v6.uh,r5)
; CHECK-NEXT: q1 = vcmp.eq(v5.h,v3.h)
; CHECK-NEXT: q0 = vcmp.gt(v6.uh,v7.uh)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v7.uh = vlsr(v7.uh,r5)
; CHECK-NEXT: v5 = vmux(q1,v3,v2)
; CHECK-NEXT: v2 = vmux(q0,v2,v3)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v2.h = vadd(v2.h,v4.h)
; CHECK-NEXT: v28.h = vadd(v7.h,v5.h)
; CHECK-NEXT: q1 = vcmp.eq(v6.h,v7.h)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v29.uh = vlsr(v7.uh,r2)
; CHECK-NEXT: v1.h = vsub(v2.h,v1.h)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v30.uh = vlsr(v28.uh,r2)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v1.h = vasl(v1.h,r4)
; CHECK-NEXT: v2 = vmux(q1,v30,v29)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v31 = vor(v2,v1)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v0 = vmux(q2,v3,v31)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: jumpr r31
; CHECK-NEXT: if (q3) vmem(r1+#0) = v0
; CHECK-NEXT: }
%v0 = load <32 x i16>, ptr %a0, align 128
%v1 = uitofp <32 x i16> %v0 to <32 x half>
store <32 x half> %v1, ptr %a1, align 128
ret void
}
; u16 -> f32
; No widening
; Unsigned <64 x i16> -> <64 x float>: 128 bytes in, 256 bytes out. The
; expected code unpacks halfwords into the v1:0 word pair and stores the two
; results at r1+#0 and r1+#1.
define void @u16f32_0(ptr %a0, ptr %a1) #0 {
; CHECK-LABEL: u16f32_0:
; CHECK: .cfi_startproc
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: {
; CHECK-NEXT: v0 = vmem(r0+#0)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: r7 = #1
; CHECK-NEXT: r3:2 = combine(##255,#8)
; CHECK-NEXT: v1:0.uw = vunpack(v0.uh)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v3 = vsplat(r7)
; CHECK-NEXT: v6 = vsplat(r3)
; CHECK-NEXT: r6 = #512
; CHECK-NEXT: v2 = vxor(v2,v2)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v8 = vsplat(r6)
; CHECK-NEXT: r5 = #159
; CHECK-NEXT: r4 = #23
; CHECK-NEXT: v4.uw = vcl0(v0.uw)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v14 = vsplat(r5)
; CHECK-NEXT: v5.uw = vcl0(v1.uw)
; CHECK-NEXT: v4.w = vadd(v4.w,v3.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v5.w = vadd(v5.w,v3.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v7.w = vasl(v0.w,v4.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v9.w = vasl(v1.w,v5.w)
; CHECK-NEXT: v10.w = vadd(v7.w,v6.w)
; CHECK-NEXT: v11 = vand(v7,v8)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v6.w = vadd(v9.w,v6.w)
; CHECK-NEXT: v8 = vand(v9,v8)
; CHECK-NEXT: q1 = vcmp.eq(v11.w,v2.w)
; CHECK-NEXT: q0 = vcmp.gt(v7.uw,v10.uw)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v19.uw = vlsr(v10.uw,r2)
; CHECK-NEXT: q2 = vcmp.eq(v8.w,v2.w)
; CHECK-NEXT: q3 = vcmp.gt(v9.uw,v6.uw)
; CHECK-NEXT: v20 = vmux(q1,v2,v3)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v21.uw = vlsr(v6.uw,r2)
; CHECK-NEXT: v22 = vmux(q2,v2,v3)
; CHECK-NEXT: v25 = vmux(q0,v3,v2)
; CHECK-NEXT: v3 = vmux(q3,v3,v2)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v4.w = vsub(v25.w,v4.w)
; CHECK-NEXT: v3.w = vsub(v3.w,v5.w)
; CHECK-NEXT: v23.w = vadd(v19.w,v20.w)
; CHECK-NEXT: v10.w = vadd(v21.w,v22.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v12.uw = vlsr(v7.uw,r2)
; CHECK-NEXT: v4.w = vadd(v4.w,v14.w)
; CHECK-NEXT: v3.w = vadd(v3.w,v14.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v24.uw = vlsr(v9.uw,r2)
; CHECK-NEXT: q2 = vcmp.eq(v12.w,v19.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v13.uw = vlsr(v19.uw,r7)
; CHECK-NEXT: q3 = vcmp.eq(v24.w,v21.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v26.uw = vlsr(v23.uw,r7)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v27.uw = vlsr(v10.uw,r7)
; CHECK-NEXT: v5 = vmux(q2,v26,v13)
; CHECK-NEXT: q2 = vcmp.eq(v1.w,v2.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v28.uw = vlsr(v21.uw,r7)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v4.w = vasl(v4.w,r4)
; CHECK-NEXT: v6 = vmux(q3,v27,v28)
; CHECK-NEXT: q3 = vcmp.eq(v0.w,v2.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v3.w = vasl(v3.w,r4)
; CHECK-NEXT: v29 = vor(v5,v4)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v3 = vor(v6,v3)
; CHECK-NEXT: v31 = vmux(q3,v2,v29)
; CHECK-NEXT: vmem(r1+#0) = v31.new
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v30 = vmux(q2,v2,v3)
; CHECK-NEXT: jumpr r31
; CHECK-NEXT: vmem(r1+#1) = v30.new
; CHECK-NEXT: }
%v0 = load <64 x i16>, ptr %a0, align 128
%v1 = uitofp <64 x i16> %v0 to <64 x float>
store <64 x float> %v1, ptr %a1, align 128
ret void
}
; Widen input
; Unsigned <32 x i16> -> <32 x float>: 64 bytes in, 128 bytes out. In the
; expected code only v0 of the unpacked v1:0 pair is converted (v1 is reused
; for a splat right after the unpack), followed by a single unpredicated store.
define void @u16f32_1(ptr %a0, ptr %a1) #0 {
; CHECK-LABEL: u16f32_1:
; CHECK: .cfi_startproc
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: {
; CHECK-NEXT: v0 = vmem(r0+#0)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: r6 = #1
; CHECK-NEXT: r2 = #255
; CHECK-NEXT: v1:0.uw = vunpack(v0.uh)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v1 = vsplat(r6)
; CHECK-NEXT: v4 = vsplat(r2)
; CHECK-NEXT: r3 = #512
; CHECK-NEXT: v2 = vxor(v2,v2)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v5 = vsplat(r3)
; CHECK-NEXT: r5:4 = combine(##159,#8)
; CHECK-NEXT: v3.uw = vcl0(v0.uw)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v7 = vsplat(r5)
; CHECK-NEXT: q3 = vcmp.eq(v0.w,v2.w)
; CHECK-NEXT: v3.w = vadd(v3.w,v1.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v6.w = vasl(v0.w,v3.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v4.w = vadd(v6.w,v4.w)
; CHECK-NEXT: v5 = vand(v6,v5)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v6.uw = vlsr(v6.uw,r4)
; CHECK-NEXT: q0 = vcmp.gt(v6.uw,v4.uw)
; CHECK-NEXT: q1 = vcmp.eq(v5.w,v2.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: r4 = #23
; CHECK-NEXT: v4.uw = vlsr(v4.uw,r4)
; CHECK-NEXT: v5 = vmux(q1,v2,v1)
; CHECK-NEXT: v1 = vmux(q0,v1,v2)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v1.w = vsub(v1.w,v3.w)
; CHECK-NEXT: v29.w = vadd(v4.w,v5.w)
; CHECK-NEXT: q2 = vcmp.eq(v6.w,v4.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v30.uw = vlsr(v4.uw,r6)
; CHECK-NEXT: v1.w = vadd(v1.w,v7.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v3.uw = vlsr(v29.uw,r6)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v1.w = vasl(v1.w,r4)
; CHECK-NEXT: v3 = vmux(q2,v3,v30)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v31 = vor(v3,v1)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v0 = vmux(q3,v2,v31)
; CHECK-NEXT: jumpr r31
; CHECK-NEXT: vmem(r1+#0) = v0.new
; CHECK-NEXT: }
%v0 = load <32 x i16>, ptr %a0, align 128
%v1 = uitofp <32 x i16> %v0 to <32 x float>
store <32 x float> %v1, ptr %a1, align 128
ret void
}
; u32 -> f16
; No widening
; Unsigned <64 x i32> -> <64 x half>: 256 bytes in (two vmem loads, at r0+#0
; and r0+#1), 128 bytes out. The two .sf results are narrowed through the
; qf32 -> hf conversion, followed by vdeal and one unpredicated store.
define void @u32f16_0(ptr %a0, ptr %a1) #0 {
; CHECK-LABEL: u32f16_0:
; CHECK: .cfi_startproc
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: {
; CHECK-NEXT: r3:2 = combine(#8,#1)
; CHECK-NEXT: r6 = #255
; CHECK-NEXT: v3.uw = vcl0(v0.uw)
; CHECK-NEXT: v0.cur = vmem(r0+#1)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v2 = vsplat(r2)
; CHECK-NEXT: r4 = #512
; CHECK-NEXT: v4.uw = vcl0(v1.uw)
; CHECK-NEXT: v1.cur = vmem(r0+#0)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v7 = vsplat(r4)
; CHECK-NEXT: v6 = vsplat(r6)
; CHECK-NEXT: v4.w = vadd(v4.w,v2.w)
; CHECK-NEXT: v3.w = vadd(v3.w,v2.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: r4 = #159
; CHECK-NEXT: v9 = vxor(v9,v9)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v10 = vsplat(r4)
; CHECK-NEXT: v5.w = vasl(v1.w,v4.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v8.w = vasl(v0.w,v3.w)
; CHECK-NEXT: v11.w = vadd(v5.w,v6.w)
; CHECK-NEXT: v13 = vand(v5,v7)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v6.w = vadd(v8.w,v6.w)
; CHECK-NEXT: v7 = vand(v8,v7)
; CHECK-NEXT: q1 = vcmp.gt(v5.uw,v11.uw)
; CHECK-NEXT: q2 = vcmp.eq(v13.w,v9.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v27.uw = vlsr(v11.uw,r3)
; CHECK-NEXT: q3 = vcmp.gt(v8.uw,v6.uw)
; CHECK-NEXT: q0 = vcmp.eq(v7.w,v9.w)
; CHECK-NEXT: v28 = vmux(q2,v9,v2)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v6.uw = vlsr(v6.uw,r3)
; CHECK-NEXT: v29 = vmux(q1,v2,v9)
; CHECK-NEXT: v30 = vmux(q3,v2,v9)
; CHECK-NEXT: v2 = vmux(q0,v9,v2)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v4.w = vsub(v29.w,v4.w)
; CHECK-NEXT: v7.w = vadd(v27.w,v28.w)
; CHECK-NEXT: v3.w = vsub(v30.w,v3.w)
; CHECK-NEXT: v2.w = vadd(v6.w,v2.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v12.uw = vlsr(v5.uw,r3)
; CHECK-NEXT: v4.w = vadd(v4.w,v10.w)
; CHECK-NEXT: v3.w = vadd(v3.w,v10.w)
; CHECK-NEXT: q2 = vcmp.eq(v1.w,v9.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: r3 = #23
; CHECK-NEXT: v14.uw = vlsr(v8.uw,r3)
; CHECK-NEXT: q3 = vcmp.eq(v12.w,v27.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v5.uw = vlsr(v27.uw,r2)
; CHECK-NEXT: q1 = vcmp.eq(v14.w,v6.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v7.uw = vlsr(v7.uw,r2)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v2.uw = vlsr(v2.uw,r2)
; CHECK-NEXT: v5 = vmux(q3,v7,v5)
; CHECK-NEXT: q3 = vcmp.eq(v0.w,v9.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v6.uw = vlsr(v6.uw,r2)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v4.w = vasl(v4.w,r3)
; CHECK-NEXT: v31 = vmux(q1,v2,v6)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v2.w = vasl(v3.w,r3)
; CHECK-NEXT: v4 = vor(v5,v4)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v1 = vor(v31,v2)
; CHECK-NEXT: v3 = vmux(q2,v9,v4)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v0 = vmux(q3,v9,v1)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v2.qf32 = vadd(v3.sf,v9.sf)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v3.qf32 = vadd(v0.sf,v9.sf)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v0.hf = v3:2.qf32
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v0.h = vdeal(v0.h)
; CHECK-NEXT: jumpr r31
; CHECK-NEXT: vmem(r1+#0) = v0.new
; CHECK-NEXT: }
%v0 = load <64 x i32>, ptr %a0, align 128
%v1 = uitofp <64 x i32> %v0 to <64 x half>
store <64 x half> %v1, ptr %a1, align 128
ret void
}
; Widen result
; Converts a <32 x i32> value, treated as unsigned by uitofp, loaded from %a0
; into a <32 x half> value stored to %a1. The input occupies 32 * 4 = 128
; bytes (one full vector at the 128-byte HVX length from attribute group #0),
; while the result occupies only 32 * 2 = 64 bytes.
; The expected code sets r2 = #64, forms q3 = vsetq(r2), and predicates the
; final store on q3.
; NOTE(review): presumably the predicate restricts the store to the 64 bytes
; that the result actually occupies; confirm against the HVX vsetq definition.
define void @u32f16_1(ptr %a0, ptr %a1) #0 {
; CHECK-LABEL: u32f16_1:
; CHECK: .cfi_startproc
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: {
; CHECK-NEXT: r3:2 = combine(##512,#1)
; CHECK-NEXT: v1.uw = vcl0(v0.uw)
; CHECK-NEXT: v0.cur = vmem(r0+#0)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v3 = vsplat(r2)
; CHECK-NEXT: v5 = vsplat(r3)
; CHECK-NEXT: r6 = #255
; CHECK-NEXT: v2 = vxor(v2,v2)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v4 = vsplat(r6)
; CHECK-NEXT: r5 = #8
; CHECK-NEXT: r4 = #159
; CHECK-NEXT: v1.w = vadd(v1.w,v3.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v7 = vsplat(r4)
; CHECK-NEXT: r3 = #23
; CHECK-NEXT: q2 = vcmp.eq(v0.w,v2.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v6.w = vasl(v0.w,v1.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v4.w = vadd(v6.w,v4.w)
; CHECK-NEXT: v5 = vand(v6,v5)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v6.uw = vlsr(v6.uw,r5)
; CHECK-NEXT: q0 = vcmp.eq(v5.w,v2.w)
; CHECK-NEXT: q1 = vcmp.gt(v6.uw,v4.uw)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v4.uw = vlsr(v4.uw,r5)
; CHECK-NEXT: v5 = vmux(q0,v2,v3)
; CHECK-NEXT: v3 = vmux(q1,v3,v2)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v1.w = vsub(v3.w,v1.w)
; CHECK-NEXT: v30.w = vadd(v4.w,v5.w)
; CHECK-NEXT: q1 = vcmp.eq(v6.w,v4.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v31.uw = vlsr(v4.uw,r2)
; CHECK-NEXT: v1.w = vadd(v1.w,v7.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: r2 = #64
; CHECK-NEXT: v3.uw = vlsr(v30.uw,r2)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v1.w = vasl(v1.w,r3)
; CHECK-NEXT: q3 = vsetq(r2)
; CHECK-NEXT: v3 = vmux(q1,v3,v31)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v1.qf32 = vadd(v2.sf,v2.sf)
; CHECK-NEXT: v0 = vor(v3,v1)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v0 = vmux(q2,v2,v0)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v0.qf32 = vadd(v0.sf,v2.sf)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v0.hf = v1:0.qf32
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v0.h = vdeal(v0.h)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: jumpr r31
; CHECK-NEXT: if (q3) vmem(r1+#0) = v0
; CHECK-NEXT: }
; IR under test: one uitofp between a 128-byte-aligned vector load and store.
%v0 = load <32 x i32>, ptr %a0, align 128
%v1 = uitofp <32 x i32> %v0 to <32 x half>
store <32 x half> %v1, ptr %a1, align 128
ret void
}
; u32 -> f32
; No widening
; Converts a <32 x i32> value, treated as unsigned by uitofp, loaded from %a0
; into a <32 x float> value stored to %a1. Input and result each occupy
; 32 * 4 = 128 bytes, i.e. exactly one vector at the 128-byte HVX length
; requested by attribute group #0.
; The expected code performs a single vector load from r0+#0, assembles the
; result with integer vector operations, and ends with one unpredicated
; vector store to r1+#0.
define void @u32f32_0(ptr %a0, ptr %a1) #0 {
; CHECK-LABEL: u32f32_0:
; CHECK: .cfi_startproc
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: {
; CHECK-NEXT: r3:2 = combine(##512,#1)
; CHECK-NEXT: v1.uw = vcl0(v0.uw)
; CHECK-NEXT: v0.cur = vmem(r0+#0)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v2 = vsplat(r2)
; CHECK-NEXT: v5 = vsplat(r3)
; CHECK-NEXT: r6 = #255
; CHECK-NEXT: v3 = vxor(v3,v3)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v4 = vsplat(r6)
; CHECK-NEXT: r5 = #8
; CHECK-NEXT: r4 = #159
; CHECK-NEXT: v1.w = vadd(v1.w,v2.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v7 = vsplat(r4)
; CHECK-NEXT: r3 = #23
; CHECK-NEXT: q3 = vcmp.eq(v0.w,v3.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v6.w = vasl(v0.w,v1.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v4.w = vadd(v6.w,v4.w)
; CHECK-NEXT: v5 = vand(v6,v5)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v6.uw = vlsr(v6.uw,r5)
; CHECK-NEXT: q0 = vcmp.eq(v5.w,v3.w)
; CHECK-NEXT: q1 = vcmp.gt(v6.uw,v4.uw)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v4.uw = vlsr(v4.uw,r5)
; CHECK-NEXT: v5 = vmux(q0,v3,v2)
; CHECK-NEXT: v2 = vmux(q1,v2,v3)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v1.w = vsub(v2.w,v1.w)
; CHECK-NEXT: v29.w = vadd(v4.w,v5.w)
; CHECK-NEXT: q2 = vcmp.eq(v6.w,v4.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v30.uw = vlsr(v4.uw,r2)
; CHECK-NEXT: v1.w = vadd(v1.w,v7.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v2.uw = vlsr(v29.uw,r2)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v1.w = vasl(v1.w,r3)
; CHECK-NEXT: v2 = vmux(q2,v2,v30)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v31 = vor(v2,v1)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v0 = vmux(q3,v3,v31)
; CHECK-NEXT: jumpr r31
; CHECK-NEXT: vmem(r1+#0) = v0.new
; CHECK-NEXT: }
; IR under test: one uitofp between a 128-byte-aligned vector load and store.
%v0 = load <32 x i32>, ptr %a0, align 128
%v1 = uitofp <32 x i32> %v0 to <32 x float>
store <32 x float> %v1, ptr %a1, align 128
ret void
}
; Widen input and result
; Converts a <16 x i32> value, treated as unsigned by uitofp, loaded from %a0
; into a <16 x float> value stored to %a1. Input and result each occupy
; 16 * 4 = 64 bytes, which is half of the 128-byte HVX vector length requested
; by attribute group #0.
; The expected code loads with a plain vmem(r0+#0), then sets r2 = #64, forms
; q3 = vsetq(r2), and predicates the final store on q3.
; NOTE(review): presumably the predicate restricts the store to the 64 bytes
; that the result actually occupies; confirm against the HVX vsetq definition.
define void @u32f32_1(ptr %a0, ptr %a1) #0 {
; CHECK-LABEL: u32f32_1:
; CHECK: .cfi_startproc
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: {
; CHECK-NEXT: r3:2 = combine(##512,#1)
; CHECK-NEXT: v1.uw = vcl0(v0.uw)
; CHECK-NEXT: v0.cur = vmem(r0+#0)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v2 = vsplat(r2)
; CHECK-NEXT: v5 = vsplat(r3)
; CHECK-NEXT: r6 = #255
; CHECK-NEXT: v3 = vxor(v3,v3)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v4 = vsplat(r6)
; CHECK-NEXT: r5 = #8
; CHECK-NEXT: r4 = #159
; CHECK-NEXT: v1.w = vadd(v1.w,v2.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v7 = vsplat(r4)
; CHECK-NEXT: r3 = #23
; CHECK-NEXT: q2 = vcmp.eq(v0.w,v3.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v6.w = vasl(v0.w,v1.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v4.w = vadd(v6.w,v4.w)
; CHECK-NEXT: v5 = vand(v6,v5)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v6.uw = vlsr(v6.uw,r5)
; CHECK-NEXT: q0 = vcmp.eq(v5.w,v3.w)
; CHECK-NEXT: q1 = vcmp.gt(v6.uw,v4.uw)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v4.uw = vlsr(v4.uw,r5)
; CHECK-NEXT: v5 = vmux(q0,v3,v2)
; CHECK-NEXT: v2 = vmux(q1,v2,v3)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v1.w = vsub(v2.w,v1.w)
; CHECK-NEXT: v29.w = vadd(v4.w,v5.w)
; CHECK-NEXT: q1 = vcmp.eq(v6.w,v4.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v30.uw = vlsr(v4.uw,r2)
; CHECK-NEXT: v1.w = vadd(v1.w,v7.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: r2 = #64
; CHECK-NEXT: v2.uw = vlsr(v29.uw,r2)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v1.w = vasl(v1.w,r3)
; CHECK-NEXT: q3 = vsetq(r2)
; CHECK-NEXT: v2 = vmux(q1,v2,v30)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v31 = vor(v2,v1)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v0 = vmux(q2,v3,v31)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: jumpr r31
; CHECK-NEXT: if (q3) vmem(r1+#0) = v0
; CHECK-NEXT: }
; IR under test: one uitofp between a 128-byte-aligned vector load and store.
%v0 = load <16 x i32>, ptr %a0, align 128
%v1 = uitofp <16 x i32> %v0 to <16 x float>
store <16 x float> %v1, ptr %a1, align 128
ret void
}
; Attribute group #0, referenced by the test functions above: enables the v68,
; hvxv68, hvx-length128b and hvx-qfloat target features for each of them.
attributes #0 = { "target-features"="+v68,+hvxv68,+hvx-length128b,+hvx-qfloat" }