; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+avx,+mmx | FileCheck %s --check-prefix=X86
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx,+mmx | FileCheck %s --check-prefix=X64
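; Check that splatted scalar loads and splat shuffles of loaded vectors are
; lowered to AVX1 vbroadcastss/vbroadcastsd where possible, and check the
; fallback lowerings where a broadcast cannot be formed.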
define <4 x i64> @A(ptr %ptr) nounwind uwtable readnone ssp {
; X86-LABEL: A:
; X86: ## %bb.0: ## %entry
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vbroadcastsd (%eax), %ymm0
; X86-NEXT: retl
;
; X64-LABEL: A:
; X64: ## %bb.0: ## %entry
; X64-NEXT: vbroadcastsd (%rdi), %ymm0
; X64-NEXT: retq
entry:
%q = load i64, ptr %ptr, align 8
%vecinit.i = insertelement <4 x i64> undef, i64 %q, i32 0
%vecinit2.i = insertelement <4 x i64> %vecinit.i, i64 %q, i32 1
%vecinit4.i = insertelement <4 x i64> %vecinit2.i, i64 %q, i32 2
%vecinit6.i = insertelement <4 x i64> %vecinit4.i, i64 %q, i32 3
ret <4 x i64> %vecinit6.i
}
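; A2 also stores the loaded value, giving the load a second use: x86-64 no
; longer folds it into a broadcast, while 32-bit mode splits the i64 store
; into i32 pieces and can still use vbroadcastsd directly from memory.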
define <4 x i64> @A2(ptr %ptr, ptr %ptr2) nounwind uwtable readnone ssp {
; X86-LABEL: A2:
; X86: ## %bb.0: ## %entry
; X86-NEXT: pushl %esi
; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: .cfi_offset %esi, -8
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl (%ecx), %edx
; X86-NEXT: movl 4(%ecx), %esi
; X86-NEXT: vbroadcastsd (%ecx), %ymm0
; X86-NEXT: movl %edx, (%eax)
; X86-NEXT: movl %esi, 4(%eax)
; X86-NEXT: popl %esi
; X86-NEXT: retl
;
; X64-LABEL: A2:
; X64: ## %bb.0: ## %entry
; X64-NEXT: movq (%rdi), %rax
; X64-NEXT: movq %rax, (%rsi)
; X64-NEXT: vmovq %rax, %xmm0
; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-NEXT: retq
entry:
%q = load i64, ptr %ptr, align 8
store i64 %q, ptr %ptr2, align 8 ; to create a chain to prevent broadcast
%vecinit.i = insertelement <4 x i64> undef, i64 %q, i32 0
%vecinit2.i = insertelement <4 x i64> %vecinit.i, i64 %q, i32 1
%vecinit4.i = insertelement <4 x i64> %vecinit2.i, i64 %q, i32 2
%vecinit6.i = insertelement <4 x i64> %vecinit4.i, i64 %q, i32 3
ret <4 x i64> %vecinit6.i
}
define <8 x i32> @B(ptr %ptr) nounwind uwtable readnone ssp {
; X86-LABEL: B:
; X86: ## %bb.0: ## %entry
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vbroadcastss (%eax), %ymm0
; X86-NEXT: retl
;
; X64-LABEL: B:
; X64: ## %bb.0: ## %entry
; X64-NEXT: vbroadcastss (%rdi), %ymm0
; X64-NEXT: retq
entry:
%q = load i32, ptr %ptr, align 4
%vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
%vecinit2.i = insertelement <8 x i32> %vecinit.i, i32 %q, i32 1
%vecinit4.i = insertelement <8 x i32> %vecinit2.i, i32 %q, i32 2
%vecinit6.i = insertelement <8 x i32> %vecinit4.i, i32 %q, i32 3
ret <8 x i32> %vecinit6.i
}
define <8 x i32> @B2(ptr %ptr) nounwind uwtable readnone ssp {
; X86-LABEL: B2:
; X86: ## %bb.0: ## %entry
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vbroadcastss (%eax), %ymm0
; X86-NEXT: retl
;
; X64-LABEL: B2:
; X64: ## %bb.0: ## %entry
; X64-NEXT: vbroadcastss (%rdi), %ymm0
; X64-NEXT: retq
entry:
%q = load i32, ptr %ptr, align 4
%vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
%vecinit2.i = insertelement <8 x i32> %vecinit.i, i32 %q, i32 1
%vecinit4.i = insertelement <8 x i32> %vecinit2.i, i32 %q, i32 2
%vecinit6.i = insertelement <8 x i32> %vecinit4.i, i32 %q, i32 3
%vecinit8.i = insertelement <8 x i32> %vecinit6.i, i32 %q, i32 4
%vecinit10.i = insertelement <8 x i32> %vecinit8.i, i32 %q, i32 5
%vecinit12.i = insertelement <8 x i32> %vecinit10.i, i32 %q, i32 6
%vecinit14.i = insertelement <8 x i32> %vecinit12.i, i32 %q, i32 7
ret <8 x i32> %vecinit14.i
}
define <8 x i32> @B3(ptr %ptr, ptr %ptr2) nounwind uwtable readnone ssp {
; X86-LABEL: B3:
; X86: ## %bb.0: ## %entry
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl (%ecx), %ecx
; X86-NEXT: movl %ecx, (%eax)
; X86-NEXT: vmovd %ecx, %xmm0
; X86-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X86-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X86-NEXT: retl
;
; X64-LABEL: B3:
; X64: ## %bb.0: ## %entry
; X64-NEXT: movl (%rdi), %eax
; X64-NEXT: movl %eax, (%rsi)
; X64-NEXT: vmovd %eax, %xmm0
; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-NEXT: retq
entry:
%q = load i32, ptr %ptr, align 4
store i32 %q, ptr %ptr2, align 4 ; to create a chain to prevent broadcast
%vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
%vecinit2.i = insertelement <8 x i32> %vecinit.i, i32 %q, i32 1
%vecinit4.i = insertelement <8 x i32> %vecinit2.i, i32 %q, i32 2
%vecinit6.i = insertelement <8 x i32> %vecinit4.i, i32 %q, i32 3
%vecinit8.i = insertelement <8 x i32> %vecinit6.i, i32 %q, i32 4
%vecinit10.i = insertelement <8 x i32> %vecinit8.i, i32 %q, i32 5
%vecinit12.i = insertelement <8 x i32> %vecinit10.i, i32 %q, i32 6
%vecinit14.i = insertelement <8 x i32> %vecinit12.i, i32 %q, i32 7
ret <8 x i32> %vecinit14.i
}
define <4 x double> @C(ptr %ptr) nounwind uwtable readnone ssp {
; X86-LABEL: C:
; X86: ## %bb.0: ## %entry
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vbroadcastsd (%eax), %ymm0
; X86-NEXT: retl
;
; X64-LABEL: C:
; X64: ## %bb.0: ## %entry
; X64-NEXT: vbroadcastsd (%rdi), %ymm0
; X64-NEXT: retq
entry:
%q = load double, ptr %ptr, align 8
%vecinit.i = insertelement <4 x double> undef, double %q, i32 0
%vecinit2.i = insertelement <4 x double> %vecinit.i, double %q, i32 1
%vecinit4.i = insertelement <4 x double> %vecinit2.i, double %q, i32 2
%vecinit6.i = insertelement <4 x double> %vecinit4.i, double %q, i32 3
ret <4 x double> %vecinit6.i
}
define <4 x double> @C2(ptr %ptr, ptr %ptr2) nounwind uwtable readnone ssp {
; X86-LABEL: C2:
; X86: ## %bb.0: ## %entry
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: vbroadcastsd (%ecx), %ymm0
; X86-NEXT: vmovlps %xmm0, (%eax)
; X86-NEXT: retl
;
; X64-LABEL: C2:
; X64: ## %bb.0: ## %entry
; X64-NEXT: vbroadcastsd (%rdi), %ymm0
; X64-NEXT: vmovlps %xmm0, (%rsi)
; X64-NEXT: retq
entry:
%q = load double, ptr %ptr, align 8
store double %q, ptr %ptr2, align 8 ; to create a chain to prevent broadcast
%vecinit.i = insertelement <4 x double> undef, double %q, i32 0
%vecinit2.i = insertelement <4 x double> %vecinit.i, double %q, i32 1
%vecinit4.i = insertelement <4 x double> %vecinit2.i, double %q, i32 2
%vecinit6.i = insertelement <4 x double> %vecinit4.i, double %q, i32 3
ret <4 x double> %vecinit6.i
}
define <8 x float> @D(ptr %ptr) nounwind uwtable readnone ssp {
; X86-LABEL: D:
; X86: ## %bb.0: ## %entry
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vbroadcastss (%eax), %ymm0
; X86-NEXT: retl
;
; X64-LABEL: D:
; X64: ## %bb.0: ## %entry
; X64-NEXT: vbroadcastss (%rdi), %ymm0
; X64-NEXT: retq
entry:
%q = load float, ptr %ptr, align 4
%vecinit.i = insertelement <8 x float> undef, float %q, i32 0
%vecinit2.i = insertelement <8 x float> %vecinit.i, float %q, i32 1
%vecinit4.i = insertelement <8 x float> %vecinit2.i, float %q, i32 2
%vecinit6.i = insertelement <8 x float> %vecinit4.i, float %q, i32 3
ret <8 x float> %vecinit6.i
}
define <8 x float> @D2(ptr %ptr) nounwind uwtable readnone ssp {
; X86-LABEL: D2:
; X86: ## %bb.0: ## %entry
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vbroadcastss (%eax), %ymm0
; X86-NEXT: retl
;
; X64-LABEL: D2:
; X64: ## %bb.0: ## %entry
; X64-NEXT: vbroadcastss (%rdi), %ymm0
; X64-NEXT: retq
entry:
%q = load float, ptr %ptr, align 4
%vecinit.i = insertelement <8 x float> undef, float %q, i32 0
%vecinit2.i = insertelement <8 x float> %vecinit.i, float %q, i32 1
%vecinit4.i = insertelement <8 x float> %vecinit2.i, float %q, i32 2
%vecinit6.i = insertelement <8 x float> %vecinit4.i, float %q, i32 3
%vecinit8.i = insertelement <8 x float> %vecinit6.i, float %q, i32 4
%vecinit10.i = insertelement <8 x float> %vecinit8.i, float %q, i32 5
%vecinit12.i = insertelement <8 x float> %vecinit10.i, float %q, i32 6
%vecinit14.i = insertelement <8 x float> %vecinit12.i, float %q, i32 7
ret <8 x float> %vecinit14.i
}
define <8 x float> @D3(ptr %ptr, ptr %ptr2) nounwind uwtable readnone ssp {
; X86-LABEL: D3:
; X86: ## %bb.0: ## %entry
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: vbroadcastss (%ecx), %ymm0
; X86-NEXT: vmovss %xmm0, (%eax)
; X86-NEXT: retl
;
; X64-LABEL: D3:
; X64: ## %bb.0: ## %entry
; X64-NEXT: vbroadcastss (%rdi), %ymm0
; X64-NEXT: vmovss %xmm0, (%rsi)
; X64-NEXT: retq
entry:
%q = load float, ptr %ptr, align 4
store float %q, ptr %ptr2, align 4 ; to create a chain to prevent broadcast
%vecinit.i = insertelement <8 x float> undef, float %q, i32 0
%vecinit2.i = insertelement <8 x float> %vecinit.i, float %q, i32 1
%vecinit4.i = insertelement <8 x float> %vecinit2.i, float %q, i32 2
%vecinit6.i = insertelement <8 x float> %vecinit4.i, float %q, i32 3
%vecinit8.i = insertelement <8 x float> %vecinit6.i, float %q, i32 4
%vecinit10.i = insertelement <8 x float> %vecinit8.i, float %q, i32 5
%vecinit12.i = insertelement <8 x float> %vecinit10.i, float %q, i32 6
%vecinit14.i = insertelement <8 x float> %vecinit12.i, float %q, i32 7
ret <8 x float> %vecinit14.i
}
;;;; 128-bit versions
define <4 x float> @e(ptr %ptr) nounwind uwtable readnone ssp {
; X86-LABEL: e:
; X86: ## %bb.0: ## %entry
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vbroadcastss (%eax), %xmm0
; X86-NEXT: retl
;
; X64-LABEL: e:
; X64: ## %bb.0: ## %entry
; X64-NEXT: vbroadcastss (%rdi), %xmm0
; X64-NEXT: retq
entry:
%q = load float, ptr %ptr, align 4
%vecinit.i = insertelement <4 x float> undef, float %q, i32 0
%vecinit2.i = insertelement <4 x float> %vecinit.i, float %q, i32 1
%vecinit4.i = insertelement <4 x float> %vecinit2.i, float %q, i32 2
%vecinit6.i = insertelement <4 x float> %vecinit4.i, float %q, i32 3
ret <4 x float> %vecinit6.i
}
define <4 x float> @e2(ptr %ptr, ptr %ptr2) nounwind uwtable readnone ssp {
; X86-LABEL: e2:
; X86: ## %bb.0: ## %entry
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: vbroadcastss (%ecx), %xmm0
; X86-NEXT: vmovss %xmm0, (%eax)
; X86-NEXT: retl
;
; X64-LABEL: e2:
; X64: ## %bb.0: ## %entry
; X64-NEXT: vbroadcastss (%rdi), %xmm0
; X64-NEXT: vmovss %xmm0, (%rsi)
; X64-NEXT: retq
entry:
%q = load float, ptr %ptr, align 4
store float %q, ptr %ptr2, align 4 ; to create a chain to prevent broadcast
%vecinit.i = insertelement <4 x float> undef, float %q, i32 0
%vecinit2.i = insertelement <4 x float> %vecinit.i, float %q, i32 1
%vecinit4.i = insertelement <4 x float> %vecinit2.i, float %q, i32 2
%vecinit6.i = insertelement <4 x float> %vecinit4.i, float %q, i32 3
ret <4 x float> %vecinit6.i
}
; Pre-AVX2 hardware has no register broadcast, but a splatted constant can
; still be broadcast-loaded from the constant pool.
define <4 x float> @_e2(ptr %ptr) nounwind uwtable readnone ssp {
; X86-LABEL: _e2:
; X86: ## %bb.0: ## %entry
; X86-NEXT: vbroadcastss {{.*#+}} xmm0 = [-7.8125E-3,-7.8125E-3,-7.8125E-3,-7.8125E-3]
; X86-NEXT: retl
;
; X64-LABEL: _e2:
; X64: ## %bb.0: ## %entry
; X64-NEXT: vbroadcastss {{.*#+}} xmm0 = [-7.8125E-3,-7.8125E-3,-7.8125E-3,-7.8125E-3]
; X64-NEXT: retq
entry:
%vecinit.i = insertelement <4 x float> undef, float 0xbf80000000000000, i32 0
%vecinit2.i = insertelement <4 x float> %vecinit.i, float 0xbf80000000000000, i32 1
%vecinit4.i = insertelement <4 x float> %vecinit2.i, float 0xbf80000000000000, i32 2
%vecinit6.i = insertelement <4 x float> %vecinit4.i, float 0xbf80000000000000, i32 3
ret <4 x float> %vecinit6.i
}
define <4 x i32> @F(ptr %ptr) nounwind uwtable readnone ssp {
; X86-LABEL: F:
; X86: ## %bb.0: ## %entry
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vbroadcastss (%eax), %xmm0
; X86-NEXT: retl
;
; X64-LABEL: F:
; X64: ## %bb.0: ## %entry
; X64-NEXT: vbroadcastss (%rdi), %xmm0
; X64-NEXT: retq
entry:
%q = load i32, ptr %ptr, align 4
%vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0
%vecinit2.i = insertelement <4 x i32> %vecinit.i, i32 %q, i32 1
%vecinit4.i = insertelement <4 x i32> %vecinit2.i, i32 %q, i32 2
%vecinit6.i = insertelement <4 x i32> %vecinit4.i, i32 %q, i32 3
ret <4 x i32> %vecinit6.i
}
define <4 x i32> @F2(ptr %ptr, ptr %ptr2) nounwind uwtable readnone ssp {
; X86-LABEL: F2:
; X86: ## %bb.0: ## %entry
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl (%ecx), %ecx
; X86-NEXT: movl %ecx, (%eax)
; X86-NEXT: vmovd %ecx, %xmm0
; X86-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X86-NEXT: retl
;
; X64-LABEL: F2:
; X64: ## %bb.0: ## %entry
; X64-NEXT: movl (%rdi), %eax
; X64-NEXT: movl %eax, (%rsi)
; X64-NEXT: vmovd %eax, %xmm0
; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X64-NEXT: retq
entry:
%q = load i32, ptr %ptr, align 4
store i32 %q, ptr %ptr2, align 4 ; to create a chain to prevent broadcast
%vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0
%vecinit2.i = insertelement <4 x i32> %vecinit.i, i32 %q, i32 1
%vecinit4.i = insertelement <4 x i32> %vecinit2.i, i32 %q, i32 2
%vecinit6.i = insertelement <4 x i32> %vecinit4.i, i32 %q, i32 3
ret <4 x i32> %vecinit6.i
}
; FIXME: Pointer adjusted broadcasts
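; The load_splat tests splat a single lane of a loaded vector. Most cases fold
; to a broadcast (or vmovddup) from the lane's own address; the 128-bit integer
; splats below still go through a load-and-shuffle.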
define <4 x i32> @load_splat_4i32_4i32_1111(ptr %ptr) nounwind uwtable readnone ssp {
; X86-LABEL: load_splat_4i32_4i32_1111:
; X86: ## %bb.0: ## %entry
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vpermilps {{.*#+}} xmm0 = mem[1,1,1,1]
; X86-NEXT: retl
;
; X64-LABEL: load_splat_4i32_4i32_1111:
; X64: ## %bb.0: ## %entry
; X64-NEXT: vpermilps {{.*#+}} xmm0 = mem[1,1,1,1]
; X64-NEXT: retq
entry:
%ld = load <4 x i32>, ptr %ptr
%ret = shufflevector <4 x i32> %ld, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
ret <4 x i32> %ret
}
define <8 x i32> @load_splat_8i32_4i32_33333333(ptr %ptr) nounwind uwtable readnone ssp {
; X86-LABEL: load_splat_8i32_4i32_33333333:
; X86: ## %bb.0: ## %entry
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vbroadcastss 12(%eax), %ymm0
; X86-NEXT: retl
;
; X64-LABEL: load_splat_8i32_4i32_33333333:
; X64: ## %bb.0: ## %entry
; X64-NEXT: vbroadcastss 12(%rdi), %ymm0
; X64-NEXT: retq
entry:
%ld = load <4 x i32>, ptr %ptr
%ret = shufflevector <4 x i32> %ld, <4 x i32> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
ret <8 x i32> %ret
}
define <8 x i32> @load_splat_8i32_8i32_55555555(ptr %ptr) nounwind uwtable readnone ssp {
; X86-LABEL: load_splat_8i32_8i32_55555555:
; X86: ## %bb.0: ## %entry
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vbroadcastss 20(%eax), %ymm0
; X86-NEXT: retl
;
; X64-LABEL: load_splat_8i32_8i32_55555555:
; X64: ## %bb.0: ## %entry
; X64-NEXT: vbroadcastss 20(%rdi), %ymm0
; X64-NEXT: retq
entry:
%ld = load <8 x i32>, ptr %ptr
%ret = shufflevector <8 x i32> %ld, <8 x i32> undef, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
ret <8 x i32> %ret
}
define <4 x float> @load_splat_4f32_4f32_1111(ptr %ptr) nounwind uwtable readnone ssp {
; X86-LABEL: load_splat_4f32_4f32_1111:
; X86: ## %bb.0: ## %entry
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vbroadcastss 4(%eax), %xmm0
; X86-NEXT: retl
;
; X64-LABEL: load_splat_4f32_4f32_1111:
; X64: ## %bb.0: ## %entry
; X64-NEXT: vbroadcastss 4(%rdi), %xmm0
; X64-NEXT: retq
entry:
%ld = load <4 x float>, ptr %ptr
%ret = shufflevector <4 x float> %ld, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
ret <4 x float> %ret
}
define <8 x float> @load_splat_8f32_4f32_33333333(ptr %ptr) nounwind uwtable readnone ssp {
; X86-LABEL: load_splat_8f32_4f32_33333333:
; X86: ## %bb.0: ## %entry
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vbroadcastss 12(%eax), %ymm0
; X86-NEXT: retl
;
; X64-LABEL: load_splat_8f32_4f32_33333333:
; X64: ## %bb.0: ## %entry
; X64-NEXT: vbroadcastss 12(%rdi), %ymm0
; X64-NEXT: retq
entry:
%ld = load <4 x float>, ptr %ptr
%ret = shufflevector <4 x float> %ld, <4 x float> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
ret <8 x float> %ret
}
define <8 x float> @load_splat_8f32_8f32_55555555(ptr %ptr) nounwind uwtable readnone ssp {
; X86-LABEL: load_splat_8f32_8f32_55555555:
; X86: ## %bb.0: ## %entry
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vbroadcastss 20(%eax), %ymm0
; X86-NEXT: retl
;
; X64-LABEL: load_splat_8f32_8f32_55555555:
; X64: ## %bb.0: ## %entry
; X64-NEXT: vbroadcastss 20(%rdi), %ymm0
; X64-NEXT: retq
entry:
%ld = load <8 x float>, ptr %ptr
%ret = shufflevector <8 x float> %ld, <8 x float> undef, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
ret <8 x float> %ret
}
define <2 x i64> @load_splat_2i64_2i64_1111(ptr %ptr) nounwind uwtable readnone ssp {
; X86-LABEL: load_splat_2i64_2i64_1111:
; X86: ## %bb.0: ## %entry
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vpermilps {{.*#+}} xmm0 = mem[2,3,2,3]
; X86-NEXT: retl
;
; X64-LABEL: load_splat_2i64_2i64_1111:
; X64: ## %bb.0: ## %entry
; X64-NEXT: vpermilps {{.*#+}} xmm0 = mem[2,3,2,3]
; X64-NEXT: retq
entry:
%ld = load <2 x i64>, ptr %ptr
%ret = shufflevector <2 x i64> %ld, <2 x i64> undef, <2 x i32> <i32 1, i32 1>
ret <2 x i64> %ret
}
define <4 x i64> @load_splat_4i64_2i64_1111(ptr %ptr) nounwind uwtable readnone ssp {
; X86-LABEL: load_splat_4i64_2i64_1111:
; X86: ## %bb.0: ## %entry
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vbroadcastsd 8(%eax), %ymm0
; X86-NEXT: retl
;
; X64-LABEL: load_splat_4i64_2i64_1111:
; X64: ## %bb.0: ## %entry
; X64-NEXT: vbroadcastsd 8(%rdi), %ymm0
; X64-NEXT: retq
entry:
%ld = load <2 x i64>, ptr %ptr
%ret = shufflevector <2 x i64> %ld, <2 x i64> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
ret <4 x i64> %ret
}
define <4 x i64> @load_splat_4i64_4i64_2222(ptr %ptr) nounwind uwtable readnone ssp {
; X86-LABEL: load_splat_4i64_4i64_2222:
; X86: ## %bb.0: ## %entry
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vbroadcastsd 16(%eax), %ymm0
; X86-NEXT: retl
;
; X64-LABEL: load_splat_4i64_4i64_2222:
; X64: ## %bb.0: ## %entry
; X64-NEXT: vbroadcastsd 16(%rdi), %ymm0
; X64-NEXT: retq
entry:
%ld = load <4 x i64>, ptr %ptr
%ret = shufflevector <4 x i64> %ld, <4 x i64> undef, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
ret <4 x i64> %ret
}
define <2 x double> @load_splat_2f64_2f64_1111(ptr %ptr) nounwind uwtable readnone ssp {
; X86-LABEL: load_splat_2f64_2f64_1111:
; X86: ## %bb.0: ## %entry
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
; X86-NEXT: retl
;
; X64-LABEL: load_splat_2f64_2f64_1111:
; X64: ## %bb.0: ## %entry
; X64-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
; X64-NEXT: retq
entry:
%ld = load <2 x double>, ptr %ptr
%ret = shufflevector <2 x double> %ld, <2 x double> undef, <2 x i32> <i32 1, i32 1>
ret <2 x double> %ret
}
define <4 x double> @load_splat_4f64_2f64_1111(ptr %ptr) nounwind uwtable readnone ssp {
; X86-LABEL: load_splat_4f64_2f64_1111:
; X86: ## %bb.0: ## %entry
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vbroadcastsd 8(%eax), %ymm0
; X86-NEXT: retl
;
; X64-LABEL: load_splat_4f64_2f64_1111:
; X64: ## %bb.0: ## %entry
; X64-NEXT: vbroadcastsd 8(%rdi), %ymm0
; X64-NEXT: retq
entry:
%ld = load <2 x double>, ptr %ptr
%ret = shufflevector <2 x double> %ld, <2 x double> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
ret <4 x double> %ret
}
define <4 x double> @load_splat_4f64_4f64_2222(ptr %ptr) nounwind uwtable readnone ssp {
; X86-LABEL: load_splat_4f64_4f64_2222:
; X86: ## %bb.0: ## %entry
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vbroadcastsd 16(%eax), %ymm0
; X86-NEXT: retl
;
; X64-LABEL: load_splat_4f64_4f64_2222:
; X64: ## %bb.0: ## %entry
; X64-NEXT: vbroadcastsd 16(%rdi), %ymm0
; X64-NEXT: retq
entry:
%ld = load <4 x double>, ptr %ptr
%ret = shufflevector <4 x double> %ld, <4 x double> undef, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
ret <4 x double> %ret
}
; Unsupported vbroadcasts
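; AVX1 only provides load forms of vbroadcastss/vbroadcastsd and has no 64-bit
; element broadcast into an xmm register, so these splats are lowered with
; vmovddup/vpshufd/vshufps instead.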
define <2 x i64> @G(ptr %ptr) nounwind uwtable readnone ssp {
; X86-LABEL: G:
; X86: ## %bb.0: ## %entry
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
; X86-NEXT: retl
;
; X64-LABEL: G:
; X64: ## %bb.0: ## %entry
; X64-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
; X64-NEXT: retq
entry:
%q = load i64, ptr %ptr, align 8
%vecinit.i = insertelement <2 x i64> undef, i64 %q, i32 0
%vecinit2.i = insertelement <2 x i64> %vecinit.i, i64 %q, i32 1
ret <2 x i64> %vecinit2.i
}
define <2 x i64> @G2(ptr %ptr, ptr %ptr2) nounwind uwtable readnone ssp {
; X86-LABEL: G2:
; X86: ## %bb.0: ## %entry
; X86-NEXT: pushl %esi
; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: .cfi_offset %esi, -8
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl (%ecx), %edx
; X86-NEXT: movl 4(%ecx), %esi
; X86-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
; X86-NEXT: movl %edx, (%eax)
; X86-NEXT: movl %esi, 4(%eax)
; X86-NEXT: popl %esi
; X86-NEXT: retl
;
; X64-LABEL: G2:
; X64: ## %bb.0: ## %entry
; X64-NEXT: movq (%rdi), %rax
; X64-NEXT: movq %rax, (%rsi)
; X64-NEXT: vmovq %rax, %xmm0
; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; X64-NEXT: retq
entry:
%q = load i64, ptr %ptr, align 8
store i64 %q, ptr %ptr2, align 8 ; to create a chain to prevent broadcast
%vecinit.i = insertelement <2 x i64> undef, i64 %q, i32 0
%vecinit2.i = insertelement <2 x i64> %vecinit.i, i64 %q, i32 1
ret <2 x i64> %vecinit2.i
}
define <4 x i32> @H(<4 x i32> %a) {
; X86-LABEL: H:
; X86: ## %bb.0: ## %entry
; X86-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; X86-NEXT: retl
;
; X64-LABEL: H:
; X64: ## %bb.0: ## %entry
; X64-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; X64-NEXT: retq
entry:
%x = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
ret <4 x i32> %x
}
define <2 x double> @I(ptr %ptr) nounwind uwtable readnone ssp {
; X86-LABEL: I:
; X86: ## %bb.0: ## %entry
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
; X86-NEXT: retl
;
; X64-LABEL: I:
; X64: ## %bb.0: ## %entry
; X64-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
; X64-NEXT: retq
entry:
%q = load double, ptr %ptr, align 4
%vecinit.i = insertelement <2 x double> undef, double %q, i32 0
%vecinit2.i = insertelement <2 x double> %vecinit.i, double %q, i32 1
ret <2 x double> %vecinit2.i
}
define <2 x double> @I2(ptr %ptr, ptr %ptr2) nounwind uwtable readnone ssp {
; X86-LABEL: I2:
; X86: ## %bb.0: ## %entry
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
; X86-NEXT: vmovlps %xmm0, (%eax)
; X86-NEXT: retl
;
; X64-LABEL: I2:
; X64: ## %bb.0: ## %entry
; X64-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
; X64-NEXT: vmovlps %xmm0, (%rsi)
; X64-NEXT: retq
entry:
%q = load double, ptr %ptr, align 4
store double %q, ptr %ptr2, align 4 ; to create a chain to prevent broadcast
%vecinit.i = insertelement <2 x double> undef, double %q, i32 0
%vecinit2.i = insertelement <2 x double> %vecinit.i, double %q, i32 1
ret <2 x double> %vecinit2.i
}
define <4 x float> @_RR(ptr %ptr, ptr %k) nounwind uwtable readnone ssp {
; X86-LABEL: _RR:
; X86: ## %bb.0: ## %entry
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: vbroadcastss (%ecx), %xmm0
; X86-NEXT: movl (%eax), %eax
; X86-NEXT: movl %eax, (%eax)
; X86-NEXT: retl
;
; X64-LABEL: _RR:
; X64: ## %bb.0: ## %entry
; X64-NEXT: vbroadcastss (%rdi), %xmm0
; X64-NEXT: movl (%rsi), %eax
; X64-NEXT: movl %eax, (%rax)
; X64-NEXT: retq
entry:
%q = load float, ptr %ptr, align 4
%vecinit.i = insertelement <4 x float> undef, float %q, i32 0
%vecinit2.i = insertelement <4 x float> %vecinit.i, float %q, i32 1
%vecinit4.i = insertelement <4 x float> %vecinit2.i, float %q, i32 2
%vecinit6.i = insertelement <4 x float> %vecinit4.i, float %q, i32 3
; force a chain
%j = load i32, ptr %k, align 4
store i32 %j, ptr undef
ret <4 x float> %vecinit6.i
}
define <4 x float> @_RR2(ptr %ptr, ptr %k) nounwind uwtable readnone ssp {
; X86-LABEL: _RR2:
; X86: ## %bb.0: ## %entry
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vbroadcastss (%eax), %xmm0
; X86-NEXT: retl
;
; X64-LABEL: _RR2:
; X64: ## %bb.0: ## %entry
; X64-NEXT: vbroadcastss (%rdi), %xmm0
; X64-NEXT: retq
entry:
%q = load float, ptr %ptr, align 4
%v = insertelement <4 x float> undef, float %q, i32 0
%t = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> zeroinitializer
ret <4 x float> %t
}
; These tests check that a vbroadcast instruction is used when we have a splat
; formed from a concat_vectors (via the shufflevector) of one or two BUILD_VECTORs
; (via the insertelements).
define <8 x float> @splat_concat1(ptr %p) {
; X86-LABEL: splat_concat1:
; X86: ## %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vbroadcastss (%eax), %ymm0
; X86-NEXT: retl
;
; X64-LABEL: splat_concat1:
; X64: ## %bb.0:
; X64-NEXT: vbroadcastss (%rdi), %ymm0
; X64-NEXT: retq
%1 = load float, ptr %p, align 4
%2 = insertelement <4 x float> undef, float %1, i32 0
%3 = insertelement <4 x float> %2, float %1, i32 1
%4 = insertelement <4 x float> %3, float %1, i32 2
%5 = insertelement <4 x float> %4, float %1, i32 3
%6 = shufflevector <4 x float> %5, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
ret <8 x float> %6
}
define <8 x float> @splat_concat2(ptr %p) {
; X86-LABEL: splat_concat2:
; X86: ## %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vbroadcastss (%eax), %ymm0
; X86-NEXT: retl
;
; X64-LABEL: splat_concat2:
; X64: ## %bb.0:
; X64-NEXT: vbroadcastss (%rdi), %ymm0
; X64-NEXT: retq
%1 = load float, ptr %p, align 4
%2 = insertelement <4 x float> undef, float %1, i32 0
%3 = insertelement <4 x float> %2, float %1, i32 1
%4 = insertelement <4 x float> %3, float %1, i32 2
%5 = insertelement <4 x float> %4, float %1, i32 3
%6 = insertelement <4 x float> undef, float %1, i32 0
%7 = insertelement <4 x float> %6, float %1, i32 1
%8 = insertelement <4 x float> %7, float %1, i32 2
%9 = insertelement <4 x float> %8, float %1, i32 3
%10 = shufflevector <4 x float> %5, <4 x float> %9, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x float> %10
}
define <4 x double> @splat_concat3(ptr %p) {
; X86-LABEL: splat_concat3:
; X86: ## %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vbroadcastsd (%eax), %ymm0
; X86-NEXT: retl
;
; X64-LABEL: splat_concat3:
; X64: ## %bb.0:
; X64-NEXT: vbroadcastsd (%rdi), %ymm0
; X64-NEXT: retq
%1 = load double, ptr %p, align 8
%2 = insertelement <2 x double> undef, double %1, i32 0
%3 = insertelement <2 x double> %2, double %1, i32 1
%4 = shufflevector <2 x double> %3, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
ret <4 x double> %4
}
define <4 x double> @splat_concat4(ptr %p) {
; X86-LABEL: splat_concat4:
; X86: ## %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vbroadcastsd (%eax), %ymm0
; X86-NEXT: retl
;
; X64-LABEL: splat_concat4:
; X64: ## %bb.0:
; X64-NEXT: vbroadcastsd (%rdi), %ymm0
; X64-NEXT: retq
%1 = load double, ptr %p, align 8
%2 = insertelement <2 x double> undef, double %1, i32 0
%3 = insertelement <2 x double> %2, double %1, i32 1
%4 = insertelement <2 x double> undef, double %1, i32 0
%5 = insertelement <2 x double> %2, double %1, i32 1
%6 = shufflevector <2 x double> %3, <2 x double> %5, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
ret <4 x double> %6
}
; PR34041
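; Splats reached through shuffle masks that also pick undef lanes should still
; be recognized and lowered to a broadcast load.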
define <4 x double> @broadcast_shuffle_1000(ptr %p) {
; X86-LABEL: broadcast_shuffle_1000:
; X86: ## %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vbroadcastsd (%eax), %ymm0
; X86-NEXT: retl
;
; X64-LABEL: broadcast_shuffle_1000:
; X64: ## %bb.0:
; X64-NEXT: vbroadcastsd (%rdi), %ymm0
; X64-NEXT: retq
%1 = load double, ptr %p
%2 = insertelement <2 x double> undef, double %1, i32 0
%3 = shufflevector <2 x double> %2, <2 x double> undef, <4 x i32> <i32 1, i32 0, i32 0, i32 0>
ret <4 x double> %3
}
define <4 x double> @broadcast_shuffle1032(ptr %p) {
; X86-LABEL: broadcast_shuffle1032:
; X86: ## %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vbroadcastsd (%eax), %ymm0
; X86-NEXT: retl
;
; X64-LABEL: broadcast_shuffle1032:
; X64: ## %bb.0:
; X64-NEXT: vbroadcastsd (%rdi), %ymm0
; X64-NEXT: retq
%1 = load double, ptr %p
%2 = insertelement <2 x double> undef, double %1, i32 1
%3 = insertelement <2 x double> undef, double %1, i32 0
%4 = shufflevector <2 x double> %2, <2 x double> %3, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
ret <4 x double> %4
}
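; A v16i32 splat store is wider than any AVX1 register: the splat is built once
; as a ymm broadcast and stored in two 256-bit halves.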
define void @broadcast_v16i32(ptr %a, ptr %b) {
; X86-LABEL: broadcast_v16i32:
; X86: ## %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: vbroadcastss (%ecx), %ymm0
; X86-NEXT: vmovups %ymm0, 32(%eax)
; X86-NEXT: vmovups %ymm0, (%eax)
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: broadcast_v16i32:
; X64: ## %bb.0:
; X64-NEXT: vbroadcastss (%rdi), %ymm0
; X64-NEXT: vmovups %ymm0, 32(%rsi)
; X64-NEXT: vmovups %ymm0, (%rsi)
; X64-NEXT: vzeroupper
; X64-NEXT: retq
%1 = load i32, ptr %a, align 4
%2 = insertelement <8 x i32> undef, i32 %1, i32 0
%3 = shufflevector <8 x i32> %2, <8 x i32> undef, <8 x i32> zeroinitializer
%4 = shufflevector <8 x i32> undef, <8 x i32> %3, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
store <16 x i32> %4, ptr %b, align 4
ret void
}
;
; Broadcast the scale factor for an xyz vector - SLP will already have vectorized the xy pair.
;
define double @broadcast_scale_xyz(ptr nocapture readonly, ptr nocapture readonly) nounwind {
; X86-LABEL: broadcast_scale_xyz:
; X86: ## %bb.0:
; X86-NEXT: subl $12, %esp
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
; X86-NEXT: vmulpd (%eax), %xmm0, %xmm1
; X86-NEXT: vmulsd 16(%eax), %xmm0, %xmm0
; X86-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; X86-NEXT: vaddsd %xmm2, %xmm1, %xmm1
; X86-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovsd %xmm0, (%esp)
; X86-NEXT: fldl (%esp)
; X86-NEXT: addl $12, %esp
; X86-NEXT: retl
;
; X64-LABEL: broadcast_scale_xyz:
; X64: ## %bb.0:
; X64-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
; X64-NEXT: vmulpd (%rsi), %xmm0, %xmm1
; X64-NEXT: vmulsd 16(%rsi), %xmm0, %xmm0
; X64-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; X64-NEXT: vaddsd %xmm2, %xmm1, %xmm1
; X64-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; X64-NEXT: retq
%3 = load <2 x double>, ptr %1, align 8
%4 = getelementptr inbounds double, ptr %1, i64 2
%5 = load double, ptr %4, align 8
%6 = load double, ptr %0, align 8
%7 = insertelement <2 x double> undef, double %6, i32 0
%8 = shufflevector <2 x double> %7, <2 x double> undef, <2 x i32> zeroinitializer
%9 = fmul <2 x double> %3, %8
%10 = fmul double %5, %6
%11 = extractelement <2 x double> %9, i32 0
%12 = extractelement <2 x double> %9, i32 1
%13 = fadd double %11, %12
%14 = fadd double %10, %13
ret double %14
}
;
; Broadcast v2f32 non-uniform constant via vmovddup
;
define void @fmul_by_v2f32_broadcast() nounwind {
; X86-LABEL: fmul_by_v2f32_broadcast:
; X86: ## %bb.0:
; X86-NEXT: vmovddup {{.*#+}} xmm0 = [3.1E+1,0.0E+0,3.1E+1,0.0E+0]
; X86-NEXT: ## xmm0 = mem[0,0]
; X86-NEXT: ## implicit-def: $xmm1
; X86-NEXT: .p2align 4
; X86-NEXT: LBB42_1: ## =>This Inner Loop Header: Depth=1
; X86-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
; X86-NEXT: vmulps %xmm0, %xmm2, %xmm2
; X86-NEXT: vmovlps %xmm2, (%eax)
; X86-NEXT: vmulps %xmm0, %xmm1, %xmm1
; X86-NEXT: vmovlps %xmm1, (%eax)
; X86-NEXT: jmp LBB42_1
;
; X64-LABEL: fmul_by_v2f32_broadcast:
; X64: ## %bb.0:
; X64-NEXT: vmovddup {{.*#+}} xmm0 = [3.1E+1,0.0E+0,3.1E+1,0.0E+0]
; X64-NEXT: ## xmm0 = mem[0,0]
; X64-NEXT: ## implicit-def: $xmm1
; X64-NEXT: .p2align 4
; X64-NEXT: LBB42_1: ## =>This Inner Loop Header: Depth=1
; X64-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
; X64-NEXT: vmulps %xmm0, %xmm2, %xmm2
; X64-NEXT: vmovlps %xmm2, (%rax)
; X64-NEXT: vmulps %xmm0, %xmm1, %xmm1
; X64-NEXT: vmovlps %xmm1, (%rax)
; X64-NEXT: jmp LBB42_1
br label %1
1:
%2 = phi <2 x float> [ undef, %0 ], [ %5, %1 ]
%3 = load <2 x float>, ptr poison, align 8
%4 = fmul <2 x float> %3, <float 3.100000e+01, float 0.000000e+00>
store <2 x float> %4, ptr poison, align 8
%5 = fmul <2 x float> %2, <float 3.100000e+01, float 0.000000e+00>
store <2 x float> %5, ptr poison, align 8
br label %1
}
;
; When VBROADCAST replaces an existing load, ensure it still respects lifetime dependencies.
;
define float @broadcast_lifetime() nounwind {
; X86-LABEL: broadcast_lifetime:
; X86: ## %bb.0:
; X86-NEXT: pushl %esi
; X86-NEXT: subl $40, %esp
; X86-NEXT: leal {{[0-9]+}}(%esp), %esi
; X86-NEXT: movl %esi, (%esp)
; X86-NEXT: calll _gfunc
; X86-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-NEXT: vmovss %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: movl %esi, (%esp)
; X86-NEXT: calll _gfunc
; X86-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-NEXT: vsubss {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 ## 4-byte Folded Reload
; X86-NEXT: vmovss %xmm0, {{[0-9]+}}(%esp)
; X86-NEXT: flds {{[0-9]+}}(%esp)
; X86-NEXT: addl $40, %esp
; X86-NEXT: popl %esi
; X86-NEXT: retl
;
; X64-LABEL: broadcast_lifetime:
; X64: ## %bb.0:
; X64-NEXT: subq $40, %rsp
; X64-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
; X64-NEXT: callq _gfunc
; X64-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill
; X64-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
; X64-NEXT: callq _gfunc
; X64-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-NEXT: vsubss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 ## 4-byte Folded Reload
; X64-NEXT: addq $40, %rsp
; X64-NEXT: retq
%1 = alloca <4 x float>, align 16
%2 = alloca <4 x float>, align 16
call void @llvm.lifetime.start.p0(i64 16, ptr %1)
call void @gfunc(ptr %1)
%3 = load <4 x float>, ptr %1, align 16
call void @llvm.lifetime.end.p0(i64 16, ptr %1)
call void @llvm.lifetime.start.p0(i64 16, ptr %2)
call void @gfunc(ptr %2)
%4 = load <4 x float>, ptr %2, align 16
call void @llvm.lifetime.end.p0(i64 16, ptr %2)
%5 = extractelement <4 x float> %3, i32 1
%6 = extractelement <4 x float> %4, i32 1
%7 = fsub float %6, %5
ret float %7
}
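; Splat of an MMX-sized <1 x i64> argument: on x86-64 the value arrives in
; %rdi and is shuffled in a register, while 32-bit mode splats it straight
; from its stack slot with vmovddup.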
define <8 x i16> @broadcast_x86_mmx(<1 x i64> %tmp) nounwind {
; X86-LABEL: broadcast_x86_mmx:
; X86: ## %bb.0: ## %bb
; X86-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
; X86-NEXT: retl
;
; X64-LABEL: broadcast_x86_mmx:
; X64: ## %bb.0: ## %bb
; X64-NEXT: vmovq %rdi, %xmm0
; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; X64-NEXT: retq
bb:
%tmp1 = bitcast <1 x i64> %tmp to i64
%tmp2 = insertelement <2 x i64> undef, i64 %tmp1, i32 0
%tmp3 = bitcast <2 x i64> %tmp2 to <8 x i16>
%tmp4 = shufflevector <8 x i16> %tmp3, <8 x i16> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
ret <8 x i16> %tmp4
}
declare void @gfunc(ptr)
declare void @llvm.lifetime.start.p0(i64, ptr)
declare void @llvm.lifetime.end.p0(i64, ptr)