Example code:
# Dot product of `a` and `b` using `@inbounds @simd`.
# `@simd` permits reassociation of the reduction so LLVM can vectorize it,
# and (per the transcript below) tags the generated `fmul` with the
# `contract` fast-math flag and the `fadd` with `fast`.
function dotf(a,b)
s = 0.0
@inbounds @simd for i ∈ eachindex(a)
s += a[i] * b[i]
end
s
end
# Same dot product written with raw pointers, a manual while loop, and
# `@fastmath` (which tags both the `fmul` and `fadd` with the `fast` flag).
# Element access uses byte-offset pointer arithmetic: `ptr + sizeof(eltype)*n`
# with `unsafe_load`, so no bounds checks are emitted.
# NOTE(review): only `length(a)` bounds the loop — this assumes
# `length(b) >= length(a)`; otherwise `ptrb` reads out of bounds.
function dotw(a,b)
s = 0.0; ptra = pointer(a); ptrb = pointer(b); n = 0; N = length(a)
while n < N
@fastmath s += unsafe_load(ptra + sizeof(eltype(a))*n) * unsafe_load(ptrb + sizeof(eltype(b))*n)
n += 1
end
s
end
# Repro driver: two random length-400 Float64 vectors, then dump the native
# code and optimized LLVM IR for both variants (`debuginfo=:none` keeps the
# listings free of source-location annotations).
a = rand(400); b = rand(400);
@code_native debuginfo=:none dotf(a,b)
@code_native debuginfo=:none dotw(a,b)
@code_llvm debuginfo=:none dotf(a,b)
@code_llvm debuginfo=:none dotw(a,b)
With this version (I also tested with LLVM 8.0.1, getting the same result):
julia> versioninfo()
Julia Version 1.4.0-DEV.597
Commit e12b054* (2019-12-13 09:36 UTC)
Platform Info:
OS: Linux (x86_64-pc-linux-gnu)
CPU: Intel(R) Xeon(R) CPU E5-2680 v3 @ 2.50GHz
WORD_SIZE: 64
LIBM: libopenlibm
LLVM: libLLVM-9.0.0 (ORCJIT, haswell)
Environment:
JULIA_NUM_THREADS = 24
this yields
#julia> @code_native debuginfo=:none dotf(a,b)
.text
movq 24(%rdi), %rax
testq %rax, %rax
jle L44
movq %rax, %rcx
sarq $63, %rcx
andnq %rax, %rcx, %rax
movq (%rdi), %rcx
movq (%rsi), %rdx
cmpq $16, %rax
jae L49
vxorpd %xmm0, %xmm0, %xmm0
xorl %esi, %esi
jmp L192
L44:
vxorps %xmm0, %xmm0, %xmm0
retq
L49:
movabsq $9223372036854775792, %rsi # imm = 0x7FFFFFFFFFFFFFF0
andq %rax, %rsi
vxorpd %xmm0, %xmm0, %xmm0
xorl %edi, %edi
vxorpd %xmm1, %xmm1, %xmm1
vxorpd %xmm2, %xmm2, %xmm2
vxorpd %xmm3, %xmm3, %xmm3
L80:
vmovupd (%rcx,%rdi,8), %ymm4
vmovupd 32(%rcx,%rdi,8), %ymm5
vmovupd 64(%rcx,%rdi,8), %ymm6
vmovupd 96(%rcx,%rdi,8), %ymm7
vmulpd (%rdx,%rdi,8), %ymm4, %ymm4
vaddpd %ymm4, %ymm0, %ymm0
vmulpd 32(%rdx,%rdi,8), %ymm5, %ymm4
vaddpd %ymm4, %ymm1, %ymm1
vmulpd 64(%rdx,%rdi,8), %ymm6, %ymm4
vmulpd 96(%rdx,%rdi,8), %ymm7, %ymm5
vaddpd %ymm4, %ymm2, %ymm2
vaddpd %ymm5, %ymm3, %ymm3
addq $16, %rdi
cmpq %rdi, %rsi
jne L80
vaddpd %ymm0, %ymm1, %ymm0
vaddpd %ymm0, %ymm2, %ymm0
vaddpd %ymm0, %ymm3, %ymm0
vextractf128 $1, %ymm0, %xmm1
vaddpd %xmm1, %xmm0, %xmm0
vpermilpd $1, %xmm0, %xmm1 # xmm1 = xmm0[1,0]
vaddsd %xmm1, %xmm0, %xmm0
cmpq %rsi, %rax
je L211
nopl (%rax)
L192:
vmovsd (%rcx,%rsi,8), %xmm1 # xmm1 = mem[0],zero
vfmadd231sd (%rdx,%rsi,8), %xmm1, %xmm0 # xmm0 = (xmm1 * mem) + xmm0
incq %rsi
cmpq %rax, %rsi
jb L192
L211:
vzeroupper
retq
nopw (%rax,%rax)
#julia> @code_native debuginfo=:none dotw(a,b)
.text
movq 8(%rdi), %rax
testq %rax, %rax
jle L32
movq (%rdi), %rcx
movq (%rsi), %rdx
cmpq $16, %rax
jae L37
vxorpd %xmm0, %xmm0, %xmm0
xorl %esi, %esi
jmp L176
L32:
vxorps %xmm0, %xmm0, %xmm0
retq
L37:
movq %rax, %rsi
andq $-16, %rsi
vxorpd %xmm0, %xmm0, %xmm0
xorl %edi, %edi
vxorpd %xmm1, %xmm1, %xmm1
vxorpd %xmm2, %xmm2, %xmm2
vxorpd %xmm3, %xmm3, %xmm3
nop
L64:
vmovupd (%rdx,%rdi,8), %ymm4
vmovupd 32(%rdx,%rdi,8), %ymm5
vmovupd 64(%rdx,%rdi,8), %ymm6
vmovupd 96(%rdx,%rdi,8), %ymm7
vmulpd (%rcx,%rdi,8), %ymm4, %ymm4
vaddpd %ymm0, %ymm4, %ymm0
vmulpd 32(%rcx,%rdi,8), %ymm5, %ymm4
vaddpd %ymm1, %ymm4, %ymm1
vmulpd 64(%rcx,%rdi,8), %ymm6, %ymm4
vmulpd 96(%rcx,%rdi,8), %ymm7, %ymm5
vaddpd %ymm2, %ymm4, %ymm2
vaddpd %ymm3, %ymm5, %ymm3
addq $16, %rdi
cmpq %rdi, %rsi
jne L64
vaddpd %ymm0, %ymm1, %ymm0
vaddpd %ymm0, %ymm2, %ymm0
vaddpd %ymm0, %ymm3, %ymm0
vextractf128 $1, %ymm0, %xmm1
vaddpd %xmm1, %xmm0, %xmm0
vpermilpd $1, %xmm0, %xmm1 # xmm1 = xmm0[1,0]
vaddsd %xmm1, %xmm0, %xmm0
cmpq %rsi, %rax
je L195
nopl (%rax)
L176:
vmovsd (%rdx,%rsi,8), %xmm1 # xmm1 = mem[0],zero
vfmadd231sd (%rcx,%rsi,8), %xmm1, %xmm0 # xmm0 = (xmm1 * mem) + xmm0
incq %rsi
cmpq %rsi, %rax
jne L176
L195:
vzeroupper
retq
nopw (%rax,%rax)
The llvm clearly shows the contract/fast attributes, in the case of the @simd for:
%30 = fmul contract <4 x double> %wide.load, %wide.load16
%31 = fmul contract <4 x double> %wide.load13, %wide.load17
%32 = fmul contract <4 x double> %wide.load14, %wide.load18
%33 = fmul contract <4 x double> %wide.load15, %wide.load19
%34 = fadd fast <4 x double> %vec.phi, %30
%35 = fadd fast <4 x double> %vec.phi10, %31
%36 = fadd fast <4 x double> %vec.phi11, %32
%37 = fadd fast <4 x double> %vec.phi12, %33
and the @fastmath while loop:
%31 = fmul fast <4 x double> %wide.load14, %wide.load
%32 = fmul fast <4 x double> %wide.load15, %wide.load11
%33 = fmul fast <4 x double> %wide.load16, %wide.load12
%34 = fmul fast <4 x double> %wide.load17, %wide.load13
%35 = fadd fast <4 x double> %31, %vec.phi
%36 = fadd fast <4 x double> %32, %vec.phi8
%37 = fadd fast <4 x double> %33, %vec.phi9
%38 = fadd fast <4 x double> %34, %vec.phi10
Yet neither case shows a contraction.
With
julia> versioninfo()
Julia Version 1.3.1-pre.0
Commit b42f4ab* (2019-11-26 17:58 UTC)
Platform Info:
OS: Linux (x86_64-pc-linux-gnu)
CPU: Intel(R) Xeon(R) CPU E5-2680 v3 @ 2.50GHz
WORD_SIZE: 64
LIBM: libopenlibm
LLVM: libLLVM-6.0.1 (ORCJIT, haswell)
Environment:
JULIA_NUM_THREADS = 24
we get
julia> @code_native debuginfo=:none dotf(a,b)
.text
movq 24(%rdi), %rax
testq %rax, %rax
jle L32
movq (%rdi), %rcx
movq (%rsi), %rdx
cmpq $16, %rax
jae L40
vxorpd %xmm0, %xmm0, %xmm0
xorl %esi, %esi
jmp L176
L32:
vxorps %xmm0, %xmm0, %xmm0
vzeroupper
retq
L40:
movq %rax, %rsi
andq $-16, %rsi
vxorpd %xmm0, %xmm0, %xmm0
xorl %edi, %edi
vxorpd %xmm1, %xmm1, %xmm1
vxorpd %xmm2, %xmm2, %xmm2
vxorpd %xmm3, %xmm3, %xmm3
nopw %cs:(%rax,%rax)
L80:
vmovupd (%rcx,%rdi,8), %ymm4
vmovupd 32(%rcx,%rdi,8), %ymm5
vmovupd 64(%rcx,%rdi,8), %ymm6
vmovupd 96(%rcx,%rdi,8), %ymm7
vfmadd231pd (%rdx,%rdi,8), %ymm4, %ymm0
vfmadd231pd 32(%rdx,%rdi,8), %ymm5, %ymm1
vfmadd231pd 64(%rdx,%rdi,8), %ymm6, %ymm2
vfmadd231pd 96(%rdx,%rdi,8), %ymm7, %ymm3
addq $16, %rdi
cmpq %rdi, %rsi
jne L80
vaddpd %ymm0, %ymm1, %ymm0
vaddpd %ymm0, %ymm2, %ymm0
vaddpd %ymm0, %ymm3, %ymm0
vextractf128 $1, %ymm0, %xmm1
vaddpd %ymm1, %ymm0, %ymm0
vhaddpd %ymm0, %ymm0, %ymm0
cmpq %rsi, %rax
je L196
nopw (%rax,%rax)
L176:
vmovsd (%rcx,%rsi,8), %xmm1 # xmm1 = mem[0],zero
vfmadd231sd (%rdx,%rsi,8), %xmm1, %xmm0
addq $1, %rsi
cmpq %rax, %rsi
jb L176
L196:
vzeroupper
retq
nopl (%rax,%rax)
julia> @code_native debuginfo=:none dotw(a,b)
.text
movq 8(%rdi), %rax
testq %rax, %rax
jle L32
movq (%rdi), %rcx
movq (%rsi), %rdx
cmpq $16, %rax
jae L40
vxorpd %xmm0, %xmm0, %xmm0
xorl %esi, %esi
jmp L176
L32:
vxorps %xmm0, %xmm0, %xmm0
vzeroupper
retq
L40:
movq %rax, %rsi
andq $-16, %rsi
vxorpd %xmm0, %xmm0, %xmm0
xorl %edi, %edi
vxorpd %xmm1, %xmm1, %xmm1
vxorpd %xmm2, %xmm2, %xmm2
vxorpd %xmm3, %xmm3, %xmm3
nopw %cs:(%rax,%rax)
L80:
vmovupd (%rdx,%rdi,8), %ymm4
vmovupd 32(%rdx,%rdi,8), %ymm5
vmovupd 64(%rdx,%rdi,8), %ymm6
vmovupd 96(%rdx,%rdi,8), %ymm7
vfmadd231pd (%rcx,%rdi,8), %ymm4, %ymm0
vfmadd231pd 32(%rcx,%rdi,8), %ymm5, %ymm1
vfmadd231pd 64(%rcx,%rdi,8), %ymm6, %ymm2
vfmadd231pd 96(%rcx,%rdi,8), %ymm7, %ymm3
addq $16, %rdi
cmpq %rdi, %rsi
jne L80
vaddpd %ymm0, %ymm1, %ymm0
vaddpd %ymm0, %ymm2, %ymm0
vaddpd %ymm0, %ymm3, %ymm0
vextractf128 $1, %ymm0, %xmm1
vaddpd %ymm1, %ymm0, %ymm0
vhaddpd %ymm0, %ymm0, %ymm0
cmpq %rsi, %rax
je L196
nopw (%rax,%rax)
L176:
vmovsd (%rdx,%rsi,8), %xmm1 # xmm1 = mem[0],zero
vfmadd231sd (%rcx,%rsi,8), %xmm1, %xmm0
addq $1, %rsi
cmpq %rsi, %rax
jne L176
L196:
vzeroupper
retq
nopl (%rax,%rax)
Both versions show essentially the same @code_llvm, so I'd guess something changed on the LLVM-side? Maybe the createCombineMulAddPass changed somehow?
I'm not sure it's an LLVM regression, because Clang 9.0.0 doesn't seem to have this issue.
The fact that the fmul is marked contract means that CombineMulAddPass did run correctly, since this is a backend optimization.
Clang sets the function attribute unsafe-fp-math=true when compiling with -Ofast; setting that attribute on the Julia-generated IR causes
contraction to happen and removing it from the Clang produced IR stops it. https://godbolt.org/z/jBjkU5
Looking at DagCombiner it seems that both need to be contract for the fusion to happen without unsafe-fp-math=true.
Which is rather odd since fast is supposed to imply contract https://llvm.org/docs/LangRef.html#fast-math-flags
; │┌ @ float.jl:405 within `*'
%29 = fmul contract <4 x double> %wide.load, %wide.load16
%30 = fmul contract <4 x double> %wide.load13, %wide.load17
%31 = fmul contract <4 x double> %wide.load14, %wide.load18
%32 = fmul contract <4 x double> %wide.load15, %wide.load19
; │└
; │┌ @ float.jl:401 within `+'
%33 = fadd contract <4 x double> %vec.phi, %29
%34 = fadd contract <4 x double> %vec.phi10, %30
%35 = fadd contract <4 x double> %vec.phi11, %31
%36 = fadd contract <4 x double> %vec.phi12, %32
; │└
See https://godbolt.org/z/whUcJK and switching from llc 6.0.0 to llc 7.0.0
Interesting.
So, I take it that @code_llvm optimize=true shows llvm after all the passes in the jitlayers have run, and because profitability of combining is target-dependent, this waits until back end specific lowering?
Even though some backend-specifics (vector width) already show up?
fast not implying contract sounds like an LLVM bug. Are you filing a bug report with LLVM (the documentation seems unambiguous in stating that the fast flag "implies all of the others"), or should Julia instead emit the explicit contract flag for @simd (and for @fastmath as well)?
I'm wondering if I should update SIMDPirates.jl to use the contract flag in places where I currently used fast.
Yes code_llvm optimize=false is what Julia emits, and code_llvm optimize=true is after the (mostly target independent) middle end, which includes everything we setup in jitlayers.cpp.
I am working on localising the bug within LLVM, since fast implying contract is an assumption several places have so it is likely a place where things weren't copied/setup right.
I'm wondering if I should update SIMDPirates.jl to use the contract flag in places where I currently used fast.
No.
Open a patch against upstream https://reviews.llvm.org/D71495
Most helpful comment
Open a patch against upstream https://reviews.llvm.org/D71495